In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import warnings
# Suppress every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides any library warnings raised by the modelling
# cells further down — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [None]:
# Load the train and test CSV files of the mobile-price-classification dataset
# from the Kaggle input directory.
# NOTE(review): test_data is not referenced again in the cells visible here.
train_data = pd.read_csv("/kaggle/input/mobile-price-classification/train.csv")
test_data = pd.read_csv("/kaggle/input/mobile-price-classification/test.csv")

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# List the column names of the training frame.
train_data.columns

In [None]:
# Count the missing values in each column.
train_data.isna().sum()

### No missing data in the training data

### price_range is our dependent variable and the rest of the features are independent variables.

### The features summarised below include numerical variables, which must be standardised before we can train our predictive model.

In [None]:
# Summary statistics for each column of the training frame.
train_data.describe()

In [None]:
from tqdm.auto import tqdm
# Sort every column of train_data into one of two lists by its number of
# distinct values: at most 10 -> categorical, otherwise numerical.
categorical_columns = []
numerical_columns = []
for column_name in tqdm(train_data.columns, total=len(train_data.columns)):
    distinct_values = train_data[column_name].nunique()
    target_list = categorical_columns if distinct_values <= 10 else numerical_columns
    target_list.append(column_name)

### First, let us try a classification without any data standardization, treating all of the features as numeric.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [None]:
# Feature matrix: every column except the price_range label.
X = train_data.drop(columns=["price_range"])
# Target vector: the price_range label itself.
y = train_data["price_range"]

In [None]:
def calculate_metrics(X,y):
    """Fit a default logistic regression on a hold-out split and print its report.

    The data is split with ``test_size=0.2`` and ``random_state=42``, a
    ``LogisticRegression`` with default settings is fitted on the training
    part, and the ``classification_report`` for the held-out part is printed.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split`` / ``LogisticRegression``.
    y : target labels aligned with ``X``.

    Returns
    -------
    None
        The report is only printed, not returned.
    """
    features_fit, features_eval, labels_fit, labels_eval = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    classifier = LogisticRegression()
    classifier.fit(features_fit, labels_fit)

    # Score on the held-out rows only.
    print(classification_report(labels_eval, classifier.predict(features_eval)))

In [None]:
# Baseline: evaluate on the raw, unscaled features.
calculate_metrics(X,y)

### We achieved 63% accuracy without transforming and standardizing the data

In [None]:
from sklearn.preprocessing import StandardScaler
# Single module-level scaler; transform_numeric_features calls fit_transform
# on it once per column, so it is refitted every time.
scaler = StandardScaler()

In [None]:
def transform_numeric_features(train_data,numerical_columns):
    """Return a copy of ``train_data`` with the given columns standardised.

    Each column named in ``numerical_columns`` is passed on its own through the
    module-level ``scaler`` (``fit_transform``), so the scaler is refitted per
    column and ends up fitted to the last column processed.

    The input frame is left untouched. Assigning the scaled values directly
    into ``train_data`` would overwrite the caller's raw data and, when a
    column subset is passed in, write into a slice of another frame.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame containing at least the columns in ``numerical_columns``.
    numerical_columns : iterable of str
        Names of the columns to scale; an empty iterable yields a plain copy.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``train_data`` in which
        the listed columns hold the scaler's output.
    """
    transformed = train_data.copy()
    for col in numerical_columns:
        # Reshape the column to (n_samples, 1) before handing it to the scaler.
        column_2d = np.asarray(transformed[col]).reshape(-1, 1)
        # Flatten the result so the assignment targets exactly one column.
        transformed[col] = np.asarray(scaler.fit_transform(column_2d)).ravel()
    return transformed

In [None]:
# Standardise the columns collected in numerical_columns.
train_data_transformed = transform_numeric_features(train_data,numerical_columns)

In [None]:
# Split the transformed frame into features and label again.
X_new = train_data_transformed.drop(columns=["price_range"])
y_new = train_data_transformed["price_range"]

In [None]:
# Keep the label out of the one-hot encoding. The membership check makes the
# cell safe to re-run: a bare list.remove() raises ValueError the second time.
if "price_range" in categorical_columns:
    categorical_columns.remove("price_range")
# Encode from the transformed frame rather than from X_new itself, so that
# re-running the cell does not look for columns get_dummies already replaced.
X_new = pd.get_dummies(
    train_data_transformed.drop(columns=["price_range"]),
    columns=categorical_columns,
    prefix_sep="_",
)

In [None]:
# Re-evaluate on the standardised, one-hot-encoded features.
calculate_metrics(X_new,y_new)

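### Accuracy increased from 63% to 96% when we standardized and transformed the data for our analysis. This is the power of Data Wrangling. Note that the scaler was fitted on the full dataset before the train/test split, so this figure may be somewhat optimistic.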

### Now let us only consider relevant features in our analysis and see how the accuracy gets affected.

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Score every feature in X against the label with the chi2 score function.
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X, y)
# One row per feature: its name ("Specs") next to its score ("Score").
featureScores = pd.DataFrame({"Specs": X.columns, "Score": fit.scores_})
# Names of the ten highest-scoring features.
top_10_important_features = (
    featureScores.nlargest(10, "Score")["Specs"].values
)

In [None]:
# Take an explicit copy of the selected columns: the frame is handed to the
# scaling step below, which may assign into it, and a bare column selection can
# be treated by pandas as a slice of train_data (chained-assignment warning).
X3 = train_data[top_10_important_features].copy()
y3 = train_data["price_range"]

In [None]:
from tqdm.auto import tqdm
# Same cardinality rule as for the full frame, applied to the selected
# features: at most 10 distinct values -> categorical, otherwise numerical.
categorical_columns1 = []
numerical_columns1 = []
for column_name in tqdm(X3.columns, total=len(X3.columns)):
    distinct_values = X3[column_name].nunique()
    target_list = categorical_columns1 if distinct_values <= 10 else numerical_columns1
    target_list.append(column_name)

In [None]:
# Standardise the columns of X3 collected in numerical_columns1.
X3 = transform_numeric_features(X3,numerical_columns1)

In [None]:
# One-hot encode the columns of X3 collected in categorical_columns1.
# NOTE(review): X3 is overwritten with its own encoding, so this cell is not
# safe to run twice without re-running the cells that build X3.
X3 = pd.get_dummies(X3,columns=categorical_columns1,prefix_sep="_")

In [None]:
# Evaluate the same logistic-regression pipeline on the reduced feature set.
calculate_metrics(X3,y3)

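### When we use only the top 10 most important features, our accuracy increases from 96% to 98%. Note that the features were selected using all rows, including those later held out for testing, so this figure may be somewhat optimistic.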