# Loading data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the labelled training set as-is; the test set uses its `id` column as the index.
train = pd.read_csv(
    '../input/mobile-price-classification/train.csv'
)
test = pd.read_csv(
    '../input/mobile-price-classification/test.csv',
    index_col='id',
)
train.head()

In [None]:
# Preview the first rows of the test set.
test.head()

In [None]:
# (rows, columns) of each frame; note the display order is (test, train).
test.shape, train.shape

In [None]:
# Column dtypes and non-null counts for the training set.
train.info()

In [None]:
# Column dtypes and non-null counts for the test set.
test.info()

Looks like there are no null values in any of the sets but there can be 0s in some columns which can actually be missing data.

In [None]:
# Summary statistics of the training set; the `min` row is used to spot suspicious zeros.
train.describe()

In [None]:
# Summary statistics of the test set, for comparison with the training set.
test.describe()

`px_height` and `sc_w` have a minimum value of 0, which cannot be a real measurement and therefore indicates missing values. Let's mark them as NaN to make things easier.

In [None]:
# Treat a pixel height / screen width of 0 as "missing" in both data sets.
# `Series.mask` builds a new column with NaN where the condition holds, so the
# integer column is replaced as a whole instead of having NaN written into it
# element-wise (pandas has deprecated that implicit int -> float upcast).
for frame in (train, test):
    for col in ('px_height', 'sc_w'):
        frame[col] = frame[col].mask(frame[col] == 0)

In [None]:
# Share of each price_range class in the training set.
value_counts = train['price_range'].value_counts()
plt.pie(
    value_counts.to_numpy(),
    labels=value_counts.index,
    startangle=90,
    autopct='%1.1f%%',
)
plt.show()

The dataset is balanced, which makes things easier. Let's look at the distribution of each feature with regard to the class.

In [None]:
# One stacked histogram per feature (every column except the last, price_range),
# coloured by price class; panels are filled row by row on a 5x4 grid.
fig, ax = plt.subplots(5, 4, figsize=(19, 19))
for axis, col in zip(ax.ravel(), train.columns[:-1]):
    sns.histplot(data=train, x=col, hue='price_range', multiple='stack', ax=axis)

It looks like one of the main characteristics of a mobile phone is its RAM; the distribution of price range across RAM values says so. Other key features which make a visible difference are battery_power, px_width and px_height. Let's look at the correlation matrix.

In [None]:
# Annotated heatmap of the pairwise correlations between all training columns
# (features and the price_range target).
plt.figure(figsize=(16,16))
mat = train.corr()
sns.heatmap(mat, annot=True)

 + Correlation between ram and price_range is very high, as expected. 
 + Screen Height and Screen Width correlation is understandable as phones are created at similar screen ratios. Same thing goes for pixel height and pixel width.
 + There is a little correlation between price range and battery power, pixel height, pixel width.
 + Contrary to my assumptions, clock speed and internal memory have no correlation with price range.
 + Correlation between 3G and 4G is also high
 + One last thing, primary camera and front camera has similar correlation. 
 + The other columns have too little or no correlation between them. I will train model with and without them and compare results.
 
 Let's fill missing values with median

In [None]:
# Impute the NaNs introduced above with the column median.
# * The median is computed on the training set only and reused for the test set,
#   so both sets are imputed with the same value and no test statistic is used.
# * The filled column is assigned back explicitly: calling
#   `df[col].fillna(..., inplace=True)` operates on a temporary selection and is
#   not guaranteed to modify the frame.
for col in ('px_height', 'sc_w'):
    train_median = train[col].median()
    train[col] = train[col].fillna(train_median)
    test[col] = test[col].fillna(train_median)

Extracting "main" columns for further use

In [None]:
# Features showing the clearest relationship with price_range in the plots above.
main_cols = ['battery_power', 'px_height', 'px_width', 'ram']
# Take explicit copies: new columns are added to these frames later, and adding
# columns to a slice of `train` / `test` triggers SettingWithCopyWarning.
train_main = train[main_cols + ['price_range']].copy()
test_main = test[main_cols].copy()

A front camera or primary camera with 0 megapixels means the phone doesn't have a camera on that side; we can create two extra features from that.

In [None]:
# Binary flags: 1 when the camera's megapixel value is non-zero, else 0.
for frame in (train, test):
    frame['has_pc'] = frame['pc'].ne(0).astype(np.int8)
    frame['has_fc'] = frame['fc'].ne(0).astype(np.int8)

Screen area and pixel count (`px_width * px_height`) can also be useful

In [None]:
# Derived screen features, added in the same order to both frames.
for frame in (train, test):
    # Total number of pixels and physical screen area.
    frame['px_count'] = frame['px_width'] * frame['px_height']
    frame['sc_area'] = frame['sc_w'] * frame['sc_h']
    # Pixels per unit of screen area.
    frame['dp'] = frame['px_count'] / frame['sc_area']
    # Height-to-width ratios of the physical screen and of the pixel grid.
    frame['sc_ratio'] = frame['sc_h'] / frame['sc_w']
    frame['px_ratio'] = frame['px_height'] / frame['px_width']

In [None]:
# Correlations among the engineered features and between each of them and price_range.
plt.figure(figsize=(16,16))
mat = train[['px_ratio','sc_area','sc_ratio', 'px_count','has_pc','has_fc','dp','price_range']].corr()
sns.heatmap(mat, annot=True)

Looks like new features won't do anything too important, but let's keep them where they are

# Modelling

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split


In [None]:
# Target vector and feature matrix (every column except the target).
y_full = train['price_range']
X_full = train.drop(columns=['price_range'])

There are categorical variables in data but they can have only 2 values (True, False), so there is no need for one-hot encoding. Linear models work better with scaled values, so we will scale them. But we have to be careful not to scale categorical columns

In [None]:
# Binary (0/1) columns that are left unscaled.
cat_cols = ['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi','has_pc', 'has_fc']
# Everything else in X_full is treated as numeric and standardised.
num_cols =  list(X_full.drop(cat_cols, axis = 1).columns)


# Fit the scaler on the training features, then apply the same transform to the test set.
# NOTE(review): the scaler is fitted on all of X_full before the train/validation split
# below, so validation rows contribute to the scaling statistics — consider fitting on
# X_train only (e.g. via a Pipeline) if a strictly clean validation estimate is needed.
scaler = StandardScaler()
X_full[num_cols] = scaler.fit_transform(X_full[num_cols])
test[num_cols] = scaler.transform(test[num_cols])


Since it's a classification task, we should keep the class proportions the same in both splits, so we pass y_full as `stratify`.

In [None]:
# Hold out 25% of the rows for validation, keeping class proportions equal in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X_full,
    y_full,
    test_size=0.25,
    stratify=y_full,
    random_state=42,
)

In [None]:
# Baseline comparison: mean 5-fold cross-validation score of four untuned classifiers.
models = [
    LogisticRegression(),
    RandomForestClassifier(),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
]
model_names = [type(model).__name__ for model in models]
scores = [
    cross_val_score(model, X_train, y_train, cv=5, n_jobs=3, verbose=2).mean()
    for model in models
]

In [None]:
# Bar chart of the mean CV score per model, with the value printed above each bar.
plt.figure(figsize=(10, 8))
# x / y are passed by keyword: current seaborn releases no longer accept the
# data vectors positionally.
ax = sns.barplot(x=model_names, y=scores)
for p in ax.patches:
    ax.annotate(
        "%.2f" % p.get_height(),
        (p.get_x() + p.get_width() / 2., p.get_height()),
        ha='center', va='center', fontsize=11,
        xytext=(0, 10), textcoords='offset points',
    )

Let's search for best parameters for Logistic Regression and Random Forest Classifier

In [None]:
# Hyper-parameter grid for LogisticRegression, restricted to solver/penalty pairs
# that scikit-learn actually supports:
#   * newton-cg, lbfgs, sag, saga : no penalty or L2
#   * liblinear                   : L1 or L2 (it cannot run unpenalised)
#   * saga                        : also L1
# The original grid paired 'none' with liblinear and 'l1' with newton-cg/lbfgs/sag;
# those fits fail, and L2 was never tried for the non-liblinear solvers.
c_values = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]
smooth_solvers = ['newton-cg', 'lbfgs', 'sag', 'saga']
params = [
    # NOTE: scikit-learn >= 1.2 spells "no penalty" as None; the string 'none'
    # was removed in 1.4. Adjust this value to match the installed version.
    {'solver': smooth_solvers, 'penalty': ['none']},
    {'solver': smooth_solvers, 'penalty': ['l2'], 'C': c_values},
    {'solver': ['liblinear', 'saga'], 'penalty': ['l1'], 'C': c_values},
    {'solver': ['liblinear'], 'penalty': ['l2'], 'C': c_values},
]
log_reg_cv = GridSearchCV(
    LogisticRegression(max_iter=10000),
    param_grid=params,
    cv=5,
    verbose=2,
    n_jobs=-1,
).fit(X_train, y_train)

In [None]:
# Take the estimator selected by the grid search and score it on the held-out validation split.
log_reg_model = log_reg_cv.best_estimator_
log_reg_model.score(X_val, y_val)

In [None]:
# Random forest grid: n_estimators 100..2000 (step 100, 20 values) x max_depth 3..29
# (step 2, 14 values) = 280 candidates, i.e. 1400 fits with cv=5 — this cell is slow.
params = {'n_estimators':np.arange(100,2001,100),
         'max_depth':np.arange(3,31,2)}
rfc_cv = GridSearchCV(RandomForestClassifier(), param_grid = params, cv =5, verbose = 2, n_jobs = -1).fit(X_train, y_train)
# Mean cross-validated score of the best candidate.
rfc_cv.best_score_

Random Forest Classifier didn't show any significant improvement, but Logistic Regression gets 97.8% accuracy on the validation set which I think is pretty good. Let's look at confusion matrix

In [None]:
# Confusion matrix of the tuned logistic regression on the validation split
# (rows: true class, columns: predicted class).
log_reg_model = log_reg_cv.best_estimator_
y_pred = log_reg_model.predict(X_val)
mat = confusion_matrix(y_val, y_pred)
sns.heatmap(mat, annot=True,fmt='1')

It misclassified only 13 mobile phones, which I think is good enough to move on to predicting the test dataset.

In [None]:
# Refit the selected model on all labelled data, then predict the test set.
log_reg_model.fit(X_full, y_full)
# Select the test columns by the training feature names so the model always sees
# the features in the order it was fitted on, regardless of the test frame's layout.
test_pred = log_reg_model.predict(test[X_full.columns])

In [None]:
# Two-column submission file: test row id and predicted class.
submission = pd.DataFrame({'id': test.index, 'class': test_pred})
submission.to_csv('submission.csv', index=False)

# Main columns

I wrote that I will check the main (columns with high correlation with target) features. Let's do that

In [None]:
# (rows, columns) of the reduced train / test frames.
train_main.shape, test_main.shape

We will repeat some steps from before

In [None]:
# Add pixel count and pixel aspect ratio to the reduced frames.
# `assign` returns a new frame, which avoids writing columns into what may be a
# slice of `train` / `test` (the source of SettingWithCopyWarning).
train_main = train_main.assign(
    px_count=train_main['px_width'] * train_main['px_height'],
    px_ratio=train_main['px_height'] / train_main['px_width'],
)
test_main = test_main.assign(
    px_count=test_main['px_width'] * test_main['px_height'],
    px_ratio=test_main['px_height'] / test_main['px_width'],
)

There are no categorical variables in this "new" dataset, so we can scale the whole dataset easily.

In [None]:
# Feature matrix / target for the reduced data set.
X_main_full = train_main.drop(['price_range'], axis=1)
y_main_full = train_main['price_range']

# Standardise every column. The scaler is fitted on the training features only and
# the *same* fitted transform is applied to the test set; refitting on the test set
# would put train and test on different scales.
scaler = StandardScaler()
X_main_full = scaler.fit_transform(X_main_full)
test_main = scaler.transform(test_main)

# Stratified 75/25 train/validation split.
X_main_train, X_main_val, y_main_train, y_main_val = train_test_split(
    X_main_full, y_main_full, stratify=y_main_full, random_state=42, test_size=0.25
)

In [None]:
# Same baseline comparison as before, now on the reduced feature set.
models = [
    LogisticRegression(),
    RandomForestClassifier(),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
]
model_names = [type(model).__name__ for model in models]
scores = [
    cross_val_score(model, X_main_train, y_main_train, cv=5, n_jobs=3, verbose=2).mean()
    for model in models
]
    

In [None]:
# Bar chart of the mean CV score per model, with the value printed above each bar.
plt.figure(figsize=(10, 8))
# x / y are passed by keyword: current seaborn releases no longer accept the
# data vectors positionally.
ax = sns.barplot(x=model_names, y=scores)
for p in ax.patches:
    ax.annotate(
        "%.2f" % p.get_height(),
        (p.get_x() + p.get_width() / 2., p.get_height()),
        ha='center', va='center', fontsize=11,
        xytext=(0, 10), textcoords='offset points',
    )

There has been a great improvement in KNeighborsClassifier. Let's fine-tune it. It would be better to use the elbow (knee) method, but for now I just want to decide whether KNeighborsClassifier can come near LogisticRegression.

In [None]:
# Search k = 1..19 neighbours with 5-fold cross-validation on the reduced feature set.
params = {'n_neighbors':range(1,20)}
knn_cv = GridSearchCV(KNeighborsClassifier(),param_grid = params, cv =5, verbose = 2, n_jobs = -1).fit(X_main_train, y_main_train)
print("Best score:",knn_cv.best_score_)

Looks like not

In [None]:
# Hyper-parameter grid for LogisticRegression, restricted to solver/penalty pairs
# that scikit-learn actually supports:
#   * newton-cg, lbfgs, sag, saga : no penalty or L2
#   * liblinear                   : L1 or L2 (it cannot run unpenalised)
#   * saga                        : also L1
# The original grid paired 'none' with liblinear and 'l1' with newton-cg/lbfgs/sag;
# those fits fail, and L2 was never tried for the non-liblinear solvers.
c_values = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]
smooth_solvers = ['newton-cg', 'lbfgs', 'sag', 'saga']
params = [
    # NOTE: scikit-learn >= 1.2 spells "no penalty" as None; the string 'none'
    # was removed in 1.4. Adjust this value to match the installed version.
    {'solver': smooth_solvers, 'penalty': ['none']},
    {'solver': smooth_solvers, 'penalty': ['l2'], 'C': c_values},
    {'solver': ['liblinear', 'saga'], 'penalty': ['l1'], 'C': c_values},
    {'solver': ['liblinear'], 'penalty': ['l2'], 'C': c_values},
]
log_reg_cv = GridSearchCV(
    LogisticRegression(max_iter=10000),
    param_grid=params,
    cv=5,
    verbose=2,
    n_jobs=-1,
).fit(X_main_train, y_main_train)

In [None]:
# Estimator selected by the reduced-feature grid search, scored on its validation split.
log_reg_main_model = log_reg_cv.best_estimator_
log_reg_main_model.score(X_main_val, y_main_val)

In [None]:
# Random forest grid on the reduced feature set: 3 values of n_estimators (100, 200, 300)
# x 12 values of max_depth (7..29, step 2) x 2 criteria = 72 candidates, 360 fits with cv=5.
params = {'n_estimators':np.arange(100,301,100),
         'max_depth':np.arange(7,31,2),
          "criterion": ["gini", "entropy"]}
rfc_cv = GridSearchCV(RandomForestClassifier(), param_grid = params, cv =5, verbose = 2, n_jobs = -1).fit(X_main_train, y_main_train)
# Mean cross-validated score of the best candidate.
rfc_cv.best_score_

Logistic Regression outperformed Random Forest Classifier again. With only 6 features we were able to get 96.8%, which is only 1% less than the accuracy of the model trained with all features. Even though the accuracy is lower than that of the previous model, we can train our models much faster now, and if we had more samples in our train set, the difference in training time would be much easier to see. I will stop here and consider Logistic Regression the best model for this task. If you think this notebook is useful, please upvote. If you have any suggestions, please write them in the comments so that I can improve.