In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found in it
for folder, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in files)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Objective:
Classify a mobile phone into one of 4 price ranges based on the features & specifications of the device

### Data description:  
The dataset has 20 features-
* battery_power: battery capacity in mAh  
* blue: has bluetooth or not   
* clock_speed: processor speed   
* dual_sim: has dual sim or not   
* fc: front camera mega pixel   
* four_g: 4G or not  
* int_memory: internal memory in GB   
* m_dep: depth in cm   
* mobile_wt: weight   
* n_cores: core processors   
* pc: primary cam resolution   
* px_height: height in pixel  
* px_width: width in pixel   
* ram: RAM   
* sc_h: screen height   
* sc_w: width  
* talk_time: max talk time in single charge  
* three_g: 3G or not  
* touch_screen: touch screen or not   
* wifi: has wifi or not   


In [None]:
# load the necessary packages
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
# Read the train and test CSV files from the competition's input directory
input_path = r'../input/mobile-price-classification'
train_csv = os.path.join(input_path, 'train.csv')
test_csv = os.path.join(input_path, 'test.csv')

df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

# Preview the first rows of the training data
df_train.head()

In [None]:
# Summary statistics of the training data (used below to spot implausible minimum values)
df_train.describe()

In [None]:
# Summary statistics of the unlabeled test data, for comparison with the training data
df_test.describe()

Note: Both px_height & sc_w have minimum values of 0, which doesn't make sense. Hence we count the entries having that issue and remove every row where either value is 0.

In [None]:
# Count the rows whose sc_w / px_height is recorded as 0 in the training data
train_zero_sc_w = int((df_train.sc_w == 0).sum())
train_zero_px_height = int((df_train.px_height == 0).sum())
print("There are %d and %d entries where sc_w and px_height equal to 0, respectively in the train dataset"
      % (train_zero_sc_w, train_zero_px_height))

# Same counts for the unlabeled test data
test_zero_sc_w = int((df_test.sc_w == 0).sum())
test_zero_px_height = int((df_test.px_height == 0).sum())
print("There are %d and %d entries where sc_w and px_height equal to 0, respectively in the test dataset"
      % (test_zero_sc_w, test_zero_px_height))

In [None]:
# Keep only the rows where both px_height & sc_w are non-zero.
# .copy() makes each result an independent frame, so the columns that later cells
# add to these frames do not trigger pandas' SettingWithCopyWarning.
# NOTE(review): rows dropped from df_test never receive a prediction further down.
df_test = df_test[(df_test.px_height!=0) & (df_test.sc_w!=0)].copy()
df_train = df_train[(df_train.px_height!=0) & (df_train.sc_w!=0)].copy()
df_train.info()

### Data Visualization & Preprocessing

In [None]:
# Pairwise correlation of all training columns, drawn on a fixed [-1, 1] colour scale
corr_matrix = df_train.corr()
sns.heatmap(corr_matrix, vmin=-1, vmax=1)

Note: Feature pairs of (px_height, px_width), (sc_h, sc_w), and (pc, fc) are highly correlated. Therefore we combine each pair into a single feature by taking the geometric mean (the square root of the product): tot_res = sqrt(px_height * px_width), dim = sqrt(sc_h * sc_w), and cam = sqrt(pc * fc). Next, we notice that the pair (three_g, four_g) shows correlation as well. We replace the pair by the feature network = (2 * four_g + three_g) / 2. Its value equals 1.5 for a phone that is both 4G and 3G compatible, while only 4G or only 3G produces a value of 1 and 0.5, respectively (and 0 for a phone with neither).

In [None]:

# feature modification
# Each correlated pair is collapsed into one feature (square root of the product),
# and the 3G/4G flags are merged into a single 'network' score.
raw_feature_cols = ['four_g','three_g','pc','fc','px_height', 'px_width', 'sc_h', 'sc_w']

# Guard so the cell can be re-run: once the raw columns are dropped, the
# engineered columns already exist and must not be recomputed.
if set(raw_feature_cols).issubset(df_train.columns):
    # .assign returns a new frame instead of writing into a filtered slice,
    # which avoids pandas' SettingWithCopyWarning.
    df_train = (
        df_train
        .assign(
            tot_res=lambda d: np.sqrt(d.px_height * d.px_width),
            dim=lambda d: np.sqrt(d.sc_h * d.sc_w),
            cam=lambda d: np.sqrt(d.pc * d.fc),
            network=lambda d: (d.four_g*2 + d.three_g)/2,
        )
        .drop(columns=raw_feature_cols)
    )

# separate the design matrix & the label
X, Y = df_train.drop(['price_range'], axis=1), df_train["price_range"]

In [None]:
# Pairwise scatter/density plots of selected features, coloured by price range
pairplot_columns = ['battery_power', "tot_res", "dim", "cam", "network", "price_range"]
pairplot_frame = df_train[pairplot_columns]
sns.pairplot(pairplot_frame, hue = "price_range")

In [None]:
# Joint KDE of the engineered network feature against the price range
sns.jointplot(data = df_train, x = 'network', y = "price_range", kind = "kde")

In [None]:
# Joint KDE of battery power against the price range
sns.jointplot(data = df_train, x = 'battery_power', y = "price_range", kind = "kde")

In [None]:
# Joint KDE of RAM against the price range
sns.jointplot(data = df_train, x = 'ram', y = "price_range", kind = "kde")

In [None]:
# Joint KDE of the wifi flag against the price range
sns.jointplot(data = df_train, x = 'wifi', y = "price_range", kind = "kde")

## Training & Classification
We will use three different classifiers, (a) logistic regression, (b) support vector machine, and (c) naive Bayes, and select a model based on *accuracy & robustness*. The training dataset is split into a train set and a test set. We will split the data at different proportions and compare the accuracy score of each model in each case.

In [None]:
# Compare the three classifiers over several hold-out fractions (fixed random state).

# define the classifiers
## a. logistic reg
lr_model = linear_model.LogisticRegression(penalty='l1', solver='saga',
                                            max_iter=400, random_state=0)

## b. support vector machine
svm_model = SVC(random_state = 0)

## c. naive bayes
nb_model = GaussianNB()


test_size = [0.1,0.15,0.2,0.25,0.3]

# One record per split; the summary frame is built once after the loop because
# DataFrame.append was removed in pandas 2.0 (and grew the frame quadratically).
score_records = []
for x in test_size:
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = x, random_state = 0)

    # train svm and naive bayes on the unscaled features, and score them on the test set
    acc_svm = svm_model.fit(X_train, Y_train).score(X_test, Y_test)
    acc_nb = nb_model.fit(X_train, Y_train).score(X_test, Y_test)

    ## scale the features to run logistic regression model
    # The scaler is fitted on the training split ONLY and then applied to both
    # splits. Re-fitting it on the test split would standardise the two sets with
    # different means/variances and leak test statistics into the evaluation.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    acc_log = lr_model.fit(X_train, Y_train).score(X_test, Y_test)

    score_records.append({"test_size": x,
                          "log_res": acc_log,
                          "svm": acc_svm,
                          "nb": acc_nb})

acc_scores = pd.DataFrame(score_records, columns = ["test_size","log_res","svm","nb"])
acc_scores

From the summary of accuracy scores, logistic regression seems to be the model of choice, and a test size in the range of 15% - 25% seems ideal. However, the svm accuracy scores remain more or less consistent across the test-train splits. To probe the stability of the models further, we vary the *random state* and compare the change in accuracy scores due to the variation.

In [None]:
# Compare the three classifiers over several random states (fixed 15% hold-out).
r_state = [0,42,82,112,200]

# One record per random state; the summary frame is built once after the loop
# because DataFrame.append was removed in pandas 2.0.
rstate_records = []

for x in r_state:
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.15,
                                                        random_state = x)

    # define the classifiers
    ## a. logistic reg
    lr_model = linear_model.LogisticRegression(penalty='l1', solver='saga',
                                               max_iter=400, random_state=x)

    ## b. support vector machine
    svm_model = SVC(random_state = x)

    ## c. naive bayes
    nb_model = GaussianNB()

    # train svm and naive bayes on the unscaled features, and score them on the test set
    acc_svm = svm_model.fit(X_train, Y_train).score(X_test, Y_test)
    acc_nb = nb_model.fit(X_train, Y_train).score(X_test, Y_test)

    ## scale the features to run logistic regression model
    # Fit the scaler on the training split ONLY and reuse it for the test split;
    # re-fitting on the test split would standardise the two sets differently
    # and leak test statistics into the evaluation.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    acc_log = lr_model.fit(X_train, Y_train).score(X_test, Y_test)

    rstate_records.append({"random_state": str(x),
                           "log_res": acc_log,
                           "svm": acc_svm,
                           "nb": acc_nb})

acc_rstate = pd.DataFrame(rstate_records, columns = ["random_state","log_res","svm","nb"])
acc_rstate

It is problematic to see that the accuracy score varies significantly as we vary the random state, and it raises questions about the robustness of the models. Let us review the confusion matrices produced by the logistic regression classifier with the random state set to 82 and 200.

In [None]:
# Print the logistic-regression confusion matrix for two of the random states above
for x in [82, 200]:
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.15,
                                                        random_state = x)
    lr_model = linear_model.LogisticRegression(penalty='l1', solver='saga',
                                               max_iter=400, random_state=x)

    ## scale the features to run logistic regression model
    # Fit the scaler on the training split ONLY and reuse it for the test split,
    # so both are standardised with the same mean/variance and no test
    # statistics leak into the evaluation.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    print("Confusion matrix with random state = %d" %x)
    print(confusion_matrix(Y_test, lr_model.fit(X_train, Y_train).predict(X_test)))

The model with random state = 200 works poorly, especially in classifying mobiles in the price range of 1 and 2. Let us apply the logistic regression classifier to the unknown test dataset and predict the price range

In [None]:
# Predict the price range of the unlabeled test set with the logistic regression model.

sample_id = df_test['id'].values

# feature modification (same transformations as applied to the training data)
# Dropping 'id' by name and using .assign builds a new frame instead of writing
# into a slice of df_test, which avoids pandas' SettingWithCopyWarning.
X_new = (
    df_test
    .drop(columns=['id'])
    .assign(
        tot_res=lambda d: np.sqrt(d.px_height * d.px_width),
        dim=lambda d: np.sqrt(d.sc_h * d.sc_w),
        cam=lambda d: np.sqrt(d.pc * d.fc),
        network=lambda d: (d.four_g * 2 + d.three_g)/2,
    )
    .drop(['four_g','three_g','pc','fc','px_height', 'px_width', 'sc_h', 'sc_w'], axis = 1)
)

# Put the columns in the training order: after scaling, the model only sees
# positional arrays. A missing column raises a KeyError here instead of
# silently producing wrong predictions.
X_new = X_new[X.columns]

# Same split as the random_state = 82 run evaluated above; the held-out part is unused here
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.15,
                                                        random_state = 82)

# define the logistic regression classifier
lr_model = linear_model.LogisticRegression(penalty='l1', solver='saga',
                                            max_iter=400, random_state=82)

# scale the features to run logistic regression model
# The scaler is fitted on the training data ONLY; the new samples must be
# standardised with the same mean/variance the model was trained on.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_new = scaler.transform(X_new)

# predict the target using logistic regression classifier
pred = lr_model.fit(X_train, Y_train).predict(X_new)

final_res = pd.DataFrame({'id': sample_id,
                          "predicted_price_range": pred})

final_res