In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

# Load the dataset

In [None]:
# Read train.csv of the mobile price dataset into a DataFrame.
# The path is relative to the notebook's working directory.
df = pd.read_csv('../input/mobile-price-classification/train.csv')

In [None]:
# (rows, columns) of the loaded frame
df.shape

In [None]:
# Peek at the first rows of the frame
df.head()

In [None]:
# Check for missing values: the first .any() reduces each column to a flag,
# the second collapses those flags into a single boolean for the whole frame.

df.isnull().any().any()

In [None]:
# Show every column name on a single line
print(df.columns.tolist())

# Feature Description

## Categorical Variables

1. blue - Has bluetooth or not
2. dual_sim - Has dual sim support or not
3. four_g - Has 4G or not
4. n_cores - Number of cores of processor
5. three_g - Has 3G or not
6. touch_screen - Has touch screen or not
7. wifi - Has wifi or not
8. price_range - This is the target variable with value of 0(low cost), 1(medium cost), 2(high cost) and 3(very high cost).


## Numerical Variables

1. battery_power - Total energy a battery can store at one time, measured in mAh
2. clock_speed - Speed at which microprocessor executes instructions
3. fc - Front camera mega pixels
4. int_memory - Internal Memory in Gigabytes
5. m_dep - Mobile depth in cm
6. mobile_wt - Weight of mobile phone
7. pc - Primary Camera mega pixels
8. px_height - Pixel Resolution Height
9. px_width - Pixel Resolution Width
10. ram - Random Access Memory in Mega Bytes
11. sc_h - Screen Height of mobile in cm
12. sc_w - Screen Width of mobile in cm
13. talk_time - Longest time that a single battery charge will last when you are talking on the phone

# Exploratory Data Analysis

## Let's first look at the correlation between the independent variables and the target variable

In [None]:
# Pairwise correlation between all columns (DataFrame.corr defaults to Pearson);
# kept in `corr` so the ranking cell further down can reuse it.
corr = df.corr()
corr

In [None]:
# Annotated heatmap of the full correlation matrix
fig, ax = plt.subplots(figsize=(17, 10))
sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='Reds', ax=ax)
plt.show()

In [None]:
# Rank the features by their correlation with price_range, strongest first;
# ram turns out to be highly correlated with the price range.

corr['price_range'].drop(labels='price_range').sort_values(ascending=False)

In [None]:
# How many phones fall into each price range:
# 0 (low cost), 1 (medium cost), 2 (high cost) and 3 (very high cost)

sns.countplot(data=df, x='price_range')
plt.show()

## Let's explore RAM and the categorical variables

### Distribution of data based on ram

In [None]:
# RAM density per price range, one filled KDE curve per class.
# `fill=True` replaces the deprecated `shade=True`, and every curve carries its
# own label so the legend cannot drift out of sync with the plotting order.
ram_legend = {
    0: 'low cost ram',
    1: 'medium cost ram',
    2: 'high cost ram',
    3: 'very high cost ram',
}
plt.figure(figsize=(10,8))
for price_class, curve_label in ram_legend.items():
    sns.kdeplot(df.loc[df['price_range'] == price_class, 'ram'], fill=True, label=curve_label)
plt.legend()
plt.show()

### Bluetooth and Price range

In [None]:
# Number of phones with and without bluetooth
sns.countplot(data=df, x='blue')
plt.show()

In [None]:
# Having or not having bluetooth does not, on its own, predict the price_range

sns.countplot(data=df, x='blue', hue='price_range')
plt.show()

### Dual sim feature and Price range

In [None]:
# Number of phones with and without dual sim support
sns.countplot(data=df, x='dual_sim')
plt.show()

In [None]:
# The price ranges are distributed equally among phones with and without the
# dual sim feature, so this feature makes it hard to classify the price range.

sns.countplot(data=df, x='dual_sim', hue='price_range')
plt.show()

### Phone is 4g and price range

In [None]:
# Number of phones with and without 4G
sns.countplot(data=df, x='four_g')
plt.show()

In [None]:
# Same picture as for dual sim: 4G support does not separate the price ranges

sns.countplot(data=df, x='four_g', hue='price_range')
plt.show()

### Number of core processors and price range

In [None]:
# Number of phones per processor core count
sns.countplot(data=df, x='n_cores')
plt.show()

In [None]:
# One countplot of price_range per n_cores value, four facets per row.
# `order` is passed explicitly so every facet draws the classes in the same order.
n_cores = sns.FacetGrid(df, col = "n_cores",hue='n_cores',col_wrap=4)
n_cores.map(sns.countplot,'price_range',order=[0,1,2,3])
plt.show()

### Phone is 3g and price range

In [None]:
# Number of phones with and without 3G
sns.countplot(data=df, x='three_g')
plt.show()

In [None]:
# The price ranges are again distributed equally with and without 3G

sns.countplot(data=df, x='three_g', hue='price_range')
plt.show()

### Touchscreen phone and price range

In [None]:
# Number of phones with and without a touch screen
sns.countplot(data=df, x='touch_screen')
plt.show()

In [None]:
# Touch screen counts broken down by price range
sns.countplot(data=df, x='touch_screen', hue='price_range')
plt.show()

### Wifi and Price range

In [None]:
# Number of phones with and without wifi
sns.countplot(data=df, x='wifi')
plt.show()

In [None]:
# Wifi counts broken down by price range
sns.countplot(data=df, x='wifi', hue='price_range')
plt.show()

## Top 5 features with highest correlation to target variable

In [None]:
# Box plots of the five selected features, drawn as two figures:
# a 1x2 grid (ram, battery_power) followed by a 1x3 grid.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
first_row = [
    ('ram', 'Ram', {}),
    ('battery_power', 'Battery power', {'color': 'red'}),
]
for panel, (feature, panel_title, style_kws) in zip(ax, first_row):
    sns.boxplot(y=feature, data=df, width=0.4, ax=panel, **style_kws)
    panel.set_title(panel_title)

fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(15, 4))
second_row = [
    ('px_width', 'Pixel resolution width', {'color': 'green'}),
    ('px_height', 'Pixel resolution height', {'color': 'yellow'}),
    ('int_memory', 'Internal Memory (GB)', {'color': 'violet'}),
]
for panel, (feature, panel_title, style_kws) in zip(ax, second_row):
    sns.boxplot(y=feature, data=df, width=0.4, ax=panel, **style_kws)
    panel.set_title(panel_title)
plt.show()

In [None]:
# Summary statistics, one row per feature; note that px_height has a minimum of 0
df[['ram','battery_power','px_width','px_height','int_memory']].describe().transpose()

In [None]:
# Keep only the rows whose px_height is strictly greater than 75
# (i.e. drop every row with a px_height of 75 or less).
df1 = df.loc[df['px_height'] > 75]

# Model building

* I will only consider the top five features with the highest correlation to the target variable
* I didn't include the categorical variables since most of them have an equal distribution across the price ranges

In [None]:
# Modelling frame: the five selected features plus the target column
df2 = df1.loc[:, ['ram','battery_power','px_width','px_height','int_memory','price_range']]

In [None]:
# (rows, columns) left after filtering and feature selection
df2.shape

In [None]:
# Peek at the first rows of the modelling frame
df2.head()

## Separate independent variable to target variable

In [None]:
# Features: every column except the target
X = df2.drop(columns=['price_range'])
# Target: the price range class
y = df2['price_range']

In [None]:
# First rows of the feature matrix
X.head()

In [None]:
# First values of the target
y.head()

## Knn Algorithm 

In [None]:
from sklearn.model_selection import GridSearchCV, ShuffleSplit, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Row counts of the two splits
print('Length of train dataset: ',X_train.shape[0])
print('Length of test dataset: ',X_test.shape[0])

### Scale the data first

In [None]:
# Learn each feature's min/max from the training split only, then apply that
# same scaling to both splits.
scaler = MinMaxScaler().fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

### Applying the algorithm

In [None]:
# K-nearest-neighbours classifier with scikit-learn's default settings,
# trained on the scaled training features.
knn_model = KNeighborsClassifier()
knn_model.fit(scaled_X_train,y_train)

In [None]:
# Predicted price range for every row of the scaled test split
y_predicted = knn_model.predict(scaled_X_test)

### Confusion matrix and Classification Report

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [None]:
# Rows are the true classes, columns the predicted ones
confusion_matrix_result = confusion_matrix(y_true=y_test, y_pred=y_predicted)
confusion_matrix_result

In [None]:
# Confusion matrix as a heatmap, with the counts printed in each cell
sns.heatmap(data=confusion_matrix_result, cmap='Reds', annot=True, fmt='.0f')
plt.show()

In [None]:
# Share of test rows whose price range was predicted correctly
print('KNN accuracy score: ',accuracy_score(y_true=y_test, y_pred=y_predicted))

In [None]:
# Per-class precision, recall and F1 for the current predictions
classification_report_result = classification_report(y_true=y_test, y_pred=y_predicted)
print(classification_report_result)

# Find best model using GridSearchCV

* I will create a separate pipeline for each model's GridSearchCV rather than a single script containing all the models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline, make_pipeline

## Logistic Regression

In [None]:
# Scale the features, then fit a logistic regression; the grid tries three
# values of C (inverse regularisation strength) with 5-fold cross-validation.
pipe = make_pipeline(MinMaxScaler(), LogisticRegression())
grid_param = [
    dict(
        logisticregression=[LogisticRegression()],
        logisticregression__C=[0.8, 1, 1.2],
    )
]

# build the grid search over the pipeline, then fit it on the training split
gridsearch = GridSearchCV(
    estimator=pipe,
    param_grid=grid_param,
    cv=5,
    verbose=0,
    n_jobs=-1,
)
best_model = gridsearch.fit(X_train, y_train)

print(best_model.best_estimator_)
# score() evaluates the refitted best pipeline on the held-out test split
print('\nThe mean accuracy using Logistic Regression is: ',best_model.score(X_test,y_test))

## SVM

In [None]:
# Grid-search an SVM over the linear and rbf kernels with 5-fold cross-validation.
# NOTE(review): unlike the logistic regression pipeline, no scaler is included
# here, so the SVM sees the raw feature values — confirm this is intended.
pipe = make_pipeline(SVC())
grid_param = [
    dict(
        svc=[SVC()],
        svc__kernel=['linear', 'rbf'],
    )
]

# build the grid search over the pipeline, then fit it on the training split
gridsearch = GridSearchCV(
    estimator=pipe,
    param_grid=grid_param,
    cv=5,
    verbose=0,
    n_jobs=-1,
)
best_model = gridsearch.fit(X_train, y_train)

print(best_model.best_estimator_)
# score() evaluates the refitted best pipeline on the held-out test split
print('\nThe mean accuracy using SVM is: ',best_model.score(X_test,y_test))

## Decision Tree

In [None]:
# Grid-search a decision tree over the splitter strategy and min_samples_split
# using 5-fold cross-validation on the training split.
# random_state is pinned (0, matching train_test_split) because tree building
# is stochastic; without a seed the selected model and the reported accuracy
# change from one run to the next.
pipe = make_pipeline(DecisionTreeClassifier(random_state=0))
grid_param = [
                {
                    # this entry replaces the pipeline step, so it needs the seed too
                    'decisiontreeclassifier': [DecisionTreeClassifier(random_state=0)],
                    'decisiontreeclassifier__splitter': ['best','random'],
                    'decisiontreeclassifier__min_samples_split': range(2,20)
                }
]

# create a gridsearch of the pipeline, then fit the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search
best_model = gridsearch.fit(X_train,y_train)

print(best_model.best_estimator_)
# score() evaluates the refitted best pipeline on the held-out test split
print('\nThe mean accuracy using Decision Tree is: ',best_model.score(X_test,y_test))

## Random Forest

In [None]:
# Grid-search a random forest over n_estimators and min_samples_split using
# 5-fold cross-validation on the training split.
# random_state is pinned (0, matching train_test_split) because the forest
# bootstraps rows and samples features at random; without a seed the selected
# model and the reported accuracy change from one run to the next.
pipe = make_pipeline(RandomForestClassifier(random_state=0))
grid_param = [
                {
                    # this entry replaces the pipeline step, so it needs the seed too
                    'randomforestclassifier': [RandomForestClassifier(random_state=0)],
                    'randomforestclassifier__n_estimators': range(11,51,5),
                    'randomforestclassifier__min_samples_split': range(2,20)
                }
]

# create a gridsearch of the pipeline, then fit the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search
best_model = gridsearch.fit(X_train,y_train)

print(best_model.best_estimator_)
# score() evaluates the refitted best pipeline on the held-out test split
print('\nThe mean accuracy using Random Forest is: ',best_model.score(X_test,y_test))

### We can see that SVM gives us the highest accuracy

In [None]:
# Refit a stand-alone linear-kernel SVM on the training split.
# NOTE(review): the kernel is hard-coded rather than read from the SVM grid
# search result, and the features are passed in unscaled — confirm both are intended.
svm_model = SVC(kernel='linear')
svm_model.fit(X_train,y_train)

In [None]:
# Predict on the test split; this overwrites the KNN predictions held in y_predicted
y_predicted = svm_model.predict(X_test)

In [None]:
# Rows are the true classes, columns the predicted ones
confusion_matrix_result = confusion_matrix(y_true=y_test, y_pred=y_predicted)
confusion_matrix_result

In [None]:
# Confusion matrix as a heatmap, with the counts printed in each cell
sns.heatmap(data=confusion_matrix_result, cmap='Reds', annot=True, fmt='.0f')
plt.show()

In [None]:
# Share of test rows whose price range was predicted correctly
print('SVM accuracy score: ',accuracy_score(y_true=y_test, y_pred=y_predicted))

In [None]:
# Per-class precision, recall and F1 for the current predictions
classification_report_result = classification_report(y_true=y_test, y_pred=y_predicted)
print(classification_report_result)