In [1]:
from IPython.display import display, HTML
# Stretch the notebook container to the full browser width so wide tables
# and figures are not clipped.
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

In [1]:
# -*- coding: utf-8 -*-
"""Mobile_Price_Range_PredictionSHU.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Cshk_Vq6Ih0mll_HzEIdshnUOLHgjnvx

### <font color =red > **Problem Statement:**
 <font color =green >**In the competitive mobile phone market companies want
to understand sales data of mobile phones and factors which drive the prices.
The objective is to find out some relation between features of a mobile phone(eg:- RAM,
Internal Memory, etc) and its selling price. In this problem, we do not have to predict the
actual price but a price range indicating how high the price is.**
### <font color =red >**Data Description -**
* **Battery_power** - Total energy a battery can store in one time measured in mAh
* **Blue** - Has bluetooth or not
* ***Clock_speed*** - speed at which microprocessor executes instructions
* ***Dual_sim*** - Has dual sim support or not
* ***Fc*** - Front Camera mega pixels
* ***Four_g*** - Has 4G or not
* ***Int_memory*** - Internal Memory in Gigabytes
* ***M_dep*** - Mobile Depth in cm
* ***Mobile_wt*** - Weight of mobile phone
* ***N_cores*** - Number of cores of processor
* ***Pc*** - Primary Camera mega pixels
* ***Px_height*** - Pixel Resolution Height
* ***Px_width*** - Pixel Resolution Width
* ***Ram*** - Random Access Memory in Mega Bytes
* ***Sc_h*** - Screen Height of mobile in cm
* ***Sc_w*** - Screen Width of mobile in cm
* ***Talk_time*** - longest time that a single battery charge will last when you are on a call
* ***Three_g*** - Has 3G or not
* ***Touch_screen*** - Has touch screen or not
* ***Wifi*** - Has wifi or not
* ***Price_range*** - This is the target variable with value of 
* 0(low cost), 
* 1(medium cost),
* 2(high cost) and
* 3(very high cost).
* Thus our target variable has 4 categories so basically it is a Multiclass classification problem.
"""

# Commented out IPython magic to ensure Python compatibility.
# importing important libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline
# Show up to 30 columns whenever a DataFrame is displayed.
pd.options.display.max_columns = 30

# Apply seaborn's default theme first: sns.set() itself writes to rcParams,
# so the overrides below have to come after it.
sns.set()

# Bold fonts plus larger title / label / tick / legend text for every figure.
plt.rcParams.update({
    "font.weight": "bold",
    "axes.labelweight": "bold",
    "axes.titlesize": 25,
    "axes.titleweight": 'bold',
    "xtick.labelsize": 15,
    "ytick.labelsize": 15,
    "axes.labelsize": 20,
    "legend.fontsize": 15,
    "legend.title_fontsize": 15,
})

# Silence every library warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

# importing all essential libraries.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, plot_confusion_matrix,confusion_matrix,roc_curve,roc_auc_score,auc
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
# import KNN imputer frio sklearn
from sklearn.impute import KNNImputer

# Load the dataset.
# On Google Colab the CSV is read from the mounted Drive. Outside Colab the
# `google.colab` package cannot be imported, so the mount is skipped and the
# CSV is looked up in the working directory instead.
COLAB_CSV_PATH = '/content/drive/MyDrive/1.Project/Mobile-Price-Range-Prediction/data_mobile_price_range.csv'
# NOTE(review): assumed local location of the file -- adjust if it lives elsewhere.
LOCAL_CSV_PATH = 'data_mobile_price_range.csv'

try:
    from google.colab import drive
except ImportError:
    csv_path = LOCAL_CSV_PATH
else:
    drive.mount('/content/drive')
    csv_path = COLAB_CSV_PATH

# CSV file location path
df=pd.read_csv(csv_path,encoding ='latin')

# Peek at the first five rows.
df.head(5)

# ... and at the last five rows.
df.tail(5)

# Dataset dimensions as (rows, columns).
print(df.shape)

# Summary statistics, one row per column.
df.describe().transpose()

"""<font color = red >Some Basic Observations:

<font color = Blue >* We can see that sc_w (screen width) and px_height have a minimum value of 0, which is not possible for any real mobile. We need to handle these invalid values.
"""

# Number of rows whose screen width is recorded as 0.
print((df['sc_w'] == 0).sum())

# Number of rows whose pixel height is recorded as 0.
print((df['px_height'] == 0).sum())

# Rows with px_height == 0 are few, so they are simply dropped.
has_px_height = df['px_height'] != 0
df = df[has_px_height]



"""##<font color = red > Nearest Neighbor Imputation (KNNImputer)

<font color = Blue >Missing values are imputed using the k-Nearest Neighbors approach where a Euclidean distance is used to find the nearest neighbors.

<font color = Blue >Let’s apply it to the `sc_w` column of our mobile dataset to see how it works.
* <font color = Blue >Before using KNNImputer we need to replace the 0 values with NaN, because the imputer only fills in missing (NaN) entries.
"""

# Replace sc_w == 0 with NaN and fill the gaps with a 1-nearest-neighbour imputer.
#
# * The imputer only sees the feature columns. The target `price_range` is left
#   out so the label does not influence the imputed feature values, and it
#   keeps its original dtype instead of being cast to float.
# * `df` is a filtered slice at this point, so a new frame is built instead of
#   assigning a column onto the slice.
column_order = list(df.columns)
feature_cols = [name for name in column_order if name != 'price_range']
features = df[feature_cols].assign(sc_w=df['sc_w'].replace(0, np.nan))

impute_knn = KNNImputer(n_neighbors=1)
features_imputed = pd.DataFrame(impute_knn.fit_transform(features), columns=feature_cols)

# Re-attach the target by position (the imputed frame has a fresh 0..n-1 index)
# and restore the original column order.
df = features_imputed.assign(price_range=df['price_range'].to_numpy())[column_order]

# Checking shape
df.shape

# Rows still having sc_w == 0 (expected: none after imputation).
df[df['sc_w']==0].shape[0]

"""**<font color = Blue >Thus we have handled the mismatched values of the data.**"""

# Checking the datatypes, non null values
df.info()

"""* **<font color = green >We don't have any object data type in our data set.**
* **<font color = green >Also we have Zero null values in data set.**
* **<font color = green >Price_range is our target variable.**
"""

# Per-column count of missing values.
df.isnull().sum()

# Number of fully duplicated rows.
print(f' We have {df.duplicated().sum()} duplicate values in dataset.')

"""# <font color = red >**EDA( Exploratory Data Analysis)**"""

# lets have look at our target variable's counts
price_range_values=df['price_range'].value_counts()
price_range_values

<font color = green >***Wow! We have an almost equal number of observations for each category, so the target variable is not imbalanced. Accuracy score will therefore be the best evaluation metric for selecting the model.***<font color = green >

<font color = green >***This is the target variable with value of***<font color = green >
* <font color = green >***0=low cost,***
* <font color = green >***1=medium cost,***
* <font color = green >***2=high cost,***
* <font color = green >***3=very high cost.***
"""

# Visualizing the target variable's class distribution.
# value_counts() orders the classes by frequency, while `labels` is ordered by
# class id (0..3). Sorting by class id first makes every wedge get its own label.
labels = ["low cost", "medium cost", "high cost", "very high cost"]
price_range_values.sort_index().plot.pie(explode=[0.05]*4,labels=labels,autopct='%1.1f%%',figsize=(12,8),fontsize=15)

"""<font color = green >* **We can see that our target variable is equally distributed.** 

<font color = green >* **Thus we don't have to worry about data imbalance and there is no need of oversampling or undersampling.Which is good for us.**

##<font color = red > **Correlation of the target (dependent) variable with the independent variables.**
"""

# Pairwise correlation of all columns, drawn as an annotated heatmap.
corr = df.corr()
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(corr, annot=True, cmap=plt.cm.Accent_r, ax=ax)

"""###<font color = red > **Observations:**
* ***RAM has a strong positive correlation with Price_range, and we know that mobiles with high RAM are very costly. Thus, as RAM increases, the price range also increases.***
* ***Battery_power also has positive correlation with the price range.Generally mobiles having high prices comes with good battery power.***
* ***Also px_height and px_width (Pixel Resolution Height and width) are positively correlated. Generally High price range mobiles have good resolutions.***

* ***Four_g and Three_g are highly positvely correlated. Nowdays most of the smart mobiles has both type of options. This could be the reason that they are correlated.***

* ***primary camera i.e pc and front camera fc are positively correlated.***
* ***sc_h and sc_w are positively correlated.***
"""

df.columns

"""# <font color = red >**Univariate Analysis of Categorical columns.**"""




# Plotting the pie charts for the binary categorical variables.
# value_counts() sorts by frequency, so the counts are reordered to (1, 0)
# before the positional 'Yes'/'No' labels are applied. Without this, 'Yes'
# would be attached to whichever value happens to be more common.
plt.figure(figsize=(15,10))
rows=2   # six charts fit a 2 x 3 grid
col=3
var_list=['blue','dual_sim','four_g','three_g','wifi','touch_screen']
labels=['Yes','No']
for count, var in enumerate(var_list, start=1):
  plt.subplot(rows,col,count)
  df[var].value_counts().reindex([1, 0], fill_value=0).plot.pie(autopct='%1.1f%%',fontsize=12,labels=labels)
  plt.title(f'has {var} or not',fontsize=14)
# Lay the grid out once, after every subplot exists.
plt.tight_layout()

"""<font color = green >* ***1 means it has the specifications.***

<font color = green >* ***0 means it do not have the specifications.***

<font color = green >* ***Percentage Distribution of Mobiles having bluetooth,dual sim, 4G,wifi and touchscreen are almost 50 %.***

<font color = green >* ***very few mobiles(23.9%) do not have Three_g.***

##<font color = red > **Relation Between RAM,Battery_power,px_height and px_width**
"""

# One bar chart per key feature (2 x 2 grid): feature value on y, price range on x.
labels=['0=low_cost','1=medium_cost','2=high_cost','3=very_high_cost']
rows, cols = 2, 2
variables=['ram','battery_power','px_height','px_width']
fig, axes = plt.subplots(rows, cols, figsize=(20,10))
for ax, var in zip(axes.ravel(), variables):
  sns.barplot(x=df['price_range'], y=df[var], ax=ax)
  ax.set_title(f'Price range v/s {var}')
fig.tight_layout()

"""###<font color = red > **Observations:**
<font color = green >* ***Mobiles having RAM more than 3000MB falls under Very high cost category.As RAM increases price range also increases.***

<font color = green >* ***Mobiles having RAM less than 1000 MB falls under low cost category.***

<font color = green >* ***Mobiles with battery power more than 1300 mAh has very high cost. And Mobiles with battery power between 1200 and 1300 mAh falls under medium and high cost category.***

<font color = green >* ***Mobiles with more than 700 pixel height and width more than 1300 has very high cost.***
"""

# Checking the counts of binary categorical variables by grouping price range.
# Within every price range, 'value_counts' is applied to each of the six binary
# feature columns; unstack() then pivots the innermost index level into columns.
# NOTE(review): the result shape of a multi-column 'value_counts' aggregation
# may depend on the pandas version -- confirm against the installed pandas.
grup_by_price=df.groupby(['price_range']).agg({'blue':'value_counts','dual_sim':'value_counts','four_g':'value_counts','three_g':'value_counts','touch_screen':'value_counts','wifi':'value_counts'}).unstack()

# Visualising the grouped counts as a bar chart: one group of bars per price
# range, one bar per (feature, value) column of `grup_by_price`.
grup_by_price.plot.bar(figsize=(25,15))
plt.title('Count of phones in each price range with supported or not supported mobile specifications.')
plt.xlabel('Price range')
plt.ylabel('Count of phones')
plt.legend(loc='upper center')

"""###<font color = red > **Observations**
 
<font color = blue >***We can see that each price range category has equal number of mobiles phones having both supporting and non supporting specifications.***
"""

# For each price range, count the phones at every value of n_cores and m_dep
# and draw one grouped bar chart per feature.
list_2=['n_cores','m_dep']
for item in list_2:
  counts_by_range = df.groupby(['price_range'])[item].value_counts().unstack()
  ax = counts_by_range.plot.bar(figsize=(15,6))
  ax.set_title(f'Price range grouped by {item}')
  ax.set_ylabel('No. of phones')

"""###<font color = red >**Observation** 
* ***There are very few mobiles in price range 0 and 1 with lesser no of cores.***
* ***Most of the mobiles in price range 2 and 3 are with high no of cores.***

* ***Number of phones with less thickness is high and count of phones with high thickness is low.***

#<font color = red >**Let's Check which numerical feature is driving the price range most.**
"""

df.columns

list_1=['battery_power','clock_speed','fc','int_memory','m_dep','mobile_wt','n_cores','pc','px_height','px_width','sc_h','ram','sc_w','talk_time']
# One line plot per numeric feature: feature value (y) against price range (x).
counter=1
for var in list_1:
    plt.figure(figsize=(10,6))
    sns.lineplot(data=df, x='price_range', y=var)

"""###<font color = green > **Observations**
* ***For class 1 and class 2 the battery power range is almost similar. As battery power increases, price also increases, which is quite obvious.***
* ***Mobiles in the very high price range (class 3) weigh less than those in the other classes. That means that as the weight of a mobile decreases, its price increases.***
* ***Mobiles having the maximum screen height and width fall in the very high price category. We can see in the line charts of sc_w and sc_h that from class 2 onwards screen width and height start increasing with price. The case is similar for px_height and px_width: as the resolution of the screen increases, the price also increases.***
* ***RAM has a clear relationship with price range; we saw that in the correlation matrix also.***

#**<font color = red > Mobiles with both 3G and 4G.**
"""

# Mobiles that have both 3G and 4G.
df_3g_4g=df[(df['three_g']==1) & (df['four_g']==1)]

df_3g_4g['price_range'].value_counts()

plt.figure(figsize=(10,5))
# The column is passed as `x=`: recent seaborn releases accept only `data`
# positionally, so a bare Series would not be read as the variable to count.
sns.countplot(x=df_3g_4g['price_range'])
plt.xticks(ticks=[0,1,2,3],labels=['Low cost','medium cost','high cost','very high cost'])
plt.title("Mobiles with 3G and 4G features ")
plt.show()

"""<font color = green >* As we can see from low cost to very high cost mobiles have both features.

#**<font color = red >Mobiles with 3G**
"""

# Mobiles that have 3G but not 4G.
df_3g=df[(df['three_g']==1) & (df['four_g']!=1)]

# Preview only; displaying the whole frame floods the notebook output.
df_3g.head()

plt.figure(figsize=(10,5))
# The column is passed as `x=`: recent seaborn releases accept only `data`
# positionally, so a bare Series would not be read as the variable to count.
sns.countplot(x=df_3g['price_range'])
plt.xticks(ticks=[0,1,2,3],labels=['Low cost','medium cost','high cost','very high cost'])
plt.title("Mobiles with only 3G  features ")
plt.show()

"""<font color = red >Observations:

<font color = green >* Mobiles which has very high cost are very less likely to have 3G.

<font color = green >* There are more chances that high cost mobile will have 3G in it.

# <font color = red >Mobiles having neither 3G nor 4G.
"""

# Mobiles that have neither 3G nor 4G.
No_3g_4G_df=df[(df['three_g']!=1) & (df['four_g']!=1)]

No_3g_4G_df.head(3)

plt.figure(figsize=(10,5))
# The column is passed as `x=`: recent seaborn releases accept only `data`
# positionally, so a bare Series would not be read as the variable to count.
sns.countplot(x=No_3g_4G_df['price_range'])
plt.xticks(ticks=[0,1,2,3],labels=['Low cost','medium cost','high cost','very high cost'])
plt.title("Mobiles not having 3G and 4G features ")
plt.show()

"""<font color = green >* Its very obvious that low cost mobiles will not have 3G and 4G.

<font color = green >* Mobiles with very high cost may have 5G instead, as technology keeps changing.
"""

# n_cores v/s price range: number of phones per price range, split by core count.
plt.figure(figsize=(30,10))
# `x=` is given by keyword: recent seaborn releases accept only `data`
# positionally, so a bare Series would not be read as the variable to count.
sns.countplot(x=df['price_range'],hue=df['n_cores'])
plt.title("n_cores v/s Price range")
plt.legend(loc='best')

"""<font color = green >Observations:
* Price range 0 has majority of phones with 2 core processors
* Price range 1 has majority of phones with 1 and 4 core processors
* Price range 2 has majority of phones with 4 core processors
* Price range 3 has majority of phones with 5 and 7 core processors
"""

# Number of phones per price range, split by 4G support.
plt.figure(figsize=(10, 5))
# `x=` is given by keyword: recent seaborn releases accept only `data`
# positionally, so a bare Series would not be read as the variable to count.
sns.countplot(x=df['price_range'],hue=df['four_g'])
plt.title("Mobiles with 4G features ")
plt.legend(loc='best')

"""<font color = blue >Majority of phones of only price range 2 dont have 4G service."""

# Number of phones per price range, split by bluetooth support.
plt.figure(figsize=(10, 5))
# `x=` is given by keyword: recent seaborn releases accept only `data`
# positionally, so a bare Series would not be read as the variable to count.
sns.countplot(x=df['price_range'],hue=df['blue'])
plt.title("Mobiles with bluetooth features ")
plt.legend(loc='best')

"""<font color = blue > Majority of phones of price range from 0 to 2 dont have bluetooth on other hand price range of 3 have bluetooth service.

# <b> <font color = red  >Let's Check the distribution of numerical columns and Outliers.
"""

df.columns

# numeric col list
numeric_col=['battery_power','clock_speed','fc','int_memory','m_dep','mobile_wt','n_cores','pc','px_height',
             'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time',]

# For every numeric column: a box plot (left) and its distribution (right).
for var in numeric_col:
    plt.figure(figsize=(15,6))
    plt.subplot(1, 2, 1)
    ax = sns.boxplot(y=df[var],color='lightgreen')
    ax.set_title('')
    ax.set_ylabel(var)
    plt.subplot(1, 2, 2)
    # sns.distplot is deprecated; histplot with a KDE overlay on a density
    # scale is the replacement for its default histogram + KDE view.
    ax = sns.histplot(x=df[var], kde=True, stat='density', color='lightgreen')
    ax.set_xlabel(var)
    plt.show()

"""* Data is well distributed.
* fc and px_height have some outliers.

#**<font color = red >Outlier Treatment**
"""

# Outlier treatment: keep only rows at or below the 99.1th percentile of `fc`
# and of `px_height`. The cut-off is a high percentile, not the third quartile,
# and no IQR rule is applied.
# The columns are filtered one after the other, so the px_height cut-off is
# computed on the rows that survived the fc filter.
OUTLIER_QUANTILE = 0.991
for outlier_col in ['fc', 'px_height']:
    upper_cap = df[outlier_col].quantile(OUTLIER_QUANTILE)
    df = df[df[outlier_col] <= upper_cap]

# Visualising whether the outliers are removed or not: box plot (left) and
# distribution (right) of the two treated columns.
for var in ['fc','px_height']:
    plt.figure(figsize=(15,6))
    plt.subplot(1, 2, 1)
    ax = sns.boxplot(y=df[var],color='lightblue')
    ax.set_title('')
    ax.set_ylabel(var)
    plt.subplot(1, 2, 2)
    # sns.distplot is deprecated; histplot with a KDE overlay on a density
    # scale is the replacement for its default histogram + KDE view.
    ax = sns.histplot(x=df[var], kde=True, stat='density', color='lightblue')
    ax.set_xlabel(var)
    plt.show()

"""Thus we can see in box plot no oultiers are present."""

df.shape

"""
#**<font color = red >SO we have handled Outliers.**"""

# create copy of mobile_data
df2=df.copy()

"""#**<font color = red >Feature Selection**"""

# Preview the data after outlier removal.
df.head()

# Correlation heatmap recomputed on the current (cleaned) frame.
corr = df.corr()
fig, ax = plt.subplots(figsize=(25, 10))
sns.heatmap(corr, annot=True, cmap=plt.cm.Accent_r, ax=ax)

# Separating X variables (independent variables) and y (dependent variable).
X=df.drop('price_range',axis=1)
y=df["price_range"]

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Number of features to keep.
N_BEST_FEATURES = 12

# Score every feature against the target with the chi-squared statistic.
# NOTE(review): the scores are computed on the full dataset, before the
# train/test split -- consider fitting the selector on the training split only.
bestfeatures = SelectKBest(score_func=chi2, k=N_BEST_FEATURES)
fit = bestfeatures.fit(X,y)

# One row per feature: its name ('Specs') and its chi-squared score ('Score').
featureScores = pd.DataFrame({'Specs': X.columns, 'Score': fit.scores_})

# Check dataframe
featureScores

# Features with the highest chi-squared statistic, best first.
top_features = featureScores.nlargest(N_BEST_FEATURES,'Score')
print(top_features)

# Use exactly those features as independent variables. The list is derived from
# the scores above, so it cannot drift out of sync with them the way a
# hand-typed column list can.
X=df[top_features['Specs'].tolist()]

# dependent variable
y=df['price_range']

"""#**<font color = red >Predictive Modeling:**
<font color = blue > Algorithms used for predictive modeling:
* 1) Decision Tree
* 2) Random Forest classifier
* 3) Gradient Boosting Classifier
* 4) K-nearest Neighbour classifier
* 5) XG Boost Classifier 
* 6) Support Vector Machine(SVM)

<font color = green >**Decision tree, random forest and other tree-based ensembles do not require feature scaling, so for them we will use X_train and X_test, which are not scaled.**

<font color = green > **For K-nearest neighbours and SVM we will use X_train_scaled and X_test_scaled, i.e. the standardised (scaled) data,
as these are distance-based algorithms.**
"""

# Hold out 25% of the rows as the test set; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
# Shapes of the four resulting pieces, in the order returned above.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

# Standardise the features with StandardScaler. The scaler is fitted on the
# training split only and then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled

X_test_scaled

# Defining a function for plotting the one-vs-rest ROC curves
def plot_Auc_roc(y_test,pred_prob):
  '''Plot one-vs-rest ROC curves for the four price-range classes.

  y_test    : true class labels; the classes are referenced as 0, 1, 2, 3.
  pred_prob : predicted probabilities indexed as pred_prob[:, i]; column i
              is used as the score for class i.

  The figure is also saved through plt.savefig('Multiclass ROC', dpi=300).
  Nothing is returned.
  '''
  # (display name, line colour) per class, in class-id order.
  class_styles = [
    ('Low cost', 'orange'),
    ('Medium cost', 'green'),
    ('High cost', 'blue'),
    ('Very High cost', 'red'),
  ]

  # Compute every curve before any drawing starts; the thresholds returned by
  # roc_curve are not needed.
  curves = []
  for class_id in range(len(class_styles)):
    fpr, tpr, _ = roc_curve(y_test, pred_prob[:,class_id], pos_label=class_id)
    curves.append((fpr, tpr))

  # plotting
  plt.figure(figsize=(12,8))
  for class_id, ((class_name, colour), (fpr, tpr)) in enumerate(zip(class_styles, curves)):
    plt.plot(fpr, tpr, linestyle='--', color=colour,
             label=f'Class {class_id}({class_name}) vs Rest, AUC= {round(auc(fpr,tpr),4)}')
  plt.title('Multiclass ROC curve')
  plt.xlabel('False Positive Rate')
  plt.ylabel('True Positive rate')
  plt.legend(loc='best')
  plt.savefig('Multiclass ROC',dpi=300);

# creating a class list
Class_cat = ['low cost','medium cost', 'high cost', 'very high cost']

"""# <font color=red> <b> 1) Decision Tree Classifier:</b>

**<font color=green>Decision trees and ensemble methods do not require feature scaling to be performed as they are not sensitive to the variance in the data.**
**So here we will use X_train,X_test,y_test and Y_train which are not scaled.**

<font color=blue>***Decision Tree with default hyperparameters:***
"""

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\Focus\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Focus\AppData\Local\Temp/ipykernel_864/4106654858.py", line 48, in <module>
    import matplotlib.pyplot as plt
  File "C:\Users\Focus\anaconda3\lib\site-packages\matplotlib\pyplot.py", line 36, in <module>
    import matplotlib.colorbar
  File "C:\Users\Focus\anaconda3\lib\site-packages\matplotlib\colorbar.py", line 38, in <module>
    from matplotlib import _api, collections, cm, colors, contour, ticker
  File "C:\Users\Focus\anaconda3\lib\site-packages\matplotlib\contour.py", line 17, in <module>
    import matplotlib.font_manager as font_manager
  File "C:\Users\Focus\anaconda3\lib\site-packages\matplotlib\font_manager.py", line 1447, in <module>
    fontManager = _load_fontmanager()
  File "C:\Users\Focus\anaconda3\lib\site-packages\matplotlib\font_manager.py", line 144

TypeError: object of type 'NoneType' has no len()