In [None]:
# List every file that is available under the Kaggle input directory

import os

input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Topics
1. Importing Libraries
2. Loading and data Understanding
3. Analysis & Visualization of the data
    - What is the average battery power for each price class of mobile?
    - Do all mobile phone which have Wifi have bluetooth?
    - Is the weight less for expensive Phones?
    - Influence of RAM on price range
    - Does having more cores/price influence battery duration?
    - Outlier Analysis  
4. Train-Test split
5. Scaling
6. Model Building
    - Logistic Regression
    - Random Forest
    - Gaussian NB
    - SVM
    - Gradient Boosting
7. Summary

# Importing Libraries


In [None]:
pip install sweetviz

In [None]:
#Libraries

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import pandas_profiling as pandas_pf
import sweetviz as sv

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

import eli5
from eli5.sklearn import PermutationImportance

import warnings
warnings.filterwarnings('ignore')

# Loading and data understanding

In [None]:
# Load the train and test CSV files from the Kaggle input directory

train_df, test_df = map(
    pd.read_csv,
    (
        '/kaggle/input/mobile-price-classification/train.csv',
        '/kaggle/input/mobile-price-classification/test.csv',
    ),
)

# Preview the first rows of the training frame
train_df.head()

In [None]:
# Print the info() summary of both frames, separated by a dashed rule
for position, frame in enumerate((train_df, test_df)):
    if position:
        print('-' * 50)
    frame.info()

**Inference**
- All the dtypes look in the correct format
- We can note that there is an `id` column in the test dataframe which is absent in the train dataframe
- The target variable `price_range` takes the values 0 (low cost), 1 (medium cost), 2 (high cost) and 3 (very high cost).

**Assumption**
- As far as we have seen, most people look into
    - Battery power
    - DualSim (Maybe)
    - Internal Memory
    - Camera Pixel
    - Number of cores
    - RAM
    <br>
   <br>
 Based on the above-mentioned factors, we are going to see whether there is any strong correlation with the price range, or whether there are other factors that also influence it
    

In [None]:
# Drop the 'id' column from the test data set: it is not present in the train
# data, so it cannot be used as a feature.
# The result is re-assigned (instead of inplace=True) and errors='ignore' is
# passed so that re-running this cell does not raise a KeyError once the
# column has already been removed.

test_df = test_df.drop(columns='id', errors='ignore')

In [None]:
# Descriptive statistics for the columns of the training frame
# (displayed as the cell output)

train_df.describe()

In [None]:
# Report the (rows, columns) shape of both frames

print(f'Size of train data:{train_df.shape}')
print(f'Size of test data:{test_df.shape}')

# Analysis & Visualization of the data

In [None]:
# Percentage of missing values per column, rounded to 2 decimals.
# isnull().mean() is the fraction of missing rows of each column relative to the
# length of *that* frame; multiplying by 100 turns it into a percentage.
# (Previously round() was closed too early, so values were rounded to whole
# numbers and a stray "2" was printed, and the test counts were divided by the
# length of the train frame.)

print((100 * train_df.isnull().mean()).round(2))
print('-'*50)
print((100 * test_df.isnull().mean()).round(2))

There are no missing values in the data, so no cleaning of missing values is needed

In [None]:
# Plotting a heat map to see the correlation between variables

plt.figure(figsize = (20,10))

# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps this cell working when it is re-run after the string column
# 'price_range_cat' has been added to train_df further down the notebook.
numeric_corr = train_df.select_dtypes(include = 'number').corr()

sns.heatmap(numeric_corr, annot = True, cmap = 'YlGnBu')

**Inference**:
- Front camera megapixels have a strong positive correlation with primary camera megapixels, as most consumers are looking for cameras with high megapixel counts
- Having 4G and having 3G are strongly related
- We can see a strong correlation of price with RAM, and it has a major influence
- Battery power also has an influence on price

*For anyone who is interested on SweetViz here are links that will be helpful:*
 
* https://pypi.org/project/sweetviz/
* https://towardsdatascience.com/sweetviz-automated-eda-in-python-a97e4cabacde
* https://towardsdatascience.com/powerful-eda-exploratory-data-analysis-in-just-two-lines-of-code-using-sweetviz-6c943d32f34

### SweetViz

In [None]:
# Using SweetViz for EDA: build an analysis report object from the training frame

report = sv.analyze(train_df)

In [None]:
# Alternative (disabled): write the report to an HTML file instead of rendering it inline

# report.show_html('Report.html')

# Displaying the report inside the notebook
report.show_notebook()

In [None]:
# Comparing train and test data; each [frame, label] pair supplies the display
# name used for that data set in the report

compare_report = sv.compare([train_df, 'Train_Data'], [test_df, 'Test_Data'])

In [None]:
# Displaying the train-vs-test comparison report inside the notebook
compare_report.show_notebook()

**Inference**
- For most of the binary (two-valued) features, the counts are split roughly 50-50
- Battery power has a positive correlation of 0.21 with price range
- ~33% of the phones have a clock speed in the range 0.5GHz - 1.0GHz (since the unit is not mentioned, we will assume it to be GHz)
- ~36% of phones have a front camera of less than 2MP
- Only 24% of phones don't have 3G
<br><br>
Let's explore some more sections in detail

In [None]:
# Add a human-readable price-class column next to the numeric price_range so
# that the EDA plots are easier to interpret

price_map = dict(enumerate(['Low Cost', 'Medium Cost', 'High Cost', 'Expensive']))

train_df['price_range_cat'] = train_df['price_range'].map(price_map)

### What is the average battery power for each price class of mobile?

In [None]:
# Bar plot of the mean battery power for every price class

sns.barplot(
    data = train_df,
    x = 'price_range_cat',
    y = 'battery_power',
    palette = 'Set1',
    order = ['Low Cost', 'Medium Cost', 'High Cost', 'Expensive'],
)

# Title and axis labels
plt.title('Price Distribution for different Battery power ', fontsize = 15)
plt.xlabel('Price Class', fontsize = 15)
plt.ylabel('Average Battrty Power(mAh)', fontsize = 15)

# Tick label size
plt.xticks(fontsize = 10)
plt.yticks(fontsize = 10)

plt.show()

- As expected, expensive mobiles have the highest average battery power, i.e. 1380 mAh
- Medium and high cost mobiles are in the same range
- Low cost phones have the lowest average battery power, i.e. 1117 mAh

In [None]:
# Mean battery power per price class
train_df.groupby('price_range_cat')['battery_power'].mean()

The black line on each bar is the error bar: a graphical representation of the variability of the data, used on graphs to indicate the error or uncertainty in a reported measurement.
- https://stackoverflow.com/questions/58362473/what-does-black-lines-on-a-seaborn-barplot-mean#:~:text=This%20is%20the%20error%20bar,Standard%20Deviation%20or%20STD%20line.&text=Size%20of%20confidence%20intervals%20to%20draw%20around%20estimated%20values.&text=If%20None%2C%20no%20bootstrapping%20will,bars%20will%20not%20be%20drawn.

### Do all mobile phone which have Wifi have bluetooth?

In [None]:
# Cross-tabulation of Bluetooth (rows) against WiFi (columns), normalised over
# the whole table, with a 'Total' margin; shown as percentages

wifi_bt_share = pd.crosstab(
    index = train_df['blue'],
    columns = train_df['wifi'],
    rownames = ['Bluetooth'],
    normalize = True,
    margins = True,
    margins_name = 'Total',
)
wifi_bt_share.round(3) * 100

In [None]:
# Keep only the phones that have neither WiFi nor Bluetooth, to inspect which
# price classes they belong to

has_no_wifi = train_df['wifi'].eq(0)
has_no_bluetooth = train_df['blue'].eq(0)

sub_df = train_df[has_no_wifi & has_no_bluetooth]
sub_df.head()

In [None]:
# Number of phones without WiFi and Bluetooth in each price class

sub_df['price_range_cat'].value_counts()

In [None]:
# Horizontal bar chart: phones without Bluetooth and WiFi, counted per price class

no_wireless_counts = sub_df['price_range_cat'].value_counts()
no_wireless_counts.plot.barh()

plt.title("Phone's not having Bluetooth and WiFi")
plt.ylabel('Category of Phone', fontsize = 15)
plt.xlabel('Total Count', fontsize = 15)
plt.show()

**Inference:**
- We couldn't identify any specific pattern; in the crosstab each combination accounts for ~25% of the phones
- It was a bit strange to note that there are 116 expensive phones that have neither Bluetooth nor WiFi

### Is the weight less for expensive Phones?

In [None]:
# Bar plot of the mean mobile weight for every price class

sns.barplot(
    data = train_df,
    x = 'price_range_cat',
    y = 'mobile_wt',
    palette = 'Set1',
    order = ['Low Cost', 'Medium Cost', 'High Cost', 'Expensive'],
)

# Title and axis labels
plt.title('Avg Mobile Weight across different phone category')
plt.ylabel('Avg Mobile Weight', fontsize = 15)
plt.xlabel('Phone Class', fontsize = 15)

# Tick label size
plt.yticks(fontsize = 10)
plt.xticks(fontsize = 10)

plt.show()

In [None]:
# Mean mobile weight per price class

train_df.groupby('price_range_cat')['mobile_wt'].mean()

**Inference**:
It can be noted that expensive mobile phones weigh slightly less

### Influence of RAM on price range

In [None]:
# One RAM histogram per price_range value, drawn side by side

hist_style = dict(bins = 20, alpha = 0.5, color = 'g', edgecolor = "black")

g = sns.FacetGrid(data = train_df, col = 'price_range', height = 2.5, aspect = 1)
g.map(plt.hist, 'ram', **hist_style)
plt.show()

**Inference**:<br>
RAM Ranges for different phone range
- Low Cost: 0-2000 MB
- Medium Cost: 500-3000 MB
- High Cost: 1000-4000MB
- Expensive: 2000-4000MB

So Basically expensive phones have High RAM

### Does having more cores/price influence battery duration?

In [None]:
# Line plot of the average talk time against the number of cores (no error band)

sns.lineplot(data = train_df, x = 'n_cores', y = 'talk_time', ci = None)
plt.ylim(0, 20)

# Title and axis labels
plt.title('Battery Duration for Different Cores')
plt.ylabel('Battey Duration', fontsize = 15)
plt.xlabel('Number of Cores', fontsize = 15)

# Tick label size
plt.yticks(fontsize = 10)
plt.xticks(fontsize = 10)

plt.show()

In [None]:
# Mean talk time per price class

train_df.groupby('price_range_cat')['talk_time'].mean()

**Inference**:
- The number of cores has no significant effect on battery duration, contrary to what we suspected
- On average, the battery duration is 11 hrs
- Price does not have much effect on battery duration

In [None]:
# Dropping the price_range_cat column, as it was only used for EDA purposes.
# The result is re-assigned (instead of inplace=True) and errors='ignore' is
# passed so that re-running this cell does not raise a KeyError once the
# column has already been removed.

train_df = train_df.drop(columns = 'price_range_cat', errors = 'ignore')

In [None]:
# Outlier Analysis: one box plot per column of train_df, so that points lying
# outside the whiskers can be inspected column by column.

for col in train_df.columns:
    # A fresh figure per column so that the box plots are not drawn on top of
    # each other (the former bare `plt.figure` statement was a no-op).
    plt.figure()
    # Pass the series explicitly as `x`: the meaning of the first positional
    # argument of sns.boxplot differs between seaborn releases.
    sns.boxplot(x = train_df[col])
    plt.show()

**Inference**:
There are no outliers in the data

# Train - Test Split

__Note__:<br>
We are only going to use the train data for model building and will not consider the test data, as the test data does not have the target variable column and we cannot compare our model's predictions against the right answers


In [None]:
# Separate the features (X) from the target (y = price_range)

X, y = train_df.drop(columns = ['price_range']), train_df['price_range']

In [None]:
# 70/30 hold-out split with a fixed seed

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.3,
    train_size = 0.7,
    random_state = 0,
)

In [None]:
# Remember the column labels of both splits; they are needed to rebuild
# DataFrames after scaling
X_train_col, X_test_col = X_train.columns, X_test.columns

In [None]:
# Convert the training features and target to NumPy arrays

X_train = np.array(X_train)
y_train = np.array(y_train)

# Scaling

In [None]:
# Using Standard Scaler to scale the features.
# The scaler is fitted on the training split ONLY; the test split is then
# transformed with the training mean / standard deviation. Re-fitting on the
# test split (fit_transform) would scale the two splits inconsistently and
# leak test-set statistics into the evaluation.

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
# X_train was converted to an array before fitting, so hand the test split over
# as an array as well to keep the input types consistent.
X_test = scaler.transform(np.asarray(X_test))

In [None]:
# Wrap the scaled arrays back into DataFrames, restoring the stored column labels

X_train = pd.DataFrame(data = X_train, columns = X_train_col)
X_test = pd.DataFrame(data = X_test, columns = X_test_col)

# Model Building

__Note__:<br>
Here we will use accuracy as the metric of comparison rather than precision/recall because:
 - The target class is almost balanced
 - We are not specifically focusing on one category of phone but instead looking at all of them; if we wanted to target one specific category, we could focus on it by using recall/precision and adjusting accordingly

## Logistic Regression

In [None]:
#define model and model parameters
# The default 'lbfgs' solver only supports the l2 penalty, so every 'l1'
# candidate of the grid below would fail to fit and half of the search would be
# wasted. 'saga' supports both the l1 and the l2 penalty; max_iter is raised
# because saga may need more iterations than the default to converge.
model = LogisticRegression(solver = 'saga', max_iter = 5000)
penalty = ['l2','l1']
c_values = [100, 10, 1.0, 0.1, 0.01]

#define grid: every combination of regularisation strength C and penalty type
grid = dict(C = c_values, penalty = penalty)
cv = KFold(n_splits = 5)

# Exhaustive 5-fold cross-validated search over the grid, using all CPU cores
grid_search = GridSearchCV(estimator = model, param_grid = grid, cv = cv, n_jobs = -1)
grid_result = grid_search.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score and the parameters that achieved it

print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")

In [None]:
# Fit a logistic regression with the chosen parameters and predict the hold-out set

lr = LogisticRegression(penalty = 'l2', C = 100)
lr.fit(X_train, y_train)

lr_predict = lr.predict(X_test)

In [None]:
# Hold-out accuracy and per-class report for Logistic Regression

lr_accuracy = accuracy_score(y_test, lr_predict)
print(f'Accuracy of the Logistic Regression is {round(lr_accuracy, 2)}')

# Rule, blank lines, header and rule, emitted in a single call
print('-' * 50, '\n', 'Model Report:', '-' * 50, sep = '\n')
print(classification_report(y_test, lr_predict))

## Random Forest

In [None]:
#define model and model parameters

# random_state is fixed so that the forests built during the search - and hence
# the reported best score / best parameters - are reproducible between runs.
model = RandomForestClassifier(class_weight='balanced', random_state = 0)

max_depth = [4,8,10]
criterion = ['gini','entropy']
min_samples_split = [2,3,5,7,9]
max_features = [5, 10]

#define grid: every combination of the candidate values above
grid = dict(max_depth = max_depth, criterion = criterion, min_samples_split = min_samples_split, max_features = max_features)
cv = KFold(n_splits = 5)

# Exhaustive 5-fold cross-validated search over the grid, using all CPU cores
grid_search = GridSearchCV(estimator = model, param_grid = grid, cv = cv,n_jobs = -1)
grid_result = grid_search.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score and the parameters that achieved it

print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")

In [None]:
# Creating model with the defined parameters.
# class_weight='balanced' is kept so the final model matches the configuration
# that was tuned in the grid search, and random_state is fixed so that the
# fitted forest and its accuracy are reproducible between runs.

rfc = RandomForestClassifier(criterion = 'entropy', max_depth = 10, max_features = 10, min_samples_split = 5,
                             class_weight = 'balanced', random_state = 0)
rfc.fit(X_train, y_train)
rfc_predict = rfc.predict(X_test)

In [None]:
# Hold-out accuracy and per-class report for Random Forest

rfc_accuracy = accuracy_score(y_test, rfc_predict)
print(f'Accuracy of the Random Forest is {round(rfc_accuracy, 3)}')

# Rule, blank lines, header and rule, emitted in a single call
print('-' * 50, '\n', 'Model Report:', '-' * 50, sep = '\n')
print(classification_report(y_test, rfc_predict))

## Gaussian NB

In [None]:
# Model and candidate values: 100 log-spaced var_smoothing values from 1 down to 1e-9

model = GaussianNB()
var_smoothing = np.logspace(start = 0, stop = -9, num = 100)

# Search grid and 5-fold splitter
grid = {'var_smoothing': var_smoothing}
cv = KFold(n_splits = 5)

# Exhaustive cross-validated search, using all CPU cores
grid_search = GridSearchCV(estimator = model, param_grid = grid, cv = cv, n_jobs = -1)
grid_result = grid_search.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score and the parameters that achieved it

print(f"Best {grid_result.best_score_:f} using {grid_result.best_params_}")

In [None]:
# Fit a Gaussian naive Bayes model with the chosen smoothing and predict the hold-out set

gnb = GaussianNB(var_smoothing = 0.23)
gnb.fit(X_train, y_train)

gnb_predict = gnb.predict(X_test)

In [None]:
# Hold-out accuracy and per-class report for Gaussian naive Bayes

gnb_accuracy = accuracy_score(y_test, gnb_predict)
print(f'Accuracy of the GaussainNB is {round(gnb_accuracy, 3)}')

# Rule, blank lines, header and rule, emitted in a single call
print('-' * 50, '\n', 'Model Report:', '-' * 50, sep = '\n')
print(classification_report(y_test, gnb_predict))

## SVM

In [None]:
# RBF-kernel SVM and the candidate regularisation strengths
model = SVC(kernel = 'rbf')
C = [50, 10, 1.0, 0.1, 0.01]

# Search grid and 5-fold splitter
grid = {'C': C}
cv = KFold(n_splits = 5)

# Exhaustive cross-validated search, using all CPU cores
grid_search = GridSearchCV(estimator = model, param_grid = grid, n_jobs = -1, cv = cv)
grid_result = grid_search.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score and the parameters that achieved it

print(f"Best {grid_result.best_score_:f} using {grid_result.best_params_}")

In [None]:
# Fit an RBF-kernel SVM with the chosen C and predict the hold-out set

svc = SVC(C = 1, kernel = 'rbf')
svc.fit(X_train, y_train)

svc_predict = svc.predict(X_test)

In [None]:
# Hold-out accuracy and per-class report for the SVM

svc_accuracy = accuracy_score(y_test, svc_predict)
print(f'Accuracy of the SVC is {round(svc_accuracy, 3)}')

# Rule, blank lines, header and rule, emitted in a single call
print('-' * 50, '\n', 'Model Report:', '-' * 50, sep = '\n')
print(classification_report(y_test, svc_predict))

## Stochastic Gradient Boosting

In [None]:
# define model and parameters

# random_state is fixed because boosting with subsample < 1.0 draws a random
# subset of rows for every tree; without a seed the search result (best score /
# best parameters) changes from run to run.
model = GradientBoostingClassifier(random_state = 0)
learning_rate = [0.001, 0.01, 0.1]
subsample = [0.5, 0.7, 1.0]
cv = KFold(n_splits = 5)

#Define Grid: every combination of learning rate and row subsample fraction
grid = dict(learning_rate=learning_rate, subsample=subsample)
# Exhaustive 5-fold cross-validated search over the grid, using all CPU cores
grid_search = GridSearchCV(estimator = model, param_grid = grid, cv=cv, n_jobs = -1)
grid_result = grid_search.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score and the parameters that achieved it

print(f"Best {grid_result.best_score_:f} using {grid_result.best_params_}")

In [None]:
# Creating model with the defined parameters.
# random_state is fixed because subsample = 0.7 makes every boosting stage draw
# a random subset of rows; without a seed the reported accuracy is not
# reproducible between runs.

sgb = GradientBoostingClassifier(learning_rate = 0.1, subsample = 0.7, random_state = 0)
sgb.fit(X_train, y_train)
sgb_predict = sgb.predict(X_test)

In [None]:
# Hold-out accuracy and per-class report for Gradient Boosting

sgb_accuracy = accuracy_score(y_test, sgb_predict)
print(f'Accuracy of the Gradient Boosting is {round(sgb_accuracy, 3)}')

# Rule, blank lines, header and rule, emitted in a single call
print('-' * 50, '\n', 'Model Report:', '-' * 50, sep = '\n')
print(classification_report(y_test, sgb_predict))

# Summary

In [None]:
# Summarise the hold-out accuracy of every model: rounded to 2 decimals, sorted
# from best to worst, with a fresh 0..n-1 index

model_names = ['Logistic Regression', 'Random Forest', 'GaussianNB', 'SVM', 'Gradient Boosting']
model_scores = [lr_accuracy, rfc_accuracy, gnb_accuracy, svc_accuracy, sgb_accuracy]

model_summary = (
    pd.DataFrame({'ML Models': model_names, 'Accuracy': model_scores})
    .assign(Accuracy = lambda frame: frame['Accuracy'].round(2))
    .sort_values(['Accuracy'], ascending = False)
    .reset_index(drop = True)
)
model_summary

We can see that Logistic Regression has the highest accuracy. Based on this, let's explore which attributes contribute the most

In [None]:
# Creating a permutation-importance wrapper around the fitted logistic
# regression (lr) with a fixed seed, then fitting it.
# NOTE(review): the importances are computed on the training split here;
# consider whether the held-out split (X_test, y_test) is the intended data.

prem = PermutationImportance(estimator = lr, random_state = 42)
prem.fit(X_train, y_train)

In [None]:
# Finding out the top KPIs: display the permutation importances, labelled with
# the feature column names

eli5.show_weights(estimator=prem, feature_names = X_test.columns.tolist())

**Inference**:
Top 3 KPI's are:
1. RAM
2. Battery Power
3. Pixel Height
<br>
That influence the price class of the phone

<h1>Upvote if you like my work❤️<br>
If you have any queries, doubts or suggestions, feel free to drop them in the comment section</h1>