Loading a few of the required libraries for our Exploratory Data Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from scipy import stats
from scipy.stats import norm, skew, kurtosis
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
import pylab as p
import warnings
warnings.filterwarnings('ignore')
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

Load the train and test datasets and get their head()...

In [None]:
# Read the training and test CSV files from the Kaggle input folder into DataFrames.
train_data=pd.read_csv('/kaggle/input/mobile-price-classification/train.csv')
test_data=pd.read_csv('/kaggle/input/mobile-price-classification/test.csv')

In [None]:
# Preview the first rows of the test set.
test_data.head()

In [None]:
# Preview the first rows of the training set.
train_data.head()

In [None]:
# Column names of the test set.
test_data.columns

In [None]:
# Column names of the training set.
train_data.columns

Looking carefully at the train and test datasets, I can see that the train data has a price_range column, whereas the test data has an id column instead.


Getting the dataset sizes

In [None]:
# Report the (rows, columns) shape of both datasets on one line.
print(f'test_data : {test_data.shape}, train_data :{train_data.shape}')

Get more information about the datasets

In [None]:
# Dtypes, non-null counts and memory usage of the test set.
test_data.info()

In [None]:
# Dtypes, non-null counts and memory usage of the training set.
train_data.info()

From the information above, I can see that all the features have numerical values, i.e. either floats or integers.



Data descriptions

In [None]:
# Summary statistics of the test set columns.
test_data.describe()

In [None]:
# Summary statistics of the training set columns.
train_data.describe()

In [None]:
# Number of training rows in each price_range class.
train_data['price_range'].value_counts()

In [None]:
# Pie chart of how the training rows are split across the price_range classes.
size = train_data['price_range'].value_counts()
plt.figure(figsize=(8,6))
# The 'seaborn-paper' style was renamed in newer matplotlib releases; use whichever name is available.
paper_style = 'seaborn-v0_8-paper' if 'seaborn-v0_8-paper' in plt.style.available else 'seaborn-paper'
plt.style.use(paper_style)
# Take the labels from the value_counts() index so every slice is labelled with the
# class it actually represents, whatever order the counts come back in.
plt.pie(size, labels=size.index.tolist(), shadow=True, autopct='%1.1f%%', colors=['cyan','darkred', 'darkgreen', 'darkblue'])
plt.title('A pie chart showing price range distributions among the data', fontsize=14, color='purple')
plt.show()

In [None]:
# Kurtosis and skewness of the target column, computed with scipy.stats.
target = train_data['price_range']
print(f'Kurtosis : {kurtosis(target)}')
print(f'Skew : {skew(target)}')

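From the above information we can tell that our price_range data is platykurtic (negative excess kurtosis), hence it has thinner tails than a normal distribution, and a skewness of zero shows that price_range is symmetrically distributed. Note that zero skew on its own does not imply a normal distribution: price_range is a discrete class label, and these statistics are what a balanced (roughly uniform) spread across the classes would give.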

In [None]:
# Pie chart of the share of phones with and without 3G support.
size = train_data['three_g'].value_counts()
plt.figure(figsize=(8,6))
# The 'seaborn-paper' style was renamed in newer matplotlib releases; use whichever name is available.
paper_style = 'seaborn-v0_8-paper' if 'seaborn-v0_8-paper' in plt.style.available else 'seaborn-paper'
plt.style.use(paper_style)
# value_counts() sorts by frequency, not by value, so a fixed [0, 1] label list can
# swap the labels; read them from the index instead.
plt.pie(size, labels=size.index.tolist(), shadow=True, autopct='%1.1f%%', colors=['y', 'white'])
plt.title('A pie chart showing three_g distributions among the data', fontsize=14, color='purple')
plt.show()

In [None]:
# Pie chart of the share of phones with and without 4G support.
size = train_data['four_g'].value_counts()
plt.figure(figsize=(8,6))
# The 'seaborn-paper' style was renamed in newer matplotlib releases; use whichever name is available.
paper_style = 'seaborn-v0_8-paper' if 'seaborn-v0_8-paper' in plt.style.available else 'seaborn-paper'
plt.style.use(paper_style)
# value_counts() sorts by frequency, not by value, so a fixed [0, 1] label list can
# swap the labels; read them from the index instead.
plt.pie(size, labels=size.index.tolist(), shadow=True, autopct='%1.1f%%', colors=['cyan', 'green'])
plt.title('A pie chart showing four_g distributions among the data', fontsize=14)
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all training columns.
fig, ax = plt.subplots(figsize=(18, 18))
correlation = train_data.corr()
# NOTE(review): `color` is not a documented heatmap argument; it is forwarded to the
# underlying mesh call — confirm it has the intended effect.
sns.heatmap(
    correlation,
    square=True,
    annot=True,
    vmax=0.9,
    color='b',
    ax=ax,
)

The heatmap above shows the relationship between the different features in the data. As you can see, px_width & px_height and sc_w & sc_h are very highly related.

In [None]:
# Swarm plot of ram for each price_range class.
sns.catplot(x='price_range', y='ram', kind='swarm', data=train_data)

In [None]:
# Box plot of ram for each price_range class.
sns.catplot(x='price_range', y='ram', kind='box', data=train_data)



From the plot above we can see that there's a very strong relationship between the price range and the ram: as the ram increases, the price range also goes up. This means the ram is one of the most important features to look at while modeling to get the mobile price predictions.

In [None]:
# Box plot of mobile_wt for each price_range class.
# catplot is a figure-level function that creates its own figure, so a preceding
# plt.figure() only produces an empty canvas; request the 10x8 size via height/aspect.
sns.catplot(x='price_range', y='mobile_wt', kind='box', data=train_data, height=8, aspect=1.25)
plt.title('Distributions between price_range with respect to mobile_wt', color='darkgreen', fontsize=13)

In [None]:
# Box plot of px_height for each price_range class.
# catplot is a figure-level function that creates its own figure, so a preceding
# plt.figure() only produces an empty canvas; request the 10x8 size via height/aspect.
sns.catplot(x='price_range', y='px_height', kind='box', data=train_data, height=8, aspect=1.25)
plt.title('Distributions between price_range with respect to px_height', color='darkgreen', fontsize=13)

In [None]:
# Box plot of px_width for each price_range class.
# catplot is a figure-level function that creates its own figure, so a preceding
# plt.figure() only produces an empty canvas; request the 10x8 size via height/aspect.
sns.catplot(x='price_range', y='px_width', kind='box', data=train_data, height=8, aspect=1.25)
plt.title('Distributions between price_range with respect to px_width', color='green', fontsize=13)

In [None]:
# Box plot of battery_power for each price_range class, split by the `blue` column.
# catplot is a figure-level function that creates its own figure, so a preceding
# plt.figure() only produces an empty canvas; request the 10x8 size via height/aspect.
sns.catplot(x='price_range', y='battery_power', kind='box', hue='blue', data=train_data, height=8, aspect=1.25)
plt.title('Distributions between price_range and bluetooth with respect to battery_power', color='darkred', fontsize=13)

In [None]:
# Set up Plotly (via cufflinks) in offline notebook mode to try some interactive graphs.
import plotly.offline as pyo
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot, plot
import cufflinks as cf
# Switch cufflinks to offline mode and initialise Plotly's notebook rendering.
cf.go_offline()
pyo.init_notebook_mode()
# Show the installed Plotly version.
print(__version__)

In [None]:
# Interactive scatter plot of sc_h against sc_w.
train_data.iplot(kind='scatter', x='sc_w', y='sc_h', mode='markers', colors='black',size=10)

In [None]:
# Interactive scatter plot of px_height against px_width.
train_data.iplot(kind='scatter', x='px_width', y='px_height', mode='markers', size=8)

From the above scatter plots we can see that, for each pair, we shall have to drop one of the features for our modeling since they are highly correlated, which makes them very similar.

In [None]:
# Interactive histograms (40 bins each) of the `fc` and `pc` columns.
train_data['fc'].iplot(kind='hist', bins=40, xTitle='Mega pixels', yTitle='Frequency', colors='cyan')
train_data['pc'].iplot(kind='hist', bins=40, xTitle='Mega pixels',yTitle='Frequency', colors='darkred')

In [None]:
# Point plot of the mean price_range at each talk_time value.
# Pass both axes as column names: recent seaborn releases accept only `data`
# positionally, so a positional Series combined with `data=` raises a TypeError.
sns.pointplot(x='talk_time', y='price_range', data=train_data)
plt.title('Point plot displaying how price ranges with talk_time', fontsize=13)

In [None]:
# Combine the pixel and screen dimensions into two area features, then discard the
# raw dimension columns together with the row identifier.
test_data = test_data.assign(
    px_area=test_data['px_height'] * test_data['px_width'],
    phone_area=test_data['sc_w'] * test_data['sc_h'],
).drop(columns=['px_width', 'px_height', 'sc_w', 'sc_h', 'id'])

In [None]:
# Combine the pixel and screen dimensions into two area features, then discard the
# raw dimension columns.
train_data = train_data.assign(
    px_area=train_data['px_height'] * train_data['px_width'],
    phone_area=train_data['sc_w'] * train_data['sc_h'],
).drop(columns=['px_width', 'px_height', 'sc_w', 'sc_h'])
train_data

In [None]:
from sklearn.model_selection import train_test_split
# Separate the label column (y) from the predictor columns (X).
target_column = 'price_range'
y = train_data[target_column]
X = train_data.drop(columns=target_column)

In [None]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=20)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 700 trees, each limited to depth 8, fitted on the training split.
rnd = RandomForestClassifier(max_depth=8, n_estimators=700, random_state=0, n_jobs=-1)
rnd.fit(X_train, y_train)

In [None]:
# Mean accuracy of the random forest on the training split.
rnd.score(X_train, y_train)

In [None]:
# Mean accuracy of the random forest on the held-out split.
rnd.score(X_test, y_test)

Ideally, looking at our features, only a few of them should be important, which favours the use of Lasso Regression. But looking at our heatmap, not all features are equally important, so I guess we try both Lasso and Ridge Regression to compare their scores and see which model to use.

In [None]:
from sklearn.linear_model import Lasso
# Lasso (L1-regularised linear regression) with a small penalty, fitted on the training split.
lasso = Lasso(alpha=0.001, max_iter=1000, random_state=20)
lasso.fit(X_train, y_train)

In [None]:
# R^2 of the Lasso model on the training split (a regressor's score is R^2, not accuracy).
lasso.score(X_train, y_train)

In [None]:
# R^2 of the Lasso model on the held-out split.
lasso.score(X_test, y_test)

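Using Lasso raises many questions: as I raise the value of alpha to 1, 10, 100, 1000 and so on, I get very low scores tending to zero. This is expected, since a larger alpha penalizes the coefficients more heavily and shrinks them towards zero, leaving a model that predicts close to the mean. Let me try Ridge and see how it scores on our dataset.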

In [None]:
from sklearn.linear_model import Ridge
# Ridge (L2-regularised linear regression) fitted on the training split.
ridge = Ridge(alpha=1, max_iter=1000, random_state=20)
ridge.fit(X_train, y_train)

In [None]:
# R^2 of the Ridge model on the training split.
ridge.score(X_train, y_train)

In [None]:
# R^2 of the Ridge model on the held-out split.
ridge.score(X_test, y_test)

Ridge gives very good scores on both the train and test sets, so I believe it's one of the models we should look at.

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares linear regression fitted on the training split.
lm = LinearRegression()
lm.fit(X_train, y_train)

In [None]:
# R^2 of the linear regression on the training split.
lm.score(X_train, y_train)

In [None]:
# R^2 of the linear regression on the held-out split.
lm.score(X_test, y_test)






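Well, despite all the great scores from Lasso, Ridge and Linear Regression, we can't use any of them because our target variable is discrete and these models are best suited for continuous target variables, i.e. they will end up giving me predictions that are floats. Note also that their score is R², so it is not directly comparable to the accuracy reported by the classifiers.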

In [None]:
from lightgbm import LGBMClassifier
# Gradient-boosted tree classifier fitted on the training split.
# Note: despite its name, `reg` holds a classifier, not a regressor.
reg = LGBMClassifier(learning_rate=0.1, n_estimators=700,  max_depth=8, random_state=0, n_jobs=-1)
reg.fit(X_train, y_train)

In [None]:
# Mean accuracy of the LightGBM classifier on the training split.
reg.score(X_train, y_train)

In [None]:
# Mean accuracy of the LightGBM classifier on the held-out split.
reg.score(X_test, y_test)

Although the LGBMClassifier gives a perfect score on the training data, I think it overfits since the dataset is small. I also don't think it's the best choice for my prediction, since there is a larger gap between its train and test scores than for the other algorithms.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Logistic regression fitted on the training split.
# The predictors sit on very different scales (0/1 flags next to the px_area product),
# which keeps the solver from converging within 100 iterations; the resulting warning
# is hidden by the notebook-wide warnings filter. Standardise the features inside a
# pipeline and allow more iterations so the fitted model is a converged one.
# `mod` still exposes fit/predict/score exactly as before.
mod = make_pipeline(
    StandardScaler(),
    LogisticRegression(C=0.1, random_state=0, n_jobs=-1, max_iter=1000),
)
mod.fit(X_train, y_train)

In [None]:
# Mean accuracy of the logistic regression on the training split.
mod.score(X_train, y_train)

In [None]:
# Mean accuracy of the logistic regression on the held-out split.
mod.score(X_test, y_test)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Depth-5 decision tree using the entropy split criterion, fitted on the training split.
trees = DecisionTreeClassifier(random_state=20, max_depth=5, criterion='entropy')
# fit() returns the estimator itself, so `model` and `trees` refer to the same object.
model = trees.fit(X_train, y_train)
model

In [None]:
# Mean accuracy of the decision tree on the training split.
trees.score(X_train, y_train)

In [None]:
# Mean accuracy of the decision tree on the held-out split.
trees.score(X_test, y_test)

In [None]:
from sklearn.model_selection import GridSearchCV
# Cross-validated grid search over the decision tree's real hyperparameters.
# random_state is a seed, not something to tune, so it is pinned on the estimator
# instead of being searched; max_depth covers 1-10 (which still includes the
# previously tried values 1 and 5) so the search can actually locate a good depth.
params = {'max_depth': list(range(1, 11)), 'criterion': ['entropy', 'gini']}
gridz = GridSearchCV(DecisionTreeClassifier(random_state=20), param_grid=params, refit=True, verbose=3)
gridz.fit(X_train, y_train)

In [None]:
# Parameter combination with the best cross-validated score.
gridz.best_params_

In [None]:
# Estimator refitted on the full training split with the best parameters.
gridz.best_estimator_

In [None]:
# Predict the price range of every row in the (feature-engineered) test set.
# NOTE(review): predictions come from `trees`, the hand-configured tree, not from the
# grid-searched `gridz.best_estimator_` — confirm this is intended.
final_results = trees.predict(test_data)
final_results

In [None]:
# Re-read the raw test file to recover the `id` column, then pair each id with its
# predicted price range.
# NOTE(review): this rebinds `test_data` to the raw frame, so the prediction cell
# cannot be re-run afterwards without repeating the feature-engineering cell.
test_data = pd.read_csv('/kaggle/input/mobile-price-classification/test.csv')
final = pd.DataFrame({'id': test_data['id'], 'price_range': final_results})
final