# Data Exploration and Prediction using Random Forest Classification/XGBoost (90%)


This notebook first looks at the correlations and distributions of the features, and then predicts the price range using random forest classification and XGBoost.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import math
import statistics as stats
import urllib
import seaborn as sns

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
# Read the held-out test file and the labelled training file from the input folder.
test = pd.read_csv("../input/mobile-price-classification/test.csv")
data = pd.read_csv("../input/mobile-price-classification/train.csv")

# Preview the first five training rows.
data.head(n=5)


In [None]:
# Print the column names, non-null counts and dtypes of the training frame.
data.info()

In [None]:
# Number of missing values in each column of the training frame.
missing_mask = data.isna()
missing_mask.sum()

**Plot the correlation table**

In [None]:
# Pairwise correlation between all columns of the training frame
corr = data.corr()

# Boolean mask that is True on and above the diagonal, so that only the
# lower triangle of the (symmetric) matrix is drawn
mask = np.triu(np.ones_like(corr, dtype=bool))

# One large square canvas for the heatmap
f, ax = plt.subplots(figsize=(15, 15))

# Diverging colormap between hue 250 (negative end) and hue 10 (positive end)
cmap = sns.diverging_palette(h_neg=250, h_pos=10, as_cmap=True)

# Draw the masked heatmap on the axes created above, centred on zero
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    center=0,
    vmax=.3,
    square=True,
    annot=False,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax,
)

Battery power, ram, and the phone height & width have a high correlation with price_range. However, phone height and phone width are also highly correlated with each other, so to prevent multicollinearity only one of the height and width should be selected as a variable.

The plots below look at the distribution of each variable for each price range.

In [None]:
# Violin plot of battery_power, one violin per price_range class.
viz_1 = sns.violinplot(x='price_range', y='battery_power', data=data)
viz_1.set_title('Density and distribution of battery_power for each price_range')

In [None]:
# Mean of the `blue` column within each price_range, as a two-column frame.
# NOTE(review): presumably `blue` is a 0/1 flag, making the mean a share - confirm.
blue_mean_by_price = data.groupby(['price_range'])['blue'].mean()
grouped = blue_mean_by_price.to_frame().reset_index()

# One bar per price class, ticks at 0..3 with vertical labels.
plt.bar(grouped['price_range'], grouped['blue'])
plt.xticks(np.arange(4), rotation=90)
plt.title('Percentage of Bluetooth for Each Price_range')
plt.show()

In [None]:
# Box plot of clock_speed over the whole training set.
plt.boxplot(data['clock_speed'])
plt.show()

# The same column again, now as one violin per price_range class.
viz_2 = sns.violinplot(x='price_range', y='clock_speed', data=data)
viz_2.set_title('Density and distribution of clock_speed for each price_range')

In [None]:
# Mean of the `dual_sim` column within each price_range, as a two-column frame.
dual_sim_mean_by_price = data.groupby(['price_range'])['dual_sim'].mean()
grouped1 = dual_sim_mean_by_price.to_frame().reset_index()

# One bar per price class, ticks at 0..3 with vertical labels.
plt.bar(grouped1['price_range'], grouped1['dual_sim'])
plt.xticks(np.arange(4), rotation=90)
plt.title('Percentage of Dual_sim for Each Price_range')
plt.show()

In [None]:
# Box plot of the `fc` column over the whole training set.
plt.boxplot(data['fc'])
plt.show()

# Violin plot of `fc`, one violin per price_range class.
# The title now names the column that is actually plotted; it previously
# said "clock_speed", left over from the clock_speed cell.
viz_3=sns.violinplot(data=data, x='price_range', y='fc')
viz_3.set_title('Density and distribution of fc for each price_range')

In [None]:
# Mean of the `four_g` column within each price_range, as a two-column frame.
four_g_mean_by_price = data.groupby(['price_range'])['four_g'].mean()
grouped2 = four_g_mean_by_price.to_frame().reset_index()

# One bar per price class, ticks at 0..3 with vertical labels.
plt.bar(grouped2['price_range'], grouped2['four_g'])
plt.xticks(np.arange(4), rotation=90)
plt.title('Percentage of 4g for Each Price_range')
plt.show()

In [None]:
# Box plot of the `int_memory` column over the whole training set.
plt.boxplot(data['int_memory'])
plt.show()

# Violin plot of `int_memory`, one violin per price_range class.
# The title now names the column that is actually plotted; it previously
# said "clock_speed", left over from the clock_speed cell.
viz_4=sns.violinplot(data=data, x='price_range', y='int_memory')
viz_4.set_title('Density and distribution of int_memory for each price_range')

In [None]:
# Box plot of m_dep over the whole training set.
plt.boxplot(data['m_dep'])
plt.show()

# The same column again, now as one violin per price_range class.
viz_5 = sns.violinplot(x='price_range', y='m_dep', data=data)
viz_5.set_title('Density and distribution of mobile depth for each price_range')

In [None]:
# Box plot of the `mobile_wt` column over the whole training set.
plt.boxplot(data['mobile_wt'])
plt.show()

# Violin plot of `mobile_wt`, one violin per price_range class.
# A violin plot shows a density, not a percentage, so the title uses the
# same "Density and distribution" wording as the other violin plots.
viz_6=sns.violinplot(data=data, x='price_range', y='mobile_wt')
viz_6.set_title('Density and distribution of mobile weight for each price_range')

In [None]:
# Box plot of the `n_cores` column over the whole training set.
plt.boxplot(data['n_cores'])
plt.show()

# Violin plot of `n_cores`, one violin per price_range class.
# The title now names the column that is actually plotted; it previously
# said "mobile weight", left over from the mobile_wt cell.
viz_7=sns.violinplot(data=data, x='price_range', y='n_cores')
viz_7.set_title('Density and distribution of n_cores for each price_range')

In [None]:
# Box plot of pc over the whole training set.
plt.boxplot(data['pc'])
plt.show()

# The same column again, now as one violin per price_range class.
viz_8 = sns.violinplot(x='price_range', y='pc', data=data)
viz_8.set_title('Density and distribution of primary_camera for each price_range')

In [None]:
# Box plot of px_width over the whole training set.
plt.boxplot(data['px_width'])
plt.show()

# The same column again, now as one violin per price_range class.
viz_9 = sns.violinplot(x='price_range', y='px_width', data=data)
viz_9.set_title('Density and distribution of width for each price_range')

In [None]:
# Box plot of px_height over the whole training set.
plt.boxplot(data['px_height'])
plt.show()

# The same column again, now as one violin per price_range class.
# (This reuses the name viz_9, which the px_width cell also assigns.)
viz_9 = sns.violinplot(x='price_range', y='px_height', data=data)
viz_9.set_title('Density and distribution of height for each price_range')

In [None]:
# Box plot of ram over the whole training set.
plt.boxplot(data['ram'])
plt.show()

# The same column again, now as one violin per price_range class.
viz_10 = sns.violinplot(x='price_range', y='ram', data=data)
viz_10.set_title('Density and distribution of ram for each price_range')

In [None]:
# Box plot of sc_h over the whole training set.
plt.boxplot(data['sc_h'])
plt.show()

# The same column again, now as one violin per price_range class.
# (This reuses the name viz_10, which the ram cell also assigns.)
viz_10 = sns.violinplot(x='price_range', y='sc_h', data=data)
viz_10.set_title('Density and distribution of screen height for each price_range')

In [None]:
# Box plot of sc_w over the whole training set.
plt.boxplot(data['sc_w'])
plt.show()

# The same column again, now as one violin per price_range class.
viz_11 = sns.violinplot(x='price_range', y='sc_w', data=data)
viz_11.set_title('Density and distribution of screen width for each price_range')

In [None]:
# Box plot of talk_time over the whole training set.
plt.boxplot(data['talk_time'])
plt.show()

# The same column again, now as one violin per price_range class.
viz_12 = sns.violinplot(x='price_range', y='talk_time', data=data)
viz_12.set_title('Density and distribution of talk time for each price_range')

In [None]:
# Mean of the `three_g` column within each price_range, as a two-column frame.
three_g_mean_by_price = data.groupby(['price_range'])['three_g'].mean()
grouped3 = three_g_mean_by_price.to_frame().reset_index()

# One bar per price class, ticks at 0..3 with vertical labels.
plt.bar(grouped3['price_range'], grouped3['three_g'])
plt.xticks(np.arange(4), rotation=90)
plt.title('Percentage of 3g for Each Price_range')
plt.show()

In [None]:
# Mean of the `touch_screen` column within each price_range, as a two-column frame.
touch_mean_by_price = data.groupby(['price_range'])['touch_screen'].mean()
grouped4 = touch_mean_by_price.to_frame().reset_index()

# One bar per price class, ticks at 0..3 with vertical labels.
plt.bar(grouped4['price_range'], grouped4['touch_screen'])
plt.xticks(np.arange(4), rotation=90)
plt.title('Percentage of touch_screen for Each Price_range')
plt.show()

In [None]:
# Mean of the `wifi` column within each price_range, as a two-column frame.
wifi_mean_by_price = data.groupby(['price_range'])['wifi'].mean()
grouped5 = wifi_mean_by_price.to_frame().reset_index()

# One bar per price class, ticks at 0..3 with vertical labels.
plt.bar(grouped5['price_range'], grouped5['wifi'])
plt.xticks(np.arange(4), rotation=90)
plt.title('Percentage of wifi for Each Price_range')
plt.show()

In [None]:
# Take the last column of the correlation matrix (sorted below by its
# 'price_range' label), order it from highest to lowest, and remove the
# target's own row so that only the other columns remain.
y_corr = (
    corr.iloc[:, [-1]]
    .sort_values(by='price_range', ascending=False)
    .drop(index=['price_range'])
)
top_3_X = y_corr.head(3)

# Keep only the three selected columns plus the last column of `data`;
# the surviving columns stay in their original order.
selected_names = set(top_3_X.index)
unselected_names = [name for name in data.columns[:-1] if name not in selected_names]
dataset = data.drop(columns=unselected_names)

top_3_X

The 3 variables with the highest correlation with price_range are selected. Examining the plots of these 3 variables shows that their distributions differ across the price ranges. The difference is most notable for the ram variable.

In [None]:
from sklearn.model_selection import train_test_split

# All columns except the last are used as inputs; the last column is the target.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Hold out 20% of the rows for validation, with a fixed seed for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)


In [None]:
# Fit a random forest *classifier* on the training split.
# The variable keeps its historical name `regressor` because another cell
# refers to it by that name.

from sklearn.ensemble import RandomForestClassifier
regressor = RandomForestClassifier(n_estimators = 100, random_state = 0)
regressor.fit(X_train, y_train)

# Predict both splits so that training and validation accuracy can be compared
predict_train = regressor.predict(X_train)
predict_val = regressor.predict(X_val)

# Row-level tables of true label, predicted label and whether they match,
# plus a count of correct/incorrect rows for each split
predict_train_table = pd.DataFrame({'y_train':y_train, 'predict_train':predict_train, 'Correctness': y_train==predict_train})
predict_train_correctness = pd.DataFrame(predict_train_table.groupby(['Correctness'])['y_train'].count()).reset_index()
predict_val_table = pd.DataFrame({'y_val':y_val, 'predict_val':predict_val, 'Correctness': y_val==predict_val})
predict_val_correctness = pd.DataFrame(predict_val_table.groupby(['Correctness'])['y_val'].count()).reset_index()

# Accuracy is the share of rows predicted correctly. The mean of the boolean
# column gives a plain float. The previous code formatted a one-element
# Series with '%.2f', which relies on an implicit float(Series) conversion
# and fails outright when a split has no correct rows (empty Series).
trainScore = float(predict_train_table['Correctness'].mean())
valScore = float(predict_val_table['Correctness'].mean())

print('Train Score: %.2f Accuracy' % (trainScore))
print('Val Score: %.2f Accuracy' % (valScore))

In [None]:
# Fit an XGBoost classifier on the same training split
from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(X_train, y_train)

# Predict both splits with the XGBoost model fitted above. The previous code
# called `regressor.predict`, i.e. the random forest from the earlier cell,
# so the scores printed here were not XGBoost's.
predict_train = classifier.predict(X_train)
predict_val = classifier.predict(X_val)

# Row-level tables of true label, predicted label and whether they match,
# plus a count of correct/incorrect rows for each split
predict_train_table = pd.DataFrame({'y_train':y_train, 'predict_train':predict_train, 'Correctness': y_train==predict_train})
predict_train_correctness = pd.DataFrame(predict_train_table.groupby(['Correctness'])['y_train'].count()).reset_index()
predict_val_table = pd.DataFrame({'y_val':y_val, 'predict_val':predict_val, 'Correctness': y_val==predict_val})
predict_val_correctness = pd.DataFrame(predict_val_table.groupby(['Correctness'])['y_val'].count()).reset_index()

# Accuracy is the share of rows predicted correctly. The mean of the boolean
# column gives a plain float, instead of a one-element Series that has to be
# implicitly converted by '%.2f' (and is empty when no row is correct).
trainScore = float(predict_train_table['Correctness'].mean())
valScore = float(predict_val_table['Correctness'].mean())

print('Train Score: %.2f Accuracy' % (trainScore))
print('Val Score: %.2f Accuracy' % (valScore))

Both the random forest classification model and the XGBoost model reach a validation score of about 90%.