In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory so the
# available datasets (and their exact paths) are visible in the cell output.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Read the two CSV files of the mobile-price-classification dataset into
# DataFrames; `train` is explored and fitted on, `test` is only predicted on.
train = pd.read_csv(
    filepath_or_buffer="/kaggle/input/mobile-price-classification/train.csv"
)
test = pd.read_csv(
    filepath_or_buffer="/kaggle/input/mobile-price-classification/test.csv"
)

In [None]:
# isna() flags every missing cell; the first .sum() totals them per column and
# the second .sum() adds the column totals into one number per DataFrame.
print('Number of NULL values in training data ', train.isna().sum().sum())
print('Number of NULL values in testing data ', test.isna().sum().sum())

In [None]:
# Preview the first five rows of the training data.
train.head(n=5)

# Identifying feature relationship with price

**Battery And Size Analysis**

In [None]:
# Battery power distribution, one facet per price_range, shown as percentages
# with a box plot drawn in the margin of each histogram.
px.histogram(
    train,
    x='battery_power',
    facet_col='price_range',
    histnorm='percent',
    nbins=10,
    marginal='box',
)

The low-cost phones (price_range=0) are skewed towards the low battery power region, and the high-cost phones (price_range=3) are skewed towards the higher battery power region.

In [None]:
# Talk time distribution, one facet per price_range, shown as percentages
# with a box plot drawn in the margin of each histogram.
px.histogram(
    train,
    x='talk_time',
    facet_col='price_range',
    histnorm='percent',
    nbins=10,
    marginal='box',
)

The talk time on a single charge is evenly distributed across all price ranges, and this makes sense: phones with more features consume the battery much more quickly than their counterparts, and hence the talk time ends up similar.

In [None]:
# Work on an independent copy: a plain `temp=train` only binds a second name
# to the same DataFrame, so every exploratory column added to `temp`
# (px_active, screen_area, ram_gb, Pred) would silently appear in `train` too.
temp=train.copy()
# Total pixel count of the display: pixel height times pixel width.
temp['px_active']=(temp['px_height']*temp['px_width'])

In [None]:
# Battery power against active pixel count, one facet per price_range;
# points are coloured by talk_time and a box plot of px_active sits on top.
px.scatter(
    temp,
    x='px_active',
    y='battery_power',
    color='talk_time',
    facet_col='price_range',
    marginal_x='box',
)

From the above, a few things are worth noting:
1. Many higher priced phones have more than 2M active pixels compared to the other categories, which would mean that they either have a larger screen or a better resolution.
2. In the low-cost category there are a few phones with low battery power and very high talk time. These could be the older mobile phones with a keypad.
3. As intuition suggests, a few of the higher priced phones have it all: more pixels, a large battery, a longer talk time and, of course, a higher price.

In [None]:
# Screen area as the product of the sc_h and sc_w columns.
temp['screen_area'] = temp['sc_h'].mul(temp['sc_w'])
# Active pixels against screen area, faceted by price_range (columns) and
# touch_screen (rows), coloured by battery_power.
px.scatter(
    temp,
    x='screen_area',
    y='px_active',
    color='battery_power',
    facet_col='price_range',
    facet_row='touch_screen',
)

**Camera Analysis**

In [None]:
# Distribution of the `pc` column (called the primary camera in the
# notebook's narrative), one facet per price_range, as percentages.
px.histogram(
    temp,
    x='pc',
    facet_col='price_range',
    nbins=10,
    histnorm='percent',
)

In [None]:
# Distribution of the `fc` column (called the front camera in the
# notebook's narrative), one facet per price_range, as percentages.
px.histogram(
    temp,
    x='fc',
    facet_col='price_range',
    nbins=10,
    histnorm='percent',
)

Well, one thing we can say is that high-megapixel cameras, i.e. above 20 MP for the primary camera and above 14 MP for the front camera, are very hard to find in any price range.

In [None]:
# Front camera (`fc`) against primary camera (`pc`), coloured by price_range.
px.scatter(
    temp,
    x='pc',
    y='fc',
    color='price_range',
)

Here's something interesting in the above scatterplot:
1. The phone with the best combination of primary and front camera falls in the low price region (the point at the top right corner).
2. The phone with no camera (the point at the bottom left corner) falls in the higher priced category.

In [None]:
# Row(s) at the top-right corner of the camera scatter: fc == 19 and pc == 20.
print("The low priced phone with the best camera")
print(temp.loc[temp['fc'].eq(19) & temp['pc'].eq(20)])
# Rows in the highest price class (price_range == 3) whose two camera columns
# are both zero; only the columns relevant to the question are printed.
print("The high priced phone with no camera!?")
print(
    temp.loc[
        temp['fc'].eq(0) & temp['pc'].eq(0) & temp['price_range'].eq(3),
        ['pc', 'fc', 'price_range', 'touch_screen'],
    ]
)

In [None]:
# Count the rows with price_range == 3 whose fc and pc are both zero.
# The mask is built from `temp` itself (the original mixed `train` columns
# into a `temp` lookup, which only works while both frames share an index).
print("Number of very high priced phones with no camera ")
print(temp[(temp['fc']==0) & (temp['pc']==0) & (temp['price_range']==3)].shape[0])


**21 Very Highly priced mobile phones have no camera??? Doesn't make sense.....**

**RAM and storage analysis**

In [None]:
# Rescale `ram` by 1024 (presumably MB -> GB; confirm against the dataset
# description) so the values line up with the integer cut-offs used later.
temp['ram_gb'] = temp['ram'].div(1024)

In [None]:
# RAM distribution, one facet per price_range, with a marginal box plot.
px.histogram(
    temp,
    x='ram_gb',
    facet_col='price_range',
    marginal='box',
)

RAM is the clearest indicator for determining the price of the phone.

In [None]:
# Internal memory against RAM, coloured by price_range and faceted by
# touch_screen (columns) and four_g (rows).
px.scatter(
    temp,
    x='ram_gb',
    y='int_memory',
    color='price_range',
    facet_col='touch_screen',
    facet_row='four_g',
)

Clear distribution is visible here in all the plots and RAM alone might be enough to predict the price_range.

In [None]:
def baseline(x, thresholds=(1, 2, 3)):
    """Rule-based price_range guess from a single numeric value (RAM in GB).

    The predicted class is the number of thresholds that ``x`` strictly
    exceeds. With the default ``(1, 2, 3)`` this reproduces the original
    cut-offs: ``x <= 1`` -> 0, ``1 < x <= 2`` -> 1, ``2 < x <= 3`` -> 2,
    ``x > 3`` -> 3.

    Parameters
    ----------
    x : float
        Value to classify.
    thresholds : iterable of float, optional
        Upper (inclusive) bounds of every class except the last one.

    Returns
    -------
    int or None
        The class index, or ``None`` when ``x`` is NaN. The original
        if/elif chain fell through to an implicit ``None`` in that case;
        it is kept, but made explicit.
    """
    # NaN is the only value that is not equal to itself.
    if x != x:
        return None
    # Counting the exceeded bounds replaces the redundant chained range checks.
    return sum(1 for bound in thresholds if x > bound)


In [None]:
# Apply the rule-based classifier to every ram_gb value; the function is
# passed directly, no lambda wrapper is needed.
temp['Pred'] = temp['ram_gb'].apply(baseline)

In [None]:
# Share of rows where the RAM-only rule matches the true price_range.
accuracy_score(y_true=temp['price_range'], y_pred=temp['Pred'])

The baseline model without any use of machine learning has an accuracy of 0.74 and that is quite good considering the fact that if we constantly predict a single class the accuracy will be 0.25. 

# Predicting the price range 

In [None]:
# Show the column names of `train` before picking the model features.
train.columns

The features that a user generally considers before buying a mobile phone are the features needed to predict the price range.
1. battery
2. RAM
3. pixel height-width
4. number of cores
5. memory 
6. camera

In [None]:
# Add the same derived columns to both frames so they keep identical schemas:
#   ram_gb    - ram divided by 1024
#   px_active - px_height * px_width
#   cam_total - fc + pc (not part of the feature list selected for the model)
for frame in (train, test):
    frame['ram_gb'] = frame['ram'] / 1024
    frame['px_active'] = frame['px_height'] * frame['px_width']
    frame['cam_total'] = frame['fc'] + frame['pc']

Normalizing the features and creating new features.

In [None]:
# One shared column list, so the train and test matrices cannot drift apart
# (the original repeated the list literally for each frame).
feature_columns=['battery_power','ram_gb','fc','pc','int_memory','n_cores','px_active','touch_screen','four_g']
# .copy() detaches the matrices from `train`/`test`, so later column
# assignments on X / X_test do not raise pandas' SettingWithCopyWarning.
X=train[feature_columns].copy()
# Target: the price class to predict.
y=train['price_range']
X_test=test[feature_columns].copy()

In [None]:
# Divide battery_power by its maximum in the training matrix; the same divisor
# is reused for the test matrix so both are on one scale.
# Series.max() is the vectorised equivalent of the builtin max() used before.
i=X['battery_power'].max()
# assign() returns a new frame instead of writing into a slice of
# `train`/`test`, which avoids pandas' SettingWithCopyWarning.
X=X.assign(battery_power=X['battery_power']/i)
X_test=X_test.assign(battery_power=X_test['battery_power']/i)

In [None]:
# Hold out 25% of the rows for validation. A fixed random_state makes the
# split (and the scores reported from it) reproducible across runs, and
# stratify=y keeps the price_range class proportions equal in both parts.
X_train,X_valid,y_train,y_valid=train_test_split(X,y,test_size=0.25,random_state=42,stratify=y)

In [None]:
# Random forest with depth and split-size limits to curb overfitting.
# random_state pins the bootstrap samples and feature sub-sampling, so
# re-running the cell gives the same model and the same scores.
forest=RandomForestClassifier(max_depth=8,min_samples_split=15,n_estimators=250,random_state=42)
forest.fit(X_train,y_train)

These are the parameters that I reached after running the model a few times and trying to avoid overfitting.

In [None]:
# Mean accuracy on the rows the forest was fitted on (an optimistic figure).
print("The score on the training data ", forest.score(X=X_train, y=y_train))

In [None]:
# Predict the held-out rows and compare with their true price_range.
pred = forest.predict(X=X_valid)
print('The score on the validation set', accuracy_score(y_true=y_valid, y_pred=pred))

Training the model on the whole training set.

In [None]:
# Refit a fresh forest with the same hyper-parameters on all labelled rows
# (train + validation) before predicting the test set. random_state keeps the
# final model reproducible across runs.
forest=RandomForestClassifier(max_depth=8,min_samples_split=15,n_estimators=250,random_state=42)
forest.fit(X,y)

In [None]:
# Mean accuracy of the refitted forest on the data it was trained on.
print('Score on the whole training data', forest.score(X=X, y=y))

Prediction on the test set..

In [None]:
# Predicted price_range for every row of the test feature matrix.
prediction = forest.predict(X=X_test)