In [1]:
# target variable is price_range
# Loading the required library

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
mp = pd.read_csv(r"F:\carreer\trainRF.csv") # loading the data for analysing and data preprocessing
pd.set_option('display.max_rows',500)

In [3]:
mp.shape # to finding the no of data and columns

(2000, 21)

In [4]:
mp.head() # see the data

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [5]:
mp.isnull().sum() # to find the nulls

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [6]:
mp.info() # to check the type of data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_sc

# Data Sampling

In [7]:
from sklearn.model_selection import train_test_split # Sampling the data

In [8]:
mp_train,mp_test=train_test_split(mp,test_size=.2)

In [9]:
mp_train_x = mp_train.iloc[:,0:-1]
mp_train_y = mp_train.iloc[:,-1]

In [10]:
mp_test_x = mp_test.iloc[:,0:-1]
mp_test_y = mp_test.iloc[:,-1]

In [11]:
print(mp_train_x.shape)
print(mp_train_y.shape)
print(mp_test_x.shape)
print(mp_test_x.shape)

(1600, 20)
(1600,)
(400, 20)
(400, 20)


# Building the model

In [1]:
from sklearn.ensemble import RandomForestClassifier
rfc_mp = RandomForestClassifier()

In [14]:
rfc_mp.fit(mp_train_x, mp_train_y)

In [15]:
pred_rfc = rfc_mp.predict(mp_test_x)

In [16]:
from sklearn.metrics import confusion_matrix

In [17]:
tab_rfc = confusion_matrix(mp_test_y , pred_rfc)

In [18]:
tab_rfc # confusion matrix

array([[96,  7,  0,  0],
       [ 4, 84, 16,  0],
       [ 0,  9, 75,  5],
       [ 0,  0, 12, 92]], dtype=int64)

# Accuracy

In [19]:
tab_rfc.diagonal().sum()/ tab_rfc.sum()

0.8675

# feature importance

In [20]:
feature_importance_df = pd.DataFrame({'Featured' :mp_train_x.columns , "Importance" : rfc_mp.feature_importances_})

In [21]:
feature_importance_df.sort_values("Importance" , ascending = False)

Unnamed: 0,Featured,Importance
13,ram,0.475762
0,battery_power,0.070618
12,px_width,0.057038
11,px_height,0.05686
8,mobile_wt,0.04006
6,int_memory,0.039336
16,talk_time,0.030611
10,pc,0.030409
15,sc_w,0.0285
2,clock_speed,0.028483


# tuning the parameter

In [39]:
from sklearn.ensemble import RandomForestClassifier
rfc_mp = RandomForestClassifier(n_estimators= 500 , criterion= 'entropy' , max_depth= 10)
        # here default no of tree is 100 i.e n_estimator --->but use it wisely 
        # though we increase no of trees at beyond a point but it will not affect on model the model preformace will be same
        # but as u increase tree time taken will increase
        # as like decicision treee we can change the criterion 
        # as it is also consist of no of decision tree so gini and entropy hyperparameter are same here

In [40]:
rfc_mp.fit(mp_train_x, mp_train_y) # building the model

In [41]:
pred_rfc = rfc_mp.predict(mp_test_x)

In [42]:
from sklearn.metrics import confusion_matrix

In [43]:
tab_rfc = confusion_matrix(mp_test_y , pred_rfc)

In [44]:
tab_rfc

array([[95,  8,  0,  0],
       [ 3, 85, 16,  0],
       [ 0, 11, 72,  6],
       [ 0,  0, 11, 93]], dtype=int64)

# Accuracy

In [45]:
tab_rfc.diagonal().sum()/ tab_rfc.sum()

0.8625

# Feature Importance

In [37]:
feature_importance_df = pd.DataFrame({'Featured' :mp_train_x.columns , "Importance" : rfc_mp.feature_importances_})

In [38]:
feature_importance_df.sort_values("Importance" , ascending = False)

Unnamed: 0,Featured,Importance
13,ram,0.706099
0,battery_power,0.060425
11,px_height,0.049661
12,px_width,0.047375
8,mobile_wt,0.019407
6,int_memory,0.016602
15,sc_w,0.012952
16,talk_time,0.012247
14,sc_h,0.011568
10,pc,0.011143


# note that dont try to plot the randomforest because it will be created 
# but it will not able to understand properly for humabeing