## RANDOM FOREST REGRESSOR ON 50_STARTUPS DATASET

In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

### READ DATASET --> 50_STARTUPS

In [2]:
import pandas as pd 
df = pd.read_csv('50_Startups.csv')
df.head()

Unnamed: 0,RND,ADMIN,MKT,STATE,PROFIT
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RND     50 non-null     float64
 1   ADMIN   50 non-null     float64
 2   MKT     50 non-null     float64
 3   STATE   50 non-null     object 
 4   PROFIT  50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [4]:
df.isna().sum()

RND       0
ADMIN     0
MKT       0
STATE     0
PROFIT    0
dtype: int64

In [5]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
RND,50.0,73721.6156,45902.256482,0.0,39936.37,73051.08,101602.8,165349.2
ADMIN,50.0,121344.6396,28017.802755,51283.14,103730.875,122699.795,144842.18,182645.56
MKT,50.0,211025.0978,122290.310726,0.0,129300.1325,212716.24,299469.085,471784.1
PROFIT,50.0,112012.6392,40306.180338,14681.4,90138.9025,107978.19,139765.9775,192261.83


### SEPEARATE X AND Y FEATURE

In [6]:
X = df.drop(columns=['PROFIT'])
Y = df[['PROFIT']]

In [7]:
X.head()

Unnamed: 0,RND,ADMIN,MKT,STATE
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [8]:
Y.head()

Unnamed: 0,PROFIT
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


In [9]:
cat = list(X.columns[X.dtypes=='object'])
con = list(X.columns[X.dtypes!='object'])

In [10]:
cat

['STATE']

In [11]:
con

['RND', 'ADMIN', 'MKT']

### PREPROCESSING STEPS

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [13]:
num_pipe = Pipeline(steps=[('impute',SimpleImputer(strategy='median')),
                           ('scaler',StandardScaler())])
cat_pipe = Pipeline(steps=[('impute',SimpleImputer(strategy='most_frequent')),
                           ('ohe',OneHotEncoder(handle_unknown='ignore'))])
pre=ColumnTransformer([('num',num_pipe,con),
                       ('cat',cat_pipe,cat)])

In [14]:
X_pre= pre.fit_transform(X)
X_pre[0:5]

array([[ 2.01641149,  0.56075291,  2.15394309,  0.        ,  0.        ,
         1.        ],
       [ 1.95586034,  1.08280658,  1.9236004 ,  1.        ,  0.        ,
         0.        ],
       [ 1.75436374, -0.72825703,  1.62652767,  0.        ,  1.        ,
         0.        ],
       [ 1.55478369, -0.09636463,  1.42221024,  0.        ,  0.        ,
         1.        ],
       [ 1.5049372 , -1.07991935,  1.28152771,  0.        ,  1.        ,
         0.        ]])

In [15]:
cols=pre.get_feature_names_out()
cols

array(['num__RND', 'num__ADMIN', 'num__MKT', 'cat__STATE_California',
       'cat__STATE_Florida', 'cat__STATE_New York'], dtype=object)

In [16]:
X_pre=pd.DataFrame(X_pre,columns=cols)
X_pre.head()

Unnamed: 0,num__RND,num__ADMIN,num__MKT,cat__STATE_California,cat__STATE_Florida,cat__STATE_New York
0,2.016411,0.560753,2.153943,0.0,0.0,1.0
1,1.95586,1.082807,1.9236,1.0,0.0,0.0
2,1.754364,-0.728257,1.626528,0.0,1.0,0.0
3,1.554784,-0.096365,1.42221,0.0,0.0,1.0
4,1.504937,-1.079919,1.281528,0.0,1.0,0.0


### TRAIN TEST SPLIT

In [17]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest= train_test_split(X_pre,Y,test_size=0.33,random_state=42)

In [18]:
xtrain.shape

(33, 6)

In [19]:
xtest.shape

(17, 6)

### MODEL CREATION --> RANDOM FOREST REGRESSOR

In [20]:
from sklearn.ensemble import RandomForestRegressor
rfr=RandomForestRegressor(random_state=42)
rfr.fit(xtrain,ytrain)

In [21]:
rfr.score(xtrain,ytrain)

0.9917524727694338

In [22]:
rfr.score(xtest,ytest)

0.9376759216401851

### HYPERPARAMETER TUNNING

In [23]:
params= {'max_depth':[3,4,5,6,7,8,9],
         'min_samples_split':[6,7,8,9,10,11,12],
         'criterion':['squared_error','absoulute_error']}

In [24]:
from sklearn.model_selection import GridSearchCV
model=RandomForestRegressor()
gscv= GridSearchCV(model,param_grid=params,cv=5,scoring='r2')
gscv.fit(xtrain,ytrain)

In [25]:
gscv.best_params_

{'criterion': 'squared_error', 'max_depth': 7, 'min_samples_split': 6}

In [26]:
gscv.best_score_

0.8716707798084607

In [27]:
best_rfr= gscv.best_estimator_
best_rfr

In [28]:
best_rfr.score(xtrain,ytrain)

0.9652520144876301

In [29]:
best_rfr.score(xtest,ytest)

0.944303228674771

### PREDICTING THE VALUES

In [30]:
ypred_tr=best_rfr.predict(xtrain)
ypred_ts=best_rfr.predict(xtest)

In [31]:
ypred_tr[0:5]

array([ 47710.38018053,  44298.16162815, 140293.64963878, 149238.08460283,
       119596.36147962])

In [32]:
ytrain.head()

Unnamed: 0,PROFIT
46,49490.75
47,42559.73
15,129917.04
9,149759.96
16,126992.93


In [33]:
ypred_ts[0:5]

array([131900.47446938,  95006.66988983,  99093.19999967,  45710.72312648,
       131707.33900986])

In [34]:
ytest.head()

Unnamed: 0,PROFIT
13,134307.35
39,81005.76
30,99937.59
45,64926.08
17,125370.37


### WITH NEW SAMPLE DATA

In [35]:
x_new= df.sample(10)
x_new

Unnamed: 0,RND,ADMIN,MKT,STATE,PROFIT
47,0.0,135426.92,0.0,California,42559.73
44,22177.74,154806.14,28334.72,California,65200.33
34,46426.07,157693.92,210797.67,California,96712.8
27,72107.6,127864.55,353183.81,New York,105008.31
2,153441.51,101145.55,407934.54,Florida,191050.39
13,91992.39,135495.07,252664.93,California,134307.35
36,28663.76,127056.21,201126.82,Florida,90708.19
12,93863.75,127320.38,249839.44,Florida,141585.52
23,67532.53,105751.03,304768.73,Florida,108733.99
43,15505.73,127382.3,35534.17,New York,69758.98


In [36]:
T = x_new[['PROFIT']]
T

Unnamed: 0,PROFIT
47,42559.73
44,65200.33
34,96712.8
27,105008.31
2,191050.39
13,134307.35
36,90708.19
12,141585.52
23,108733.99
43,69758.98


In [37]:
xnew= x_new.drop(columns=['PROFIT'])
xnew

Unnamed: 0,RND,ADMIN,MKT,STATE
47,0.0,135426.92,0.0,California
44,22177.74,154806.14,28334.72,California
34,46426.07,157693.92,210797.67,California
27,72107.6,127864.55,353183.81,New York
2,153441.51,101145.55,407934.54,Florida
13,91992.39,135495.07,252664.93,California
36,28663.76,127056.21,201126.82,Florida
12,93863.75,127320.38,249839.44,Florida
23,67532.53,105751.03,304768.73,Florida
43,15505.73,127382.3,35534.17,New York


In [38]:
xnew_pre= pre.transform(xnew)
xnew_pre

array([[-1.62236202,  0.50772188, -1.74312698,  1.        ,  0.        ,
         0.        ],
       [-1.13430539,  1.20641936, -1.50907418,  1.        ,  0.        ,
         0.        ],
       [-0.60068212,  1.31053525, -0.00187862,  1.        ,  0.        ,
         0.        ],
       [-0.03551899,  0.23506854,  1.17427116,  0.        ,  0.        ,
         1.        ],
       [ 1.75436374, -0.72825703,  1.62652767,  0.        ,  1.        ,
         0.        ],
       [ 0.4020776 ,  0.51017895,  0.34395679,  1.        ,  0.        ,
         0.        ],
       [-0.99157015,  0.20592469, -0.08176257,  0.        ,  1.        ,
         0.        ],
       [ 0.44325987,  0.21544906,  0.32061744,  0.        ,  1.        ,
         0.        ],
       [-0.13620072, -0.56221127,  0.77434891,  0.        ,  1.        ,
         0.        ],
       [-1.28113364,  0.21768152, -1.44960468,  0.        ,  0.        ,
         1.        ]])

In [39]:
xnew_pre=pd.DataFrame(xnew_pre,columns=cols)
xnew_pre

Unnamed: 0,num__RND,num__ADMIN,num__MKT,cat__STATE_California,cat__STATE_Florida,cat__STATE_New York
0,-1.622362,0.507722,-1.743127,1.0,0.0,0.0
1,-1.134305,1.206419,-1.509074,1.0,0.0,0.0
2,-0.600682,1.310535,-0.001879,1.0,0.0,0.0
3,-0.035519,0.235069,1.174271,0.0,0.0,1.0
4,1.754364,-0.728257,1.626528,0.0,1.0,0.0
5,0.402078,0.510179,0.343957,1.0,0.0,0.0
6,-0.99157,0.205925,-0.081763,0.0,1.0,0.0
7,0.44326,0.215449,0.320617,0.0,1.0,0.0
8,-0.136201,-0.562211,0.774349,0.0,1.0,0.0
9,-1.281134,0.217682,-1.449605,0.0,0.0,1.0


In [40]:
pred = best_rfr.predict(xnew_pre)
pred

array([ 44298.16162815,  68618.93264692,  98619.27816996, 110269.29614514,
       182287.4919169 , 131900.47446938,  80025.26193278, 131900.47446938,
       107599.68597022,  67888.80379692])

In [41]:
x_new['predictions']=pred

In [42]:
x_new

Unnamed: 0,RND,ADMIN,MKT,STATE,PROFIT,predictions
47,0.0,135426.92,0.0,California,42559.73,44298.161628
44,22177.74,154806.14,28334.72,California,65200.33,68618.932647
34,46426.07,157693.92,210797.67,California,96712.8,98619.27817
27,72107.6,127864.55,353183.81,New York,105008.31,110269.296145
2,153441.51,101145.55,407934.54,Florida,191050.39,182287.491917
13,91992.39,135495.07,252664.93,California,134307.35,131900.474469
36,28663.76,127056.21,201126.82,Florida,90708.19,80025.261933
12,93863.75,127320.38,249839.44,Florida,141585.52,131900.474469
23,67532.53,105751.03,304768.73,Florida,108733.99,107599.68597
43,15505.73,127382.3,35534.17,New York,69758.98,67888.803797


In [47]:
x_new.to_csv('predictions.csv',index=False)