## RANDOM FOREST REGRESSOR ON 50_STARTUPS DATASET

In [32]:
from warnings import filterwarnings
filterwarnings('ignore')

### READ DATASET --> 50_STARTUPS

In [33]:
import pandas as pd 
df = pd.read_csv('50_Startups.csv')
df.head()

Unnamed: 0,RND,ADMIN,MKT,STATE,PROFIT
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RND     50 non-null     float64
 1   ADMIN   50 non-null     float64
 2   MKT     50 non-null     float64
 3   STATE   50 non-null     object 
 4   PROFIT  50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [35]:
df.isna().sum()

RND       0
ADMIN     0
MKT       0
STATE     0
PROFIT    0
dtype: int64

In [36]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
RND,50.0,73721.6156,45902.256482,0.0,39936.37,73051.08,101602.8,165349.2
ADMIN,50.0,121344.6396,28017.802755,51283.14,103730.875,122699.795,144842.18,182645.56
MKT,50.0,211025.0978,122290.310726,0.0,129300.1325,212716.24,299469.085,471784.1
PROFIT,50.0,112012.6392,40306.180338,14681.4,90138.9025,107978.19,139765.9775,192261.83


### SEPEARATE X AND Y FEATURE

In [37]:
X = df.drop(columns=['PROFIT'])
Y = df[['PROFIT']]

In [38]:
X.head()

Unnamed: 0,RND,ADMIN,MKT,STATE
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [39]:
Y.head()

Unnamed: 0,PROFIT
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


In [40]:
cat = list(X.columns[X.dtypes=='object'])
con = list(X.columns[X.dtypes!='object'])

In [41]:
cat

['STATE']

In [42]:
con

['RND', 'ADMIN', 'MKT']

### PREPROCESSING STEPS

In [43]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [44]:
num_pipe = Pipeline(steps=[('impute',SimpleImputer(strategy='median')),
                           ('scaler',StandardScaler())])
cat_pipe = Pipeline(steps=[('impute',SimpleImputer(strategy='most_frequent')),
                           ('ohe',OneHotEncoder(handle_unknown='ignore'))])
pre=ColumnTransformer([('num',num_pipe,con),
                       ('cat',cat_pipe,cat)])

In [45]:
X_pre= pre.fit_transform(X)
X_pre[0:5]

array([[ 2.01641149,  0.56075291,  2.15394309,  0.        ,  0.        ,
         1.        ],
       [ 1.95586034,  1.08280658,  1.9236004 ,  1.        ,  0.        ,
         0.        ],
       [ 1.75436374, -0.72825703,  1.62652767,  0.        ,  1.        ,
         0.        ],
       [ 1.55478369, -0.09636463,  1.42221024,  0.        ,  0.        ,
         1.        ],
       [ 1.5049372 , -1.07991935,  1.28152771,  0.        ,  1.        ,
         0.        ]])

In [46]:
cols=pre.get_feature_names_out()
cols

array(['num__RND', 'num__ADMIN', 'num__MKT', 'cat__STATE_California',
       'cat__STATE_Florida', 'cat__STATE_New York'], dtype=object)

In [47]:
X_pre=pd.DataFrame(X_pre,columns=cols)
X_pre.head()

Unnamed: 0,num__RND,num__ADMIN,num__MKT,cat__STATE_California,cat__STATE_Florida,cat__STATE_New York
0,2.016411,0.560753,2.153943,0.0,0.0,1.0
1,1.95586,1.082807,1.9236,1.0,0.0,0.0
2,1.754364,-0.728257,1.626528,0.0,1.0,0.0
3,1.554784,-0.096365,1.42221,0.0,0.0,1.0
4,1.504937,-1.079919,1.281528,0.0,1.0,0.0


### TRAIN TEST SPLIT

In [48]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest= train_test_split(X_pre,Y,test_size=0.33,random_state=42)

In [49]:
xtrain.shape

(33, 6)

In [50]:
xtest.shape

(17, 6)

### MODEL CREATION --> RANDOM FOREST REGRESSOR

In [51]:
from sklearn.ensemble import RandomForestRegressor
rfr=RandomForestRegressor(random_state=42)
rfr.fit(xtrain,ytrain)

In [52]:
rfr.score(xtrain,ytrain)

0.9917524727694338

In [53]:
rfr.score(xtest,ytest)

0.9376759216401851

### HYPERPARAMETER TUNNING

In [54]:
params= {'max_depth':[3,4,5,6,7,8,9],
         'min_samples_split':[6,7,8,9,10,11,12],
         'criterion':['squared_error','absoulute_error']}

In [55]:
from sklearn.model_selection import GridSearchCV
model=RandomForestRegressor()
gscv= GridSearchCV(model,param_grid=params,cv=5,scoring='r2')
gscv.fit(xtrain,ytrain)

In [56]:
gscv.best_params_

{'criterion': 'squared_error', 'max_depth': 8, 'min_samples_split': 6}

In [57]:
gscv.best_score_

0.8784276078148581

In [58]:
best_rfr= gscv.best_estimator_
best_rfr

In [59]:
best_rfr.score(xtrain,ytrain)

0.9677286190485443

In [60]:
best_rfr.score(xtest,ytest)

0.9490838274176189

### PREDICTING THE VALUES

#### SO THE BASE MODEL IS BEST 

In [61]:
model.fit(xtrain,ytrain)
ypred_tr=model.predict(xtrain)
ypred_ts=model.predict(xtest)

In [62]:
ypred_tr[0:5]

array([ 46722.6781,  38767.5668, 136546.3766, 147593.662 , 123922.5967])

In [63]:
ytrain.head()

Unnamed: 0,PROFIT
46,49490.75
47,42559.73
15,129917.04
9,149759.96
16,126992.93


In [64]:
ypred_ts[0:5]

array([129614.9115,  94347.2753,  98884.1549,  46120.5096, 125811.6919])

In [65]:
ytest.head()

Unnamed: 0,PROFIT
13,134307.35
39,81005.76
30,99937.59
45,64926.08
17,125370.37


### WITH NEW SAMPLE DATA

In [66]:
x_new= df.sample(10)
x_new

Unnamed: 0,RND,ADMIN,MKT,STATE,PROFIT
44,22177.74,154806.14,28334.72,California,65200.33
17,94657.16,145077.58,282574.31,New York,125370.37
32,63408.86,129219.61,46085.25,California,97427.84
25,64664.71,139553.16,137962.62,California,107404.34
39,38558.51,82982.09,174999.3,California,81005.76
1,162597.7,151377.59,443898.53,California,191792.06
48,542.05,51743.15,0.0,New York,35673.41
14,119943.24,156547.42,256512.92,Florida,132602.65
9,123334.88,108679.17,304981.62,California,149759.96
38,20229.59,65947.93,185265.1,New York,81229.06


In [67]:
T = x_new[['PROFIT']]
T

Unnamed: 0,PROFIT
44,65200.33
17,125370.37
32,97427.84
25,107404.34
39,81005.76
1,191792.06
48,35673.41
14,132602.65
9,149759.96
38,81229.06


In [68]:
xnew= x_new.drop(columns=['PROFIT'])
xnew

Unnamed: 0,RND,ADMIN,MKT,STATE
44,22177.74,154806.14,28334.72,California
17,94657.16,145077.58,282574.31,New York
32,63408.86,129219.61,46085.25,California
25,64664.71,139553.16,137962.62,California
39,38558.51,82982.09,174999.3,California
1,162597.7,151377.59,443898.53,California
48,542.05,51743.15,0.0,New York
14,119943.24,156547.42,256512.92,Florida
9,123334.88,108679.17,304981.62,California
38,20229.59,65947.93,185265.1,New York


In [69]:
xnew_pre= pre.transform(xnew)
xnew_pre

array([[-1.13430539,  1.20641936, -1.50907418,  1.        ,  0.        ,
         0.        ],
       [ 0.46072013,  0.85566632,  0.59101672,  0.        ,  0.        ,
         1.        ],
       [-0.22694868,  0.28392381, -1.36244978,  1.        ,  0.        ,
         0.        ],
       [-0.19931169,  0.65648914, -0.60351673,  1.        ,  0.        ,
         0.        ],
       [-0.77382036, -1.38312156, -0.29758328,  1.        ,  0.        ,
         0.        ],
       [ 1.95586034,  1.08280658,  1.9236004 ,  1.        ,  0.        ,
         0.        ],
       [-1.61043334, -2.50940884, -1.74312698,  0.        ,  0.        ,
         1.        ],
       [ 1.01718075,  1.26919939,  0.37574227,  0.        ,  1.        ,
         0.        ],
       [ 1.09181921, -0.45664025,  0.77610744,  1.        ,  0.        ,
         0.        ],
       [-1.17717755, -1.99727037, -0.21278487,  0.        ,  0.        ,
         1.        ]])

In [70]:
xnew_pre=pd.DataFrame(xnew_pre,columns=cols)
xnew_pre

Unnamed: 0,num__RND,num__ADMIN,num__MKT,cat__STATE_California,cat__STATE_Florida,cat__STATE_New York
0,-1.134305,1.206419,-1.509074,1.0,0.0,0.0
1,0.46072,0.855666,0.591017,0.0,0.0,1.0
2,-0.226949,0.283924,-1.36245,1.0,0.0,0.0
3,-0.199312,0.656489,-0.603517,1.0,0.0,0.0
4,-0.77382,-1.383122,-0.297583,1.0,0.0,0.0
5,1.95586,1.082807,1.9236,1.0,0.0,0.0
6,-1.610433,-2.509409,-1.743127,0.0,0.0,1.0
7,1.017181,1.269199,0.375742,0.0,1.0,0.0
8,1.091819,-0.45664,0.776107,1.0,0.0,0.0
9,-1.177178,-1.99727,-0.212785,0.0,0.0,1.0


In [71]:
E =model.fit(xtrain,ytrain)
pred = model.predict(xnew_pre)
pred

array([ 65667.7432, 130119.6048,  96927.8009, 101489.8969,  92956.5417,
       188804.8805,  37443.8825, 136939.8441, 148304.813 ,  76063.0375])

In [72]:
x_new['predictions']=pred

In [73]:
x_new

Unnamed: 0,RND,ADMIN,MKT,STATE,PROFIT,predictions
44,22177.74,154806.14,28334.72,California,65200.33,65667.7432
17,94657.16,145077.58,282574.31,New York,125370.37,130119.6048
32,63408.86,129219.61,46085.25,California,97427.84,96927.8009
25,64664.71,139553.16,137962.62,California,107404.34,101489.8969
39,38558.51,82982.09,174999.3,California,81005.76,92956.5417
1,162597.7,151377.59,443898.53,California,191792.06,188804.8805
48,542.05,51743.15,0.0,New York,35673.41,37443.8825
14,119943.24,156547.42,256512.92,Florida,132602.65,136939.8441
9,123334.88,108679.17,304981.62,California,149759.96,148304.813
38,20229.59,65947.93,185265.1,New York,81229.06,76063.0375


In [74]:
x_new.to_csv('predictions.csv',index=False)