In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
train_df=pd.read_csv('../input/30-days-of-ml-2/training.csv')
test_df=pd.read_csv('../input/30-days-of-ml-2/test.csv')
sample_df=pd.read_csv('../input/30-days-of-ml-2/sample.csv')

In [2]:
train_df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15761950,Female,48,138000,1
1,15782530,Female,33,113000,0
2,15588080,Female,53,104000,1
3,15589449,Male,39,106000,1
4,15782806,Female,27,31000,0


In [3]:
train_df.shape

(320, 5)

In [4]:
test_df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary
0,15657163,Male,35,58000
1,15577514,Male,43,129000
2,15807909,Male,19,85000
3,15646091,Female,46,32000
4,15730688,Male,41,52000


In [5]:
test_df.shape

(80, 4)

In [6]:
sample_df.head()

Unnamed: 0,User ID,Purchased
0,15657163,0
1,15577514,0
2,15807909,0
3,15646091,0
4,15730688,0


In [7]:
sample_df['Purchased'].value_counts()

0    80
Name: Purchased, dtype: int64

In [8]:
train_df.tail(1)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
319,15750447,Male,37,70000,1


In [9]:
train_df=train_df.drop(columns='User ID',axis=1)
train_df.tail()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
315,Male,35,55000,0
316,Female,53,82000,1
317,Male,38,71000,0
318,Male,35,50000,0
319,Male,37,70000,1


In [10]:
from sklearn.preprocessing import OneHotEncoder
oh=OneHotEncoder(drop='first',sparse=False)

In [11]:
train_df['Gender']=oh.fit_transform(train_df[['Gender']])
train_df.head()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,0.0,48,138000,1
1,0.0,33,113000,0
2,0.0,53,104000,1
3,1.0,39,106000,1
4,0.0,27,31000,0


In [12]:
x=train_df.drop(columns='Purchased',axis=1)
y=train_df['Purchased']

In [13]:
x

Unnamed: 0,Gender,Age,EstimatedSalary
0,0.0,48,138000
1,0.0,33,113000
2,0.0,53,104000
3,1.0,39,106000
4,0.0,27,31000
...,...,...,...
315,1.0,35,55000
316,0.0,53,82000
317,1.0,38,71000
318,1.0,35,50000


In [14]:
y

0      1
1      0
2      1
3      1
4      0
      ..
315    0
316    1
317    0
318    0
319    1
Name: Purchased, Length: 320, dtype: int64

In [15]:
test_df.head(1)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary
0,15657163,Male,35,58000


In [16]:
test_df=test_df.drop(columns='User ID',axis=1)
test_df.head(1)

Unnamed: 0,Gender,Age,EstimatedSalary
0,Male,35,58000


In [17]:
test_df['Gender']=oh.transform(test_df[['Gender']])
test_df.head(1)

Unnamed: 0,Gender,Age,EstimatedSalary
0,1.0,35,58000


In [18]:
train_df['Purchased'].value_counts(normalize=True)

0    0.64375
1    0.35625
Name: Purchased, dtype: float64

In [19]:
from sklearn.ensemble import RandomForestClassifier
rfg=RandomForestClassifier()

In [20]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25)
rfg.fit(x_train,y_train)
y_pred=rfg.predict(x_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.8875

# #Using random search CV

In [21]:
from sklearn.model_selection import RandomizedSearchCV

In [22]:
n_estimators=[int(x) for x in np.linspace(50,1000,10)]
max_features=[0.2,0.6,1.0]
max_depth=[2,6,8,None]
bootstrap=[True,False]
max_samples=[0.5,0.75,1.0]

In [23]:
param_grid={'n_estimators':n_estimators,
            'max_features':max_features,
            'max_depth':max_depth,
            'bootstrap':bootstrap,
            'max_samples':max_samples

         }

In [24]:
print(param_grid)

{'n_estimators': [50, 155, 261, 366, 472, 577, 683, 788, 894, 1000], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 6, 8, None], 'bootstrap': [True, False], 'max_samples': [0.5, 0.75, 1.0]}


In [25]:
random_grid=RandomizedSearchCV(estimator=rfg,param_distributions=param_grid,cv=3,verbose=2,n_jobs=-1)

In [26]:
import warnings
warnings.filterwarnings('ignore')
random_grid.fit(x_train,y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   10.2s finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [2, 6, 8, None],
                                        'max_features': [0.2, 0.6, 1.0],
                                        'max_samples': [0.5, 0.75, 1.0],
                                        'n_estimators': [50, 155, 261, 366, 472,
                                                         577, 683, 788, 894,
                                                         1000]},
                   verbose=2)

In [27]:
random_grid.best_params_

{'n_estimators': 577,
 'max_samples': 0.5,
 'max_features': 0.2,
 'max_depth': 2,
 'bootstrap': False}

In [28]:
random_grid.best_score_

0.8875000000000001

In [29]:
best_random_grid=random_grid.best_estimator_
best_random_grid

RandomForestClassifier(bootstrap=False, max_depth=2, max_features=0.2,
                       max_samples=0.5, n_estimators=577)

In [30]:
y_pred1=best_random_grid.predict(x_test)
accuracy_score(y_test,y_pred1)

0.9125

# #Using Grid search CV

In [31]:
np.arange(5,500,50)

array([  5,  55, 105, 155, 205, 255, 305, 355, 405, 455])

In [32]:
from sklearn.model_selection import GridSearchCV
n_estimators=[int(x) for x in np.linspace(5,300,6)]
max_features=[0.2,0.4]
max_samples=[0.50,0.70]
max_depth=[int(x) for x in np.linspace(1,10,3)]

In [33]:
param_grid={
    'n_estimators':n_estimators,
    'max_features':max_features,
    'max_samples':max_samples,
    'max_depth':max_depth
        }

In [34]:
print(param_grid)

{'n_estimators': [5, 64, 123, 182, 241, 300], 'max_features': [0.2, 0.4], 'max_samples': [0.5, 0.7], 'max_depth': [1, 5, 10]}


In [35]:
rfg_grid=GridSearchCV(estimator=rfg,param_grid=param_grid,cv=3,verbose=2)

In [36]:
rfg_grid.fit(x_train,y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits
[CV] max_depth=1, max_features=0.2, max_samples=0.5, n_estimators=5 ..
[CV]  max_depth=1, max_features=0.2, max_samples=0.5, n_estimators=5, total=   0.0s
[CV] max_depth=1, max_features=0.2, max_samples=0.5, n_estimators=5 ..
[CV]  max_depth=1, max_features=0.2, max_samples=0.5, n_estimators=5, total=   0.0s
[CV] max_depth=1, max_features=0.2, max_samples=0.5, n_estimators=5 ..
[CV]  max_depth=1, max_features=0.2, max_samples=0.5, n_estimators=5, total=   0.0s
[CV] max_depth=1, max_features=0.2, max_samples=0.5, n_estimators=64 .
[CV]  max_depth=1, max_features=0.2, max_samples=0.5, n_estimators=64, total=   0.1s
[CV] max_depth=1, max_features=0.2, max_samples=0.5, n_estimators=64 .


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV]  max_depth=1, max_features=0.2, max_samples=0.5, n_estimators=64, total=   0.1s
[CV] max_depth=1, max_features=0.2, max_samples=0.5, n_estimators=64 .
[CV]  max_depth=1, max_features=0.2, max_samples=0.5, n_estimators=64, total=   0.1s
[CV] max_depth=1, max_features=0.2, max_samples=0.5, n_estimators=123 
[CV]  max_depth=1, max_features=0.2, max_samples=0.5, n_estimators=123, total=   0.2s
[CV] max_depth=1, max_features=0.2, max_samples=0.5, n_estimators=123 
[CV]  max_depth=1, max_features=0.2, max_samples=0.5, n_estimators=123, total=   0.2s
[CV] max_depth=1, max_features=0.2, max_samples=0.5, n_estimators=123 
[CV]  max_depth=1, max_features=0.2, max_samples=0.5, n_estimators=123, total=   0.2s
[CV] max_depth=1, max_features=0.2, max_samples=0.5, n_estimators=182 
[CV]  max_depth=1, max_features=0.2, max_samples=0.5, n_estimators=182, total=   0.3s
[CV] max_depth=1, max_features=0.2, max_samples=0.5, n_estimators=182 
[CV]  max_depth=1, max_features=0.2, max_samples=0.5, n_esti

[Parallel(n_jobs=1)]: Done 216 out of 216 | elapsed:  1.0min finished


GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [1, 5, 10], 'max_features': [0.2, 0.4],
                         'max_samples': [0.5, 0.7],
                         'n_estimators': [5, 64, 123, 182, 241, 300]},
             verbose=2)

In [37]:
rfg_grid.best_params_

{'max_depth': 5, 'max_features': 0.2, 'max_samples': 0.5, 'n_estimators': 64}

In [38]:
rfg_grid.best_score_

0.8916666666666666

In [39]:
best_grid=rfg_grid.best_estimator_
best_grid

RandomForestClassifier(max_depth=5, max_features=0.2, max_samples=0.5,
                       n_estimators=64)

In [40]:
y_pred2=best_grid.predict(x_test)
accuracy_score(y_test,y_pred2)

0.875

In [41]:
best_grid.fit(x,y)

RandomForestClassifier(max_depth=5, max_features=0.2, max_samples=0.5,
                       n_estimators=64)

In [42]:
y_pred3=best_grid.predict(test_df)
y_pred3

array([0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1])

In [43]:
sample_df.head(1)

Unnamed: 0,User ID,Purchased
0,15657163,0


In [44]:
submission=pd.DataFrame({'User ID':sample_df['User ID'],'Purchased':y_pred3})

In [45]:
submission.head()

Unnamed: 0,User ID,Purchased
0,15657163,0
1,15577514,1
2,15807909,0
3,15646091,1
4,15730688,0


In [46]:
submission.to_csv('sub.csv',index=None)