In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Importing the dataset
dataset = pd.read_csv('Social_Network_Ads.csv')
dataset.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19.0,19000.0,0
1,15810944,Male,35.0,20000.0,0
2,15668575,Female,26.0,43000.0,0
3,15603246,Female,27.0,57000.0,0
4,15804002,Male,19.0,76000.0,0


In [3]:
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

In [4]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [5]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [6]:
# Fitting Kernel SVM to the Training set
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

In [7]:
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)

0.92

In [8]:
# Check Overfitting and Underfitting

In [9]:
print("Training Score", rfc.score(X_train, y_train))
print("Testing Score", rfc.score(X_test, y_test))

Training Score 1.0
Testing Score 0.92


In [10]:
# Fine Tune the Model

In [10]:
rfc_1 = RandomForestClassifier(n_estimators= 10,criterion='gini',max_depth=None, n_jobs=-1, max_features='auto', random_state=123)
rfc_1.fit(X_train, y_train)
print("Training Score", rfc_1.score(X_train, y_train))
print("Testing Score", rfc_1.score(X_test, y_test))

Training Score 0.9866666666666667
Testing Score 0.92


In [11]:
rfc_2 = RandomForestClassifier(n_estimators= 100,criterion='entropy',max_depth=10, n_jobs=-1, max_features='sqrt', random_state=123)
rfc_2.fit(X_train, y_train)
print("Training Score", rfc_2.score(X_train, y_train))
print("Testing Score", rfc_2.score(X_test, y_test))

Training Score 0.9966666666666667
Testing Score 0.93


In [12]:
rfc_2 = RandomForestClassifier(n_estimators= 1000,criterion='entropy',max_depth=10, n_jobs=-1, max_features='sqrt', random_state=123)
rfc_2.fit(X_train, y_train)
print("Training Score", rfc_2.score(X_train, y_train))
print("Testing Score", rfc_2.score(X_test, y_test))

Training Score 0.9966666666666667
Testing Score 0.93


In [13]:
rfc_2 = RandomForestClassifier(n_estimators= 200,criterion='entropy',max_depth=20, n_jobs=-1, max_features='sqrt', random_state=123)
rfc_2.fit(X_train, y_train)
print("Training Score", rfc_2.score(X_train, y_train))
print("Testing Score", rfc_2.score(X_test, y_test))

Training Score 1.0
Testing Score 0.93


In [14]:
rfc_2 = RandomForestClassifier(n_estimators= 200,criterion='entropy',max_depth=30, n_jobs=-1, max_features='sqrt', random_state=123)
rfc_2.fit(X_train, y_train)
print("Training Score", rfc_2.score(X_train, y_train))
print("Testing Score", rfc_2.score(X_test, y_test))

Training Score 1.0
Testing Score 0.93


In [15]:
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = rfc, X = X_train, y = y_train, cv = 10)
print("Mean Accuracy",accuracies.mean())
print("Standard Deviation", accuracies.std())

Mean Accuracy 0.89
Standard Deviation 0.0650640709864771


In [16]:
# Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV
#parameters = {'n_estimator': [1, 10, 100, 1000], 'criterion': ['gini', 'entropy'], 'max_depth': [10,20,30]}

In [17]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 4)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 50, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [21]:
params = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [22]:
params

{'n_estimators': [10, 40, 70, 100],
 'max_features': ['auto', 'sqrt'],
 'max_depth': [10, 20, 30, 40, 50, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

In [19]:
grid_search = GridSearchCV(estimator = rfc, param_grid = params, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [27]:
# Without Cross Validation

In [23]:
grid_search = GridSearchCV(estimator = rfc, param_grid = params, n_jobs = -1, verbose = 2)

In [24]:
grid_search = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 864 candidates, totalling 4320 fits


In [25]:
best_accuracy = grid_search.best_score_
best_accuracy

0.9099999999999999

In [26]:
best_parameters = grid_search.best_params_
best_parameters


{'bootstrap': True,
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'n_estimators': 10}

In [28]:
import pickle

In [31]:
with open('bestparams.pkl', 'wb') as f:
    pickle.dump(best_parameters, f)