In [2]:
import numpy as np, pandas as pd, seaborn as sns

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [7]:
# Read the ads dataset (path is relative to the working directory) and display it.
df = pd.read_csv(filepath_or_buffer='ads_data.csv')
df

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [8]:
# Number of missing values in each column.
df.isnull().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [9]:
# Print the column names, non-null counts, dtypes and memory usage of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [10]:
# Look at the 'User ID' column on its own.
df.loc[:, 'User ID']

0      15624510
1      15810944
2      15668575
3      15603246
4      15804002
         ...   
395    15691863
396    15706071
397    15654296
398    15755018
399    15594041
Name: User ID, Length: 400, dtype: int64

In [11]:
# Frequency of each Gender category.
df.Gender.value_counts()

Gender
Female    204
Male      196
Name: count, dtype: int64

In [12]:
# Raw Gender values before they are encoded as integers.
df['Gender']

0        Male
1        Male
2      Female
3      Female
4        Male
        ...  
395    Female
396      Male
397    Female
398      Male
399    Female
Name: Gender, Length: 400, dtype: object

In [14]:
# Turn on the pandas 'future.no_silent_downcasting' option before the
# Gender replace() cell that follows.
pd.set_option('future.no_silent_downcasting', True)

In [15]:
# Encode Gender as integers (Male -> 0, Female -> 1).
# The result is assigned back to the column instead of calling
# replace(inplace=True) on the df['Gender'] selection: the in-place form is a
# chained assignment on an intermediate object, which pandas warns about and
# which leaves df unchanged under Copy-on-Write.
# astype('int64') pins the dtype independently of the downcasting option and
# keeps the cell safe to re-run (replace is then a no-op on the integer column).
df['Gender'] = df['Gender'].replace({'Male': 0, 'Female': 1}).astype('int64')

In [16]:
# Remove the 'User ID' column before modelling.
# Reassigning (rather than inplace=True) together with errors='ignore' makes
# the cell idempotent: re-running it after the column is already gone no
# longer raises KeyError.
df = df.drop(columns='User ID', errors='ignore')

In [17]:
# Re-check the frame after encoding Gender and dropping 'User ID'.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Gender           400 non-null    int64
 1   Age              400 non-null    int64
 2   EstimatedSalary  400 non-null    int64
 3   Purchased        400 non-null    int64
dtypes: int64(4)
memory usage: 12.6 KB


In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,0.51,37.655,69742.5,0.3575
std,0.500526,10.482877,34096.960282,0.479864
min,0.0,18.0,15000.0,0.0
25%,0.0,29.75,43000.0,0.0
50%,1.0,37.0,70000.0,0.0
75%,1.0,46.0,88000.0,1.0
max,1.0,60.0,150000.0,1.0


### RF Classifier

In [28]:
# Classification setup: 'Purchased' is the target, all other columns are features.
y = df['Purchased']
X = df.drop(columns='Purchased')

In [29]:
# First five rows of the feature matrix.
X.head(5)

Unnamed: 0,Gender,Age,EstimatedSalary
0,0,19,19000
1,0,35,20000
2,1,26,43000
3,1,27,57000
4,0,19,76000


In [30]:
# Display the target Series ('Purchased').
y

0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    0
399    1
Name: Purchased, Length: 400, dtype: int64

In [33]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

#### Train Random Forest Classifier

In [44]:
# Fit a seeded random forest on the training split; fit() returns the
# estimator itself, so the trailing expression shows the same object.
rf_clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_clf

In [45]:
# Accuracy of the random forest on the held-out test split.
y_pred_test = rf_clf.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_test)
print('Testing Data Accuarcy is', test_accuracy)

Testing Data Accuarcy is 0.9083333333333333


In [46]:
# Accuracy of the random forest on the data it was trained on
# (compare with the test accuracy to gauge over-fitting).
y_pred_train = rf_clf.predict(X_train)
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_pred_train)
print('Training Data Accuarcy is', train_accuracy)

Training Data Accuarcy is 0.9964285714285714


### RF Regressor

In [38]:
X = df.drop('EstimatedSalary', axis=1)
y = df['EstimatedSalary']

In [39]:
rf_reg = RandomForestRegressor(random_state=42)
rf_reg.fit(X_train, y_train)

In [40]:
#Test Data Accuarcy
y_pred_test = rf_reg.predict(X_test)
test_accuracy = r2_score(y_test, y_pred_test)
print('Testing Data Accuarcy is', test_accuracy)

Testing Data Accuarcy is 0.7040003672398718


In [41]:
#Train Data Accuarcy
y_pred_train = rf_reg.predict(X_train)
train_accuracy = r2_score(y_train, y_pred_train)
print('Training Data Accuarcy is', train_accuracy)

Training Data Accuarcy is 0.9343836769072061


### DT Classifier

In [43]:
# Point X / y back at the classification problem (target: 'Purchased').
y = df['Purchased']
X = df.drop(columns='Purchased')

In [56]:
# Entropy-based decision tree that only splits nodes holding at least 5 samples.
# fit() returns the estimator, so the trailing expression shows the same object.
dt_clf = DecisionTreeClassifier(
    criterion='entropy',
    min_samples_split=5,
    random_state=50,
).fit(X_train, y_train)
dt_clf

In [57]:
# Accuracy of the decision tree on the held-out test split.
y_pred_test_dt = dt_clf.predict(X_test)
test_accuracy_dt = accuracy_score(y_true=y_test, y_pred=y_pred_test_dt)
print('Testing Data Accuarcy is', test_accuracy_dt)

Testing Data Accuarcy is 0.8583333333333333


In [58]:
# Accuracy of the decision tree on its own training split.
y_pred_train_dt = dt_clf.predict(X_train)
train_accuracy_dt = accuracy_score(y_true=y_train, y_pred=y_pred_train_dt)
print('Training Data Accuarcy is', train_accuracy_dt)

Training Data Accuarcy is 0.9607142857142857


### GridSearchCV (Decision Tree hyperparameter tuning)

In [59]:
# Exhaustive grid search (5-fold cross-validation) over decision-tree settings:
# split criterion, maximum depth, and the minimum sample counts required to
# split a node / to form a leaf (each numeric range covers 2..9).
hyperparameters = {'criterion' : ["gini", "entropy"],
                   'max_depth' : np.arange(2,10),
                   'min_samples_split': np.arange(2,10),
                   "min_samples_leaf" : np.arange(2,10)
                  }
# Seed the base estimator: an unseeded tree can break ties between equally good
# splits differently on each run, which makes best_params_ unrepeatable.
dt_model = DecisionTreeClassifier(random_state=42)
gscv_dt_model = GridSearchCV(dt_model, hyperparameters, cv=5)
gscv_dt_model.fit(X_train, y_train)

In [60]:
# Hyperparameter combination selected by the grid search.
gscv_dt_model.best_params_

{'criterion': 'gini',
 'max_depth': 6,
 'min_samples_leaf': 3,
 'min_samples_split': 4}

In [61]:
# Rebuild the tree from whatever the grid search selected instead of
# hand-copying the values from its printed output, so this cell cannot drift
# out of sync when the search is re-run. The seed makes the refit repeatable.
best_model = DecisionTreeClassifier(random_state=42,
                                    **gscv_dt_model.best_params_)
best_model.fit(X_train, y_train)

In [63]:
# Accuracy of the tuned decision tree on the held-out test split.
y_pred_test_gscv = best_model.predict(X_test)
test_accuracy_gscv = accuracy_score(y_true=y_test, y_pred=y_pred_test_gscv)
print('Testing Data Accuarcy is', test_accuracy_gscv)

Testing Data Accuarcy is 0.8666666666666667


In [64]:
# Accuracy of the tuned decision tree on its own training split.
y_pred_train_gscv = best_model.predict(X_train)
train_accuracy_gscv = accuracy_score(y_true=y_train, y_pred=y_pred_train_gscv)
print('Training Data Accuarcy is', train_accuracy_gscv)

Training Data Accuarcy is 0.9392857142857143
