In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Load suv_data.csv into a DataFrame. The path is relative to the notebook's
# working directory (presumably the Kaggle working dir — confirm when running elsewhere).
data = pd.read_csv("../input/suv-data/suv_data.csv")
# Preview the first rows.
data.head()

In [None]:
# Drop the "User ID" column.
# Reassigning instead of using inplace=True, together with errors="ignore",
# keeps this cell safe to re-run: the original in-place drop raised KeyError
# on a second execution because the column was already gone.
# `columns=` already selects the axis, so the redundant `axis=1` is removed.
data = data.drop(columns=["User ID"], errors="ignore")

In [None]:
# (rows, columns) of the frame.
data.shape

In [None]:
# Summary statistics of the columns.
data.describe()

In [None]:
# Number of distinct values in each column.
data.nunique()

In [None]:
# Count of rows per value of the "Purchased" column (the target used later as y).
data["Purchased"].value_counts()

In [None]:
# Number of missing values per column.
data.isna().sum()

In [None]:
# Work on a copy so that later column drops on `df` leave `data` untouched.
df = data.copy()

In [None]:
# Pairwise correlation of the numeric columns only.
# `df` still contains "Gender" at this point; if that column holds strings
# (presumably "Male"/"Female" — confirm), a bare `df.corr()` raises on
# pandas >= 2.0, where non-numeric columns are no longer dropped silently.
# Selecting numeric dtypes explicitly behaves the same on old and new pandas.
df.select_dtypes(include="number").corr()

In [None]:
# Drop the "Gender" column so that only the remaining columns feed the models.
# Reassigning with errors="ignore" (instead of an in-place drop) makes the cell
# idempotent: re-running it no longer raises KeyError once the column is gone.
df = df.drop(columns=["Gender"], errors="ignore")

In [None]:
# Preview the frame after the "Gender" column has been dropped.
df.head()

In [None]:
# Splitting the train test data
from sklearn.model_selection import train_test_split

In [None]:
# Target vector: the "Purchased" column.
y = df["Purchased"]
# Feature matrix: every remaining column.
X = df.drop(columns=["Purchased"])

In [None]:
# Scaling the data
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardizer with default settings; it is fitted in a later cell.
scale = StandardScaler()

In [None]:
# Split first, then scale.
# The original fitted the scaler on ALL rows before splitting, so the mean and
# standard deviation used for scaling were computed with the test rows included
# (data leakage). Here the scaler is fitted on the training split only and the
# learned statistics are then applied to the test split.
# The split itself is unchanged: same test_size and random_state on the same
# number of rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=9
)
X_train = scale.fit_transform(X_train_raw)  # learn scaling parameters from train only
X_test = scale.transform(X_test_raw)        # reuse the train statistics on test

## Random Forest model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 500 trees; random_state is fixed so results are repeatable.
clf = RandomForestClassifier(n_estimators=500, random_state=9)

In [None]:
# Train the random forest on the training split.
clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [None]:
# Random-forest predictions for the training split and the test split.
y_pred_tr, y_pred = clf.predict(X_train), clf.predict(X_test)

In [None]:
# Accuracy, precision and recall of the training-split predictions.
accuracy, precision, recall = (
    metric(y_train, y_pred_tr)
    for metric in (accuracy_score, precision_score, recall_score)
)

In [None]:
# Report the training metrics computed in the previous cell.
for label, score in (
    ("Accuracy for train: ", accuracy),
    ("Precision for train: ", precision),
    ("Recall for train: ", recall),
):
    print(label, score)

In [None]:
# Confusion matrix of the random forest on the test split.
# scikit-learn expects (y_true, y_pred); the original passed them swapped,
# which transposes the matrix (false positives and false negatives trade places).
# With this order, rows are true classes and columns are predicted classes.
confusion_matrix(y_test, y_pred)

In [None]:
# Accuracy, precision and recall of the test-split predictions.
accuracy, precision, recall = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)

In [None]:
# Report the test metrics computed in the previous cell.
for label, score in (
    ("Accuracy for test: ", accuracy),
    ("Precision for test: ", precision),
    ("Recall for test: ", recall),
):
    print(label, score)

## Logistic Regression model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression allowing up to 500 solver iterations, with a fixed random_state.
lr_classifier = LogisticRegression(max_iter=500, random_state=9)

In [None]:
# Train the logistic regression on the training split.
lr_classifier.fit(X_train, y_train)

In [None]:
# Training-split predictions of the logistic regression.
# The original called `clf.predict`, i.e. the random forest fitted earlier, so
# the "train" metrics reported in this section were not the logistic
# regression's at all.
y_pred_tr = lr_classifier.predict(X_train)

In [None]:
# Accuracy, precision and recall of `y_pred_tr` against the training labels.
accuracy, precision, recall = (
    metric(y_train, y_pred_tr)
    for metric in (accuracy_score, precision_score, recall_score)
)

In [None]:
# Report the training metrics computed in the previous cell.
for label, score in (
    ("Accuracy for train: ", accuracy),
    ("Precision for train: ", precision),
    ("Recall for train: ", recall),
):
    print(label, score)

In [None]:
# Logistic-regression predictions on the test split.
pred_ts = lr_classifier.predict(X_test)

In [None]:
# Confusion matrix of the logistic regression on the test split.
# Arguments are now in scikit-learn's (y_true, y_pred) order; the original had
# them swapped, which transposes the matrix.
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_test, pred_ts)

In [None]:
# Accuracy, precision and recall of the logistic regression on the test split.
# The original passed `labels=np.unique(pred_ts)` to precision/recall. With the
# default average='binary' that argument is ignored, so it only suggested a
# filtering that never happened. It is dropped so this section matches the
# metric calls used for the other models.
accuracy = accuracy_score(y_test, pred_ts)
precision = precision_score(y_test, pred_ts)
recall = recall_score(y_test, pred_ts)

In [None]:
# Report the test metrics computed in the previous cell.
for label, score in (
    ("Accuracy for test: ", accuracy),
    ("Precision for test: ", precision),
    ("Recall for test: ", recall),
):
    print(label, score)

## XGBoost model

In [None]:
from xgboost.sklearn import XGBClassifier

In [None]:
# XGBoost classifier with library-default hyperparameters.
# NOTE(review): despite the `_reg` suffix this is a classifier, not a regressor.
xg_reg = XGBClassifier(use_label_encoder=False) 

In [None]:
# Train the default XGBoost classifier on the training split.
xg_reg.fit(X_train, y_train) 

In [None]:
# Predictions of the default XGBoost classifier on the test split.
y_pred_xg = xg_reg.predict(X_test) 

In [None]:
# Confusion matrix of the default XGBoost classifier on the test split.
# Arguments are now in scikit-learn's (y_true, y_pred) order; the original had
# them swapped, which transposes the matrix.
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_test, y_pred_xg)

In [None]:
# Accuracy, precision and recall of the default XGBoost model on the test split.
accuracy, precision, recall = (
    metric(y_test, y_pred_xg)
    for metric in (accuracy_score, precision_score, recall_score)
)

In [None]:
# Report the test metrics computed in the previous cell.
for label, score in (
    ("Accuracy for test: ", accuracy),
    ("Precision for test: ", precision),
    ("Recall for test: ", recall),
):
    print(label, score)

### Hyperparameter tuning of XGBoost Classifier

In [None]:
# Base estimator handed to the randomized hyperparameter search below.
# NOTE(review): this rebinds `clf`, which until now held the fitted
# RandomForestClassifier. Any earlier cell that uses `clf` and is re-run after
# this point will see this XGBoost estimator instead.
clf = XGBClassifier(objective="binary:logistic",use_label_encoder=False)

In [None]:
# Candidate values for the `booster` and `base_score` search dimensions.
base_score = [0.1, 0.3, 0.5]
booster = ['gbtree']

In [None]:
# Candidate values for the number of trees, tree depth and learning rate.
learning_rate = [0.1, 0.2]
max_depth = [4, 5]
n_estimators = [100, 200]

In [None]:
# Search space: maps each XGBClassifier parameter name to its list of candidates.
hyperparameter_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    booster=booster,
    base_score=base_score,
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Randomized search over `hyperparameter_grid`: 10 sampled parameter
# combinations, each scored by accuracy with 3-fold cross-validation.
# The sampling is seeded so the same candidates are drawn on every run.
random_cv = RandomizedSearchCV(
    estimator=clf,
    param_distributions=hyperparameter_grid,
    n_iter=10,
    cv=3,
    scoring='accuracy',
    n_jobs=3,
    verbose=5,
    return_train_score=True,
    random_state=24,
)

In [None]:
# Run the search using the training split only.
random_cv.fit(X_train,y_train)

In [None]:
# Display the best estimator found by the search.
random_cv.best_estimator_

In [None]:
# Build the tuned classifier from the parameters the search actually selected.
# The original hard-coded a copy of a previously displayed `best_estimator_`.
# That copy silently goes stale whenever the data, the grid or the seed changes.
# NOTE(review): it also pinned version-specific constructor arguments (e.g.
# `gpu_id`) that may not be accepted by other XGBoost releases — confirm
# against the installed version.
# `objective` and `use_label_encoder` repeat the settings of the base estimator
# used for the search, so the tuned model differs only in the searched parameters.
classifier = XGBClassifier(
    objective="binary:logistic",
    use_label_encoder=False,
    **random_cv.best_params_,
)

In [None]:
# Train the tuned XGBoost classifier on the training split.
classifier.fit(X_train,y_train)

In [None]:
# Training-split predictions of the tuned XGBoost classifier.
y_pred_xgb1 = classifier.predict(X_train)

In [None]:
# Accuracy, precision and recall of the tuned model on the training split.
accuracy, precision, recall = (
    metric(y_train, y_pred_xgb1)
    for metric in (accuracy_score, precision_score, recall_score)
)

In [None]:
# Report the training metrics computed in the previous cell.
for label, score in (
    ("Accuracy for train: ", accuracy),
    ("Precision for train: ", precision),
    ("Recall for train: ", recall),
):
    print(label, score)

In [None]:
# Test-split predictions of the tuned XGBoost classifier.
# Note: this overwrites `y_pred`, which earlier held the random-forest test predictions.
y_pred = classifier.predict(X_test)

In [None]:
# Confusion matrix of the tuned XGBoost classifier on the test split.
# Arguments are now in scikit-learn's (y_true, y_pred) order; the original had
# them swapped, which transposes the matrix.
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_test, y_pred)

In [None]:
# Accuracy, precision and recall of the tuned model on the test split.
accuracy, precision, recall = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)

In [None]:
# Report the test metrics computed in the previous cell.
for label, score in (
    ("Accuracy for test: ", accuracy),
    ("Precision for test: ", precision),
    ("Recall for test: ", recall),
):
    print(label, score)