<a href="https://colab.research.google.com/github/shinnew99/Apziva-Projects/blob/main/Project1_3rdNote.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install lazypredict



In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import warnings

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE
from xgboost import XGBClassifier
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
from hyperopt.pyll.base import scope
from lazypredict.Supervised import LazyClassifier

warnings.filterwarnings('ignore')

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [4]:
# Read dataset
df = pd.read_csv('/content/drive/MyDrive/Apziva/ACME-HappinessSurvey2020.csv')

data = df[['X1', 'X2', 'X3', 'X4', 'X5', 'X6']]
target = df[['Y']]

In [5]:
# seed = random.randint(1000, 9999)
seed = 6245
print(seed)  # 6245, XGB - racll (0.88) for class 0

# run quite a few times, monitor each time to find out better seeds and whether it impacts higher perfermance on class 0, recall

6245


In [6]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=seed)

In [7]:
# LazyClassifier
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
print(models)

100%|██████████| 29/29 [00:01<00:00, 17.10it/s]

[LightGBM] [Info] Number of positive: 51, number of negative: 49
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000047 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31
[LightGBM] [Info] Number of data points in the train set: 100, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.510000 -> initscore=0.040005
[LightGBM] [Info] Start training from score 0.040005
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LabelPropagation                   0.69               0.71     0.71      0.70   
LabelSpreading                     0.69               0.71     0.71      0.70   
BernoulliNB                        0.62               0.69     0.69      0.62   
LGBMClassifier                     0.58               0.66     0.66      0.58   
DecisionTreeClassifier           




In [8]:
# Recursive Feature Elimination (RFE)
def perform_rfe(model, X_train, y_train, k):
    rfe = RFE(estimator=model, n_features_to_select=k)
    fit = rfe.fit(X_train, y_train.values.ravel())
    return fit

In [9]:
# Top-k features based on RFE
rfe_model = LogisticRegression()  # make sure I'm using the same seeds for the models
fit = perform_rfe(rfe_model, X_train, y_train, 3)  # selecting top 3 features for simplicity  # I need to print out the name of the features, so that I can recommend to the companies
# print out the ranking
X_train_rfe = fit.transform(X_train)
X_test_rfe = fit.transform(X_test)

In [10]:
# Hyperparameter tuning using HyperOpt for XGBClassifier  # Also provide the same seeds  #
def hyperopt_train_test(params):
    clf = XGBClassifier(**params)
    return cross_val_score(clf, X_train_rfe, y_train.values.ravel(), scoring='recall').mean()

space4xgb = {
    'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)),
    'n_estimators': scope.int(hp.quniform('n_estimators', 10, 200, 1)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2)
}

In [11]:
def f(params):
    acc = hyperopt_train_test(params)
    return {'loss': -acc, 'status': STATUS_OK}

trials = Trials()
best = fmin(f, space4xgb, algo=tpe.suggest, max_evals=50, trials=trials)
print('Best hyperparameters:', best)

100%|██████████| 50/50 [00:26<00:00,  1.87trial/s, best loss: -0.7090909090909092]
Best hyperparameters: {'learning_rate': 0.013107676729493896, 'max_depth': 1.0, 'n_estimators': 79.0}


In [12]:
# Train XGBClassifier with best hyperparameters
best_params = {
    'max_depth': int(best['max_depth']),
    'n_estimators': int(best['n_estimators']),
    'learning_rate': best['learning_rate']
}

In [13]:
model_xgb = XGBClassifier(**best_params)
model_xgb.fit(X_train_rfe, y_train.values.ravel())
predictions_xgb = model_xgb.predict(X_test_rfe)

In [14]:
# Evaluate XGBClassifier
accuracy_xgb = accuracy_score(y_test, predictions_xgb)
conf_matrix_xgb = confusion_matrix(y_test, predictions_xgb)
class_report_xgb = classification_report(y_test, predictions_xgb)

In [15]:
print(f'XGBClassifier Accuracy: {accuracy_xgb}')
print('XGBClassifier Confusion Matrix:')
print(conf_matrix_xgb)
print('XGBClassifier Classification Report:')
print(class_report_xgb)

XGBClassifier Accuracy: 0.6153846153846154
XGBClassifier Confusion Matrix:
[[7 1]
 [9 9]]
XGBClassifier Classification Report:
              precision    recall  f1-score   support

           0       0.44      0.88      0.58         8
           1       0.90      0.50      0.64        18

    accuracy                           0.62        26
   macro avg       0.67      0.69      0.61        26
weighted avg       0.76      0.62      0.62        26



In [16]:
# Stacking Classifier
estimators = [
    ('logreg', LogisticRegression(random_state=seed)),
    ('knn', KNeighborsClassifier()),
    ('xgb', XGBClassifier(**best_params))
]

In [17]:
# Stacking Classifier
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
stacking_clf.fit(X_train_rfe, y_train.values.ravel())
predictions_stack = stacking_clf.predict(X_test_rfe)

In [18]:
# Evaluate Stacking Classifier
accuracy_stack = accuracy_score(y_test, predictions_stack)
conf_matrix_stack = confusion_matrix(y_test, predictions_stack)
class_report_stack = classification_report(y_test, predictions_stack)

In [19]:
print(f'Stacking Classifier Accuracy: {accuracy_stack}')
print('Stacking Classifier Confusion Matrix:')
print(conf_matrix_stack)
print('Stacking Classifier Classification Report:')
print(class_report_stack)

Stacking Classifier Accuracy: 0.46153846153846156
Stacking Classifier Confusion Matrix:
[[ 4  4]
 [10  8]]
Stacking Classifier Classification Report:
              precision    recall  f1-score   support

           0       0.29      0.50      0.36         8
           1       0.67      0.44      0.53        18

    accuracy                           0.46        26
   macro avg       0.48      0.47      0.45        26
weighted avg       0.55      0.46      0.48        26



In [20]:
# Voting Classifier - Soft Voting
voting_clf_soft = VotingClassifier(estimators=estimators, voting='soft')
voting_clf_soft.fit(X_train_rfe, y_train.values.ravel())
predictions_vote_soft = voting_clf_soft.predict(X_test_rfe)

In [21]:
# Evaluate Soft Voting Classifier
accuracy_vote_soft = accuracy_score(y_test, predictions_vote_soft)
conf_matrix_vote_soft = confusion_matrix(y_test, predictions_vote_soft)
class_report_vote_soft = classification_report(y_test, predictions_vote_soft)

In [22]:
print(f'Voting Classifier (Soft) Accuracy: {accuracy_vote_soft}')
print('Voting Classifier (Soft) Confusion Matrix:')
print(conf_matrix_vote_soft)
print('Voting Classifier (Soft) Classification Report:')
print(class_report_vote_soft)

Voting Classifier (Soft) Accuracy: 0.5
Voting Classifier (Soft) Confusion Matrix:
[[ 3  5]
 [ 8 10]]
Voting Classifier (Soft) Classification Report:
              precision    recall  f1-score   support

           0       0.27      0.38      0.32         8
           1       0.67      0.56      0.61        18

    accuracy                           0.50        26
   macro avg       0.47      0.47      0.46        26
weighted avg       0.55      0.50      0.52        26



In [23]:
# Voting Classifier - Soft Voting
voting_clf_hard = VotingClassifier(estimators=estimators, voting='hard')
voting_clf_hard.fit(X_train_rfe, y_train.values.ravel())
predictions_vote_hard = voting_clf_hard.predict(X_test_rfe)

In [24]:
# Evaluate Soft Voting Classifier
accuracy_vote_hard = accuracy_score(y_test, predictions_vote_hard)
conf_matrix_vote_hard = confusion_matrix(y_test, predictions_vote_hard)
class_report_vote_hard = classification_report(y_test, predictions_vote_hard)

In [25]:
print(f'Voting Classifier (Hard) Accuracy: {accuracy_vote_hard}')
print('Voting Classifier (Hard) Confusion Matrix:')
print(conf_matrix_vote_hard)
print('Voting Classifier (Hard) Classification Report:')
print(class_report_vote_hard)

Voting Classifier (Hard) Accuracy: 0.46153846153846156
Voting Classifier (Hard) Confusion Matrix:
[[ 4  4]
 [10  8]]
Voting Classifier (Hard) Classification Report:
              precision    recall  f1-score   support

           0       0.29      0.50      0.36         8
           1       0.67      0.44      0.53        18

    accuracy                           0.46        26
   macro avg       0.48      0.47      0.45        26
weighted avg       0.55      0.46      0.48        26

