In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import (accuracy_score, confusion_matrix, recall_score, 
                             roc_auc_score, roc_curve, classification_report, precision_score,f1_score,
                             ConfusionMatrixDisplay, RocCurveDisplay)
# Load the training set and drop the "id" column so it never reaches the
# feature matrix. `columns=` already selects the axis, so the redundant
# `axis=1` (ignored by pandas when `columns=` is given) is omitted.
df = pd.read_csv('data/train.csv').drop(columns='id')

: 

In [None]:
from sklearn.preprocessing import (OneHotEncoder,StandardScaler, OrdinalEncoder, LabelEncoder)

# Name of the label column; every other column is treated as a feature.
TARGET_COLUMN = "NObeyesdad"

label_encoder = LabelEncoder()

# Split columns by dtype *before* the target is label-encoded below.
numerical_columns   = [col for col in df.columns if df[col].dtype != 'O' and col != TARGET_COLUMN]
categorical_columns = [col for col in df.columns if df[col].dtype == 'O']

# Compare names with equality. The previous `i not in "NObeyesdad"` was a
# substring test against the string, so any column whose name happened to be a
# fragment of the target name (e.g. "N") would have been dropped from the
# features and added to the outputs.
categorical_columns_ = [col for col in categorical_columns if col != TARGET_COLUMN]
# Built from the constant rather than from the dtype scan so the list stays
# correct when this cell is re-run after the target has become numeric.
output_columns = [TARGET_COLUMN]

# Replace the class labels with integer codes (in place on df).
# NOTE(review): re-running this cell re-fits the encoder on the already encoded
# integers, so label_encoder.classes_ no longer holds the original labels.
df[TARGET_COLUMN] = label_encoder.fit_transform(df[TARGET_COLUMN])

# Select features/target by name instead of assuming the target is the last column.
independent_column = df.drop(columns=output_columns)
dependent_column = df[output_columns]

In [None]:
# Show the list of object-dtype column names collected in categorical_columns.
categorical_columns

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Quick sanity check: first row of independent_column and first two rows of
# dependent_column, rendered one after the other.
display(independent_column.head(1),
        dependent_column.head(2))

In [None]:
from sklearn.preprocessing import (OneHotEncoder,StandardScaler, OrdinalEncoder, LabelEncoder)
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer      = OneHotEncoder(handle_unknown="ignore")
# OrdinalEncoder only accepts 'error' or 'use_encoded_value' for handle_unknown;
# the previous 'ignore' would raise as soon as the encoder was fitted.
# The encoder is not wired into the ColumnTransformer below.
ordinal_encoder     = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder",  oh_transformer, categorical_columns_),
        ("StandardScaler", numeric_transformer, numerical_columns),
    ],
    # Always return a dense array: the result is wrapped in pd.DataFrame(...)
    # downstream, which does not handle a scipy sparse matrix correctly.
    sparse_threshold=0,
)

# Read the features from df (selected by name) rather than from
# independent_column, so this cell can be re-run without feeding the
# already-transformed array back into the transformer.
# NOTE(review): the transformer is fitted on the full dataset before the
# train/test split, so scaling statistics and categories leak from the test
# rows. Fitting on the training rows only (e.g. inside a Pipeline) avoids this.
independent_column = preprocessor.fit_transform(df[categorical_columns_ + numerical_columns])

In [None]:
# Put the transformed features and the encoded target side by side.
# concat(axis=1) aligns on the index, so the target's index is reset *before*
# concatenating; resetting afterwards (as before) cannot repair rows that were
# already misaligned and filled with NaN.
final_df = pd.concat(
    [pd.DataFrame(independent_column), dependent_column.reset_index(drop=True)],
    axis=1,
)

In [None]:
# Peek at the first row of the combined feature/target frame.
final_df.head(1)

In [None]:
from sklearn.model_selection import train_test_split

# Features: every column except the last. Target: the last column.
x, y = final_df.iloc[:, :-1], final_df.iloc[:, -1]

# Hold out 23% of the rows for testing, with a fixed seed for repeatability.
# NOTE(review): the split is not stratified; consider stratify=y if the class
# balance of the two partitions matters.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.23,
    random_state=42,
)

# Report the shape of each partition.
print("-" * 79)
partition_shapes = {
    "x_train shape": x_train.shape,
    "y_train shape": y_train.shape,
    "x_test shape": x_test.shape,
    "y_test shape": y_test.shape,
}
for partition_label, partition_shape in partition_shapes.items():
    print(partition_label, partition_shape)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import(RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier)
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
print(pd.__version__)
# DataFrame.iteritems was removed in pandas 2.0. Either pin pandas 1.5.3 or
# restore the old name as an alias of DataFrame.items, as done here.
# The alias is installed only when the attribute is missing, so an older pandas
# keeps its own implementation instead of being silently overwritten.
# NOTE(review): presumably one of the libraries imported in this notebook still
# calls iteritems - confirm which one and drop the shim once it is upgraded.
if not hasattr(pd.DataFrame, "iteritems"):
    pd.DataFrame.iteritems = pd.DataFrame.items


In [None]:
def evaluate_model(true, predicted, average='macro'):
    """Score a set of class predictions against the ground truth.

    Parameters
    ----------
    true : array-like
        Ground-truth class labels.
    predicted : array-like
        Predicted class labels, same length as ``true``.
    average : str, default 'macro'
        Averaging strategy forwarded to precision, recall and F1.
        The previous hard-coded ``'micro'`` is still available, but for a
        single-label multiclass problem micro-averaged precision, recall and
        F1 are all equal to accuracy, so the four numbers carried no extra
        information. ``'macro'`` weighs every class equally.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)`` in that order.
    """
    AccuracyScore = accuracy_score(true, predicted)
    # zero_division=0 keeps the value scikit-learn would use anyway for a class
    # with no predicted/true samples, without emitting a warning per call.
    PrecisionScore = precision_score(true, predicted, average=average, zero_division=0)
    RecallScore = recall_score(true, predicted, average=average, zero_division=0)
    F1Score = f1_score(true, predicted, average=average, zero_division=0)
    return (AccuracyScore, PrecisionScore, RecallScore, F1Score)

In [None]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Filter warnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Seed shared by every stochastic estimator so that Restart & Run All
# reproduces the same scores (same value as the train/test split).
RANDOM_STATE = 42

# Baseline estimators with (mostly) default settings, keyed by display name.
models = {
    # `multi_class` is deprecated since scikit-learn 1.5; the default lbfgs
    # solver already fits a multinomial model for a multiclass target.
    # max_iter is raised because convergence warnings are silenced above.
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "AdaBoost": AdaBoostClassifier(random_state=RANDOM_STATE),
    # verbose=False matches the tuning cell and avoids one log line per iteration.
    "CatBoost": CatBoostClassifier(iterations=100, learning_rate=0.1, loss_function='MultiClass',
                                   random_seed=RANDOM_STATE, verbose=False),
    "XGBoost": XGBClassifier(random_state=RANDOM_STATE),
    "SVM": SVC(),
    "K-Neighbors": KNeighborsClassifier(),
    "Gaussian Naive Bayes": GaussianNB(),
    "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis()
}

# Test-set accuracy per model name, consumed by the summary table cell.
accuracy_dictionary = {}

for k, model in models.items():
    print("Fitting :-->", k, "_Model")
    model.fit(x_train, y_train)
    y_train_predict = model.predict(x_train)
    y_test_predict  = model.predict(x_test)

    # Score the fitted model on both partitions to expose over-fitting.
    train_accuracy_score, train_precision_score, train_recall_score, train_f1_score = evaluate_model(y_train, y_train_predict)
    test_accuracy_score, test_precision_score, test_recall_score, test_f1_score = evaluate_model(y_test, y_test_predict)

    print("MODEL NAME :==>", k)
    print("Model Performance On The Training Set ")
    print("- TRAIN ACCURACY SCORE :->", train_accuracy_score)
    print("- TRAIN PRECISION SCORE :->", train_precision_score)
    print("- TRAIN RECALL SCORE :->", train_recall_score)
    print("- TRAIN F1 SCORE :->", train_f1_score)

    print("\n","-"*35)
    print("MODEL PERFORMANCE ON THE TEST DATA :->")
    print("- TEST ACCURACY SCORE :->", test_accuracy_score)
    print("- TEST PRECISION SCORE :->", test_precision_score)
    print("- TEST RECALL SCORE :->", test_recall_score)
    print("- TEST F1 SCORE :->", test_f1_score)

    accuracy_dictionary[k] = test_accuracy_score

    print('='*65)
    print('\n')

    

In [None]:
# Rank the models by test accuracy, best first.
(
    pd.DataFrame.from_dict(accuracy_dictionary, orient='index', columns=['Accuracy'])
    .sort_values(by="Accuracy", ascending=False)
)


In [None]:
import math
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import RandomizedSearchCV

# Filter warnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Seed shared by the estimators and the search so a re-run samples the same
# candidates and reproduces the same scores.
RANDOM_STATE = 42

# Untuned estimators, keyed by display name; the search clones and tunes them.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "AdaBoost": AdaBoostClassifier(random_state=RANDOM_STATE),
    "CatBoost": CatBoostClassifier(loss_function='MultiClass', verbose=False, random_seed=RANDOM_STATE),
    "XGBoost": XGBClassifier(random_state=RANDOM_STATE),
    "SVM": SVC(),
    "K-Neighbors": KNeighborsClassifier(),
    "Gaussian Naive Bayes": GaussianNB(),
    "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis()
}

# Candidate values per model (same keys as `models`). An empty dict means the
# model is only cross-validated with its default settings.
hyperparameters = {
    "Logistic Regression": {"C": [0.1, 1, 10]},
    "Decision Tree": {"max_depth": [None, 10, 20], "min_samples_split": [2, 5, 10]},
    "Random Forest": {"n_estimators": [50, 100, 150], "max_depth": [None, 10, 20], "min_samples_split": [2, 5, 10]},
    "Gradient Boosting": {"n_estimators": [50, 100, 150], "learning_rate": [0.01, 0.1, 0.5]},
    "AdaBoost": {"n_estimators": [50, 100, 150], "learning_rate": [0.01, 0.1, 0.5]},
    "CatBoost": {"iterations": [50, 100, 150], "learning_rate": [0.01, 0.1, 0.5]},
    "XGBoost": {"n_estimators": [50, 100, 150], "learning_rate": [0.01, 0.1, 0.5]},
    "SVM": {"C": [0.1, 1, 10], "kernel": ['linear', 'rbf']},
    "K-Neighbors": {"n_neighbors": [3, 5, 7], "weights": ['uniform', 'distance']},
    "Gaussian Naive Bayes": {},
    "Quadratic Discriminant Analysis": {}
}

# Test-set accuracy of the tuned models, consumed by the summary table cell.
# NOTE(review): this replaces the baseline accuracies stored under the same name.
accuracy_dictionary = {}

for k, model in models.items():
    print("Tuning :-->", k)

    # Never ask for more candidates than the grid holds; math.prod of an empty
    # sequence is 1, which covers the models without tunable values.
    grid_size = math.prod(len(values) for values in hyperparameters[k].values())
    random_search = RandomizedSearchCV(
        model,
        hyperparameters[k],
        cv=5,
        n_iter=min(10, grid_size),
        random_state=RANDOM_STATE,
    )
    random_search.fit(x_train, y_train)

    # With the default refit=True the search has already refitted the best
    # candidate on all of x_train, so no second fit is needed.
    best_model = random_search.best_estimator_
    print("Best parameters :->", random_search.best_params_)

    y_train_predict = best_model.predict(x_train)
    y_test_predict = best_model.predict(x_test)

    # Score the tuned model on both partitions to expose over-fitting.
    train_accuracy_score, train_precision_score, train_recall_score, train_f1_score = evaluate_model(y_train,
                                                                                                      y_train_predict)
    test_accuracy_score, test_precision_score, test_recall_score, test_f1_score = evaluate_model(y_test,
                                                                                                  y_test_predict)

    print("MODEL NAME :==>", k)
    print("Model Performance On The Training Set ")
    print("- TRAIN ACCURACY SCORE :->", train_accuracy_score)
    print("- TRAIN PRECISION SCORE :->", train_precision_score)
    print("- TRAIN RECALL SCORE :->", train_recall_score)
    print("- TRAIN F1 SCORE :->", train_f1_score)

    print("\n", "-" * 35)
    print("MODEL PERFORMANCE ON THE TEST DATA :->")
    print("- TEST ACCURACY SCORE :->", test_accuracy_score)
    print("- TEST PRECISION SCORE :->", test_precision_score)
    print("- TEST RECALL SCORE :->", test_recall_score)
    print("- TEST F1 SCORE :->", test_f1_score)

    accuracy_dictionary[k] = test_accuracy_score

    print('=' * 65)
    print('\n')


In [None]:
# Rank the tuned models by test accuracy, best first.
(
    pd.DataFrame.from_dict(accuracy_dictionary, orient='index', columns=['Accuracy'])
    .sort_values(by="Accuracy", ascending=False)
)
