In [3]:
import pandas as pd
from IPython.display import display
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import missingno as msno
import seaborn as sns
import time
import os
import gc

from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_classification
from sklearn.svm import SVC

import xgboost as xgb

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import AUC

# Notebook-wide pandas display settings: two decimals, every column, at most 30 rows.
pd.set_option("display.precision", 2)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 30)

# Seed NumPy's global random generator.
np.random.seed(903949505)

# Thread counts requested from TensorFlow (inter-op and intra-op).
# NOTE(review): these are assigned after `import tensorflow` has already run;
# confirm TensorFlow still reads them at that point.
os.environ.update({
    "TF_NUM_INTEROP_THREADS": "16",
    "TF_NUM_INTRAOP_THREADS": "16",
})

In [2]:
class color:
    """ANSI escape sequences used to style printed text.

    The helpers in this notebook concatenate these attributes in front of a
    heading and append ``END`` after it.
    """

    # Text attributes
    BOLD = "\x1b[1m"
    UNDERLINE = "\x1b[4m"

    # Foreground colours
    DARKCYAN = "\x1b[36m"
    RED = "\x1b[91m"
    GREEN = "\x1b[92m"
    YELLOW = "\x1b[93m"
    BLUE = "\x1b[94m"
    PURPLE = "\x1b[95m"
    CYAN = "\x1b[96m"

    # Aliases that share a value with one of the colours above
    WARNING = YELLOW
    FAIL = RED

    # Terminators (both names hold the same sequence)
    ENDC = "\x1b[0m"
    END = ENDC

In [3]:
def eda(batmobile):
    """Print and plot a quick exploratory summary of a DataFrame.

    Shows, in order: the shape, the column dtypes, the first five rows,
    ``describe()`` statistics, the columns that contain missing values, the
    per-column count of extreme outliers (values more than 3 * IQR outside the
    quartiles), ``info()``, a missingno bar chart and a correlation heatmap of
    every column except the last one.

    Parameters
    ----------
    batmobile : pandas.DataFrame
        Frame to summarise. Outlier counts and correlations are computed on
        its numeric columns only.

    Returns
    -------
    None
        Everything is printed / displayed.

    Notes
    -----
    Calls ``sns.set(rc={'figure.figsize': (15, 10)})``, which changes plotting
    defaults for figures created afterwards.
    """
    gc.collect()

    n_rows, n_cols = batmobile.shape
    print(color.BOLD + color.UNDERLINE + color.GREEN + "Shape" + color.END)
    # shape is (rows, columns): report each count under the matching label.
    print("Number of columns are " + color.BOLD + str(n_cols) + color.END + " and number of rows are " + color.BOLD + str(n_rows) + color.END + "\n")

    print(color.BOLD + color.UNDERLINE + color.PURPLE + "Data types" + color.END)
    display(batmobile.dtypes)

    print(color.BOLD + color.UNDERLINE + color.DARKCYAN + "Sample rows - Top 5" + color.END)
    display(batmobile.head())

    print("\n" + "\n" + color.BOLD + color.UNDERLINE + color.WARNING + "EDA Statistics" + color.END)
    display(batmobile.describe())

    nan_count = batmobile.isna().sum()
    print("\n" + "\n" + color.BOLD + color.UNDERLINE + color.RED + "Missing values" + color.END)
    display(nan_count[nan_count > 0])

    print("\n" + "\n" + color.BOLD + color.UNDERLINE + color.BLUE + "Count of Outliers" + color.END)
    # Quantiles and the comparisons below are only defined for numeric data;
    # restricting the frame avoids errors on string/object columns.
    numeric = batmobile.select_dtypes(include="number")
    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1
    outliers = ((numeric < (q1 - 3 * iqr)) | (numeric > (q3 + 3 * iqr))).sum()
    display(outliers[outliers > 0])

    # Null / missing value overview.
    batmobile.info()
    msno.bar(batmobile)
    # Flush the missingno chart so the heatmap is not drawn on whatever axes
    # msno.bar left as the current one.
    plt.show()

    sns.set(rc={'figure.figsize': (15, 10)})
    # Correlation of all columns except the last one, numeric columns only.
    corr = batmobile.iloc[:, :-1].select_dtypes(include="number").corr()
    if not corr.empty:
        fig, ax = plt.subplots()
        sns.heatmap(corr, annot=True, cmap="YlGnBu", ax=ax)
        plt.show()

In [4]:
#split train and test dataset using sklearn
def train_test(vegito):
    """Split ``vegito`` into a train part and a test part (20% test, fixed seed).

    Returns
    -------
    tuple
        ``(train, test)`` as produced by ``train_test_split``.
    """
    parts = train_test_split(vegito, random_state=903949505, test_size=0.2)
    train_part, test_part = parts
    return train_part, test_part

In [5]:
#stratified sampling in sklearn
def stratified_sampling(vegito, target, size = 0.4, seed = 903949505):
    """Stratified train/test split with a printed report of the result.

    The frame is split with ``train_test_split`` stratified on ``target``. The
    class distribution (counts and percentages) of the original, train and
    test frames is displayed, then each split is separated into features and
    target and the shape and first rows of the four pieces are displayed.

    Parameters
    ----------
    vegito : pandas.DataFrame
        Full dataset, including the target column.
    target : str
        Name of the target column used for stratification.
    size : float, default 0.4
        Passed to ``train_test_split`` as ``test_size``.
    seed : int, default 903949505
        Passed to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(train_X, train_y, test_X, test_y)``; the two ``y`` pieces are
        single-column DataFrames.
    """
    gc.collect()
    goku, vegita = train_test_split(vegito, test_size=size, stratify=vegito[target], random_state=seed)

    heading_style = color.BOLD + color.UNDERLINE
    _show_class_distribution(vegito, target, heading_style + color.DARKCYAN + "Original distribution" + color.END)
    _show_class_distribution(goku, target, "\n" + "\n" + heading_style + color.WARNING + "Train distribution" + color.END)
    _show_class_distribution(vegita, target, "\n" + "\n" + heading_style + color.RED + "Test distribution" + color.END)

    # Separate the independent variables from the dependent (target) variable.
    goku_X = goku.drop(target, axis=1)
    vegita_X = vegita.drop(target, axis=1)
    goku_y = pd.DataFrame(goku[target])
    vegita_y = pd.DataFrame(vegita[target])

    _show_split_shape(goku_X, "Train_x shape")
    _show_split_shape(goku_y, "Train_y shape")
    # The test pieces are reported under their own headings.
    _show_split_shape(vegita_X, "Test_x shape")
    _show_split_shape(vegita_y, "Test_y shape")
    return goku_X, goku_y, vegita_X, vegita_y


def _show_class_distribution(frame, target, heading):
    """Print ``heading`` and display counts and percentages of ``frame[target]``."""
    print(heading)
    counts = frame[target].value_counts(normalize=False)
    percents = frame[target].value_counts(normalize=True) * 100
    display(pd.concat([counts, percents], axis=1, keys=['counts', '%']))


def _show_split_shape(frame, label):
    """Print a heading, the column/row counts of ``frame`` and display its first rows."""
    n_rows, n_cols = frame.shape  # shape is (rows, columns)
    print("\n" + "\n" + color.BOLD + color.UNDERLINE + color.BLUE + label + color.END)
    print("Number of columns are " + color.BOLD + str(n_cols) + color.END + " and number of rows are " + color.BOLD + str(n_rows) + color.END)
    display(frame.head())

In [6]:
#send train and test data in sklearn decision tree model with min_samples_leaf, and min_samples_split 
def decision_tree(X, Y, criterion= 'gini', max_depth=None, min_samples_leaf=1, min_samples_split=2, random_state=903949505):
    """Fit a ``DecisionTreeClassifier`` on ``X`` / ``Y`` and return it.

    Parameters
    ----------
    X, Y
        Training features and labels, passed unchanged to ``fit``.
    criterion : str, default 'gini'
        Split quality measure forwarded to the classifier.
    max_depth, min_samples_leaf, min_samples_split
        Tree-size controls forwarded to the classifier.
    random_state : int, default 903949505
        Seed forwarded to the classifier.

    Returns
    -------
    DecisionTreeClassifier
        The fitted model.
    """
    gc.collect()
    model = DecisionTreeClassifier(
        criterion=criterion,  # previously accepted by this function but never forwarded
        random_state=random_state,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        min_samples_split=min_samples_split,
    )
    model.fit(X, Y)
    return model

In [7]:
#predict using decision tree from sklearn and calculate accuracy and F1 score
def predict(model, X, Y):
    """Predict with ``model`` on ``X`` and score the result against ``Y``.

    The predictions are wrapped in a DataFrame that reuses ``X.index``; the
    accuracy and F1 score are printed and returned alongside them.

    Returns
    -------
    tuple
        ``(predictions, accuracy, f1)``.
    """
    gc.collect()
    y_hat = pd.DataFrame(model.predict(X), index=X.index)
    acc = accuracy_score(Y, y_hat)
    f1_value = f1_score(Y, y_hat)
    print(f"Accuracy: {acc}")
    print(f"F1 Score: {f1_value}")
    return y_hat, acc, f1_value

In [8]:
#Build knn on X and Y with distnace and n_neighbors
def knn(X, Y, distance='euclidean', n_neighbors=5):
    """Fit a ``KNeighborsClassifier`` on ``X`` / ``Y`` and return it.

    ``distance`` is forwarded as the classifier's ``metric`` and
    ``n_neighbors`` as its neighbour count.
    """
    gc.collect()
    classifier = KNeighborsClassifier(metric=distance, n_neighbors=n_neighbors)
    classifier.fit(X, Y)
    return classifier

In [9]:
#Build a gradient boosting from sklearn
def gradient_boosting(X, Y, learning_rate=0.1, n_estimators=10000, subsample=1.0, max_depth=6, random_state=903949505):
    """Fit a ``GradientBoostingClassifier`` on ``X`` / ``Y`` and return it.

    Parameters
    ----------
    X, Y
        Training features and labels, passed unchanged to ``fit``.
    learning_rate, n_estimators, subsample, max_depth
        Forwarded to the classifier.
    random_state : int, default 903949505
        Seed forwarded to the classifier so repeated fits are reproducible,
        matching the fixed seed used by the other model helpers in this file.

    Returns
    -------
    GradientBoostingClassifier
        The fitted model.
    """
    gc.collect()
    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        random_state=random_state,
    )
    model.fit(X, Y)
    return model

In [10]:
def xgboost(X, Y, learning_rate=0.1, n_estimators=10000, subsample=1.0, max_depth=6):
    """Fit an ``xgb.XGBClassifier`` on ``X`` / ``Y`` and return it.

    The classifier always uses the ``binary:logistic`` objective, the seed
    903949505 and ``nthread=-1``; the remaining hyper-parameters come from the
    arguments.
    """
    gc.collect()
    settings = {
        "objective": "binary:logistic",
        "random_state": 903949505,
        "learning_rate": learning_rate,
        "n_estimators": n_estimators,
        "subsample": subsample,
        "max_depth": max_depth,
        "nthread": -1,  # presumably "use all available threads"; verify against xgboost docs
    }
    booster = xgb.XGBClassifier(**settings)
    booster.fit(X, Y)
    return booster

In [11]:
def nnet(X, Y, learning_rate=0.001, loss='binary_crossentropy', epochs=10, batch_size=32, validation_split=0.2, patience=15):
    """Build, train and return a small Keras binary classifier.

    The network is a ``Sequential`` model with one 64-unit ReLU ``Dense`` layer
    (input width ``X.shape[1]``) followed by a single sigmoid output unit. It
    is compiled with Adam at ``learning_rate``, the given ``loss`` and an AUC
    metric, then fitted with an ``EarlyStopping`` callback that monitors
    ``val_loss``.

    Returns
    -------
    Sequential
        The model after ``fit`` has been called.
    """
    gc.collect()

    # NOTE(review): with the defaults, patience (15) is larger than epochs (10),
    # so the callback cannot cut training short unless the caller changes one of them.
    stopper = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)

    network = Sequential()
    network.add(Dense(64, activation='relu', input_shape=(X.shape[1],)))
    network.add(Dense(1, activation='sigmoid'))
    network.compile(optimizer=Adam(learning_rate=learning_rate), loss=loss, metrics=[AUC()])

    network.fit(
        X,
        Y,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=validation_split,
        callbacks=[stopper],
    )
    return network

In [12]:
def learning_curve_self(h1, X_train, y_train, classifier, title = "Learning Curve (Decision Tree)", train_sizes=np.linspace(0.001, 0.04, 10), cv=5):
    """Compute, plot and save an accuracy learning curve for ``classifier``.

    Parameters
    ----------
    h1 : path-like
        Destination passed to ``savefig`` for the finished plot.
    X_train, y_train
        Training data handed to ``sklearn.model_selection.learning_curve``.
    classifier
        Estimator to evaluate.
    title : str
        Plot title.
    train_sizes : array-like
        Training-set sizes forwarded to ``learning_curve``.
    cv : int
        Cross-validation setting forwarded to ``learning_curve``.

    Returns
    -------
    None
        The plot is saved to ``h1`` and shown.
    """
    gc.collect()
    header = 24
    other_title = 18

    # NOTE(review): scikit-learn documents random_state as only used when
    # shuffle=True, which is not requested here; kept to match the existing call.
    train_sizes, train_scores, test_scores = learning_curve(classifier, X_train, y_train, train_sizes=train_sizes, scoring='accuracy', cv=cv, n_jobs=-1, verbose=3, random_state=903949505)

    # Mean and standard deviation across the CV folds for every training size.
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Single figure for the curve; no extra empty figures are created before
    # or after it, so only the real plot is saved and shown.
    fig, ax = plt.subplots()
    fig.set_facecolor('white')
    ax.set_facecolor('white')

    curves = (
        (train_mean, train_std, 'blue', 'o', 'Training Accuracy'),
        (test_mean, test_std, 'green', 's', 'Testing Accuracy'),
    )
    for mean, std, line_color, marker, label in curves:
        ax.plot(train_sizes, mean, color=line_color, marker=marker, markersize=5, label=label, alpha=0.8)
        # Shaded band of +/- one standard deviation around the mean.
        ax.fill_between(train_sizes, mean - std, mean + std, alpha=0.1, color=line_color)

    ax.set_xlabel('Number of Training Samples', fontsize=other_title)
    ax.set_ylabel('Accuracy', fontsize=other_title)
    ax.set_title(title, fontsize=header)
    ax.legend(loc='lower right', frameon=True, edgecolor='black', facecolor='white', fontsize=other_title)
    # labelsize is set through tick_params so it applies to every tick label,
    # not only to the labels that exist at this moment.
    ax.tick_params(direction='in', bottom=True, top=True, left=True, right=True, length=4, labelsize=other_title)

    spine_alpha = 0.5
    for side in ('top', 'bottom', 'left', 'right'):
        ax.spines[side].set_color('black')
        ax.spines[side].set_alpha(spine_alpha)

    ax.grid(True, linestyle="dotted", alpha=0.45, color='black')
    # Thousands separators on the sample-count axis.
    ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: '{:,.0f}'.format(x)))

    fig.savefig(h1)
    plt.show()

In [None]:
def grid_search(Xtrain, Xtest, Ytrain, Ytest, param_grid, col1, col2, classifier,
                excelpath,
                printpath, title, form, custom_x_values,
                Change_name=False, new_labels=['A', 'B', 'C', 'D', 'E'],
                change_ylim = False, ylim_lower = 0, ylim_upper = 1, data_type_param = float):
    """Run a 5-fold grid search, plot a validation curve and export a summary.

    Parameters
    ----------
    Xtrain, Ytrain
        Training data the grid search is fitted on.
    Xtest, Ytest
        Accepted for interface compatibility; not used by this function.
    param_grid : dict
        Grid handed to ``GridSearchCV``.
    col1 : str
        Name of the ``cv_results_`` column holding the varied parameter.
    col2 : str
        Display name for that column; also used as the x-axis label.
    classifier
        Estimator to tune.
    excelpath : path-like
        Where the rounded summary is written with ``to_excel``.
    printpath : path-like
        Where the plot is written with ``savefig``.
    title : str
        Plot title.
    form : str
        Format string applied to x tick values via ``form.format(x)``.
    custom_x_values
        Tick positions passed to ``plt.xticks``.
    Change_name : bool
        When true, x tick labels are replaced by ``new_labels``.
    new_labels : list
        Replacement tick labels (read only, never modified).
    change_ylim : bool
        When true, the y axis is limited to ``ylim_lower`` .. ``ylim_upper``.
    data_type_param : type
        When equal to ``float`` the parameter values are cast to float for the
        shaded bands; otherwise they are used as they are.

    Returns
    -------
    pandas.DataFrame
        Columns ``col2``, 'Train Score', 'Test Score', 'Train Time' and
        'Test Time', with the four numeric columns rounded to 2 decimals.
    """
    gc.collect()
    header = 24
    other_title = 18

    # Grid search with 5-fold cross-validation. The local is deliberately not
    # named like this function.
    search = GridSearchCV(
        classifier,
        param_grid=param_grid,
        cv=5,
        n_jobs=-1,
        return_train_score=True,
        verbose=3
    )
    search.fit(Xtrain, Ytrain)

    # Keep only the columns needed for the plot and the report, under readable names.
    columns_to_keep = [col1, 'mean_train_score', 'mean_test_score', 'std_train_score', 'std_test_score', 'mean_fit_time', 'mean_score_time']
    results_summary = pd.DataFrame(search.cv_results_)[columns_to_keep].copy()
    results_summary.columns = [col2, 'Train Score', 'Test Score', 'Train Score(std)', 'Test Score(std)', 'Train Time', 'Test Time']

    # Single figure for the validation curve; no extra empty figures are
    # created before or after it.
    fig, ax = plt.subplots()
    fig.set_facecolor('white')
    ax.set_facecolor('white')

    param_values = results_summary[col2]
    if data_type_param == float:
        band_x = param_values.values.astype(float)
    else:
        band_x = param_values

    curves = (
        ('Train Score', 'Train Score(std)', 'blue', 'o', 'Training Accuracy'),
        ('Test Score', 'Test Score(std)', 'green', 's', 'Testing Accuracy'),
    )
    for score_col, std_col, line_color, marker, label in curves:
        mean = results_summary[score_col]
        std = results_summary[std_col]
        ax.plot(param_values, mean, color=line_color, marker=marker, markersize=5, label=label, alpha=0.8)
        # Band limits stay floats for both curves: the scores are fractional,
        # so an integer cast would truncate them.
        lower = (mean - std).values.astype(float)
        upper = (mean + std).values.astype(float)
        ax.fill_between(band_x, lower, upper, alpha=0.1, color=line_color)

    ax.set_xlabel(col2, fontsize=other_title)
    ax.set_ylabel('Accuracy', fontsize=other_title)
    ax.set_title(title, fontsize=header)
    ax.legend(loc='lower right', frameon=True, edgecolor='black', facecolor='white', fontsize=other_title)
    # labelsize is set through tick_params so it also applies to the ticks
    # installed from custom_x_values further down.
    ax.tick_params(direction='in', bottom=True, top=True, left=True, right=True, length=4, labelsize=other_title)

    spine_alpha = 0.5
    for side in ('top', 'bottom', 'left', 'right'):
        ax.spines[side].set_color('black')
        ax.spines[side].set_alpha(spine_alpha)

    ax.grid(True, linestyle="dotted", alpha=0.45, color='black')
    ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: form.format(x)))
    plt.xticks(custom_x_values)
    if Change_name:
        ax.set_xticklabels(labels=new_labels)
    if change_ylim:
        ax.set_ylim(ylim_lower, ylim_upper)

    fig.savefig(printpath)
    plt.show()

    # Report: drop the std columns and round on an explicit copy, so the
    # assignment never targets a slice of another frame.
    rounded_columns = ['Train Score', 'Test Score', 'Train Time', 'Test Time']
    report = results_summary[[col2] + rounded_columns].copy()
    report[rounded_columns] = report[rounded_columns].round(2)
    report.to_excel(excelpath, index=False)

    return report