In [3]:
import pandas as pd
from IPython.display import display
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Seed NumPy's legacy global random generator once for the whole notebook.
np.random.seed(903949505)

# Notebook-wide pandas display settings, applied in one pass.
for _option_name, _option_value in (
    ('display.precision', 2),
    ('display.max_columns', None),
    ('display.max_rows', 30),
):
    pd.set_option(_option_name, _option_value)

In [6]:
class color:
    """ANSI escape sequences used to style the text printed by this notebook.

    Prefix a string with one or more of these codes and append ``END``
    (or its alias ``ENDC``) to switch the styling off again.
    """

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Foreground colours
    DARKCYAN = '\033[36m'
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'

    # Semantic aliases that reuse the colour codes above
    WARNING = YELLOW
    FAIL = RED

    # Reset sequence, available under both names
    END = '\033[0m'
    ENDC = END

In [19]:
def eda(batmobile, iqr_multiplier=3):
    """Print and display a quick exploratory summary of a DataFrame.

    Shown in order: shape, column dtypes, the first five rows, ``describe()``
    statistics, the number of missing values per column (only columns that
    have any) and the number of outliers per column (only columns that have
    any).

    Outliers are counted on the columns returned by
    ``select_dtypes(include='number')``. A value counts as an outlier when it
    is more than ``iqr_multiplier`` interquartile ranges below the first
    quartile or above the third quartile.

    Parameters
    ----------
    batmobile : pandas.DataFrame
        The data to summarise.
    iqr_multiplier : float, default 3
        Width of the outlier fence, in multiples of the interquartile range.

    Returns
    -------
    None
        Everything is emitted through ``print`` and ``display``.
    """
    # DataFrame.shape is (rows, columns); unpack it so the labels cannot be mixed up.
    n_rows, n_columns = batmobile.shape
    print(color.BOLD + color.UNDERLINE + color.GREEN + "Shape" + color.END)
    print("Number of columns are " + color.BOLD + str(n_columns) + color.END + " and number of rows are " + color.BOLD + str(n_rows) + color.END + "\n")
    print(color.BOLD + color.UNDERLINE + color.PURPLE + "Data types" + color.END)
    display(batmobile.dtypes)
    print(color.BOLD + color.UNDERLINE + color.DARKCYAN + "Sample rows - Top 5" + color.END)
    display(batmobile.head())
    print("\n" + "\n" + color.BOLD + color.UNDERLINE + color.WARNING + "EDA Statistics" + color.END)
    display(batmobile.describe())
    nan_count = batmobile.isna().sum()
    print("\n" + "\n" + color.BOLD + color.UNDERLINE + color.RED + "Missing values" + color.END)
    display(nan_count[nan_count > 0])
    print("\n" + "\n" + color.BOLD + color.UNDERLINE + color.BLUE + "Count of Outliers" + color.END)
    # Quantiles and the fence comparison only make sense for numeric data;
    # pandas >= 2.0 no longer drops non-numeric columns silently in quantile().
    numeric = batmobile.select_dtypes(include='number')
    Q1 = numeric.quantile(0.25)
    Q3 = numeric.quantile(0.75)
    IQR = Q3 - Q1
    below_fence = numeric < (Q1 - iqr_multiplier * IQR)
    above_fence = numeric > (Q3 + iqr_multiplier * IQR)
    Outliers = (below_fence | above_fence).sum()
    display(Outliers[Outliers>0])

In [14]:
#split train and test dataset using sklearn
def train_test(vegito, test_size=0.2, random_state=903949505):
    """Split a dataset into a train part and a test part with ``train_test_split``.

    Parameters
    ----------
    vegito : array-like or pandas.DataFrame
        The full dataset to split.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, default 903949505
        Forwarded to ``train_test_split`` as ``random_state`` so the split is
        repeatable.

    Returns
    -------
    tuple
        ``(train, test)`` exactly as returned by ``train_test_split``.
    """
    train_part, test_part = train_test_split(vegito, test_size=test_size, random_state=random_state)
    return train_part, test_part

In [24]:
#stratified sampling in sklearn
def stratified_sampling(vegito, target, size = 0.4, OOT = 0.5, seed = 903949505):
    """Split a DataFrame into train, test and OOT partitions, stratified on ``target``.

    The data is first split into a train part and a hold-out part
    (``size`` is the hold-out fraction); the hold-out part is then split
    into a test part and an OOT part (``OOT`` is the fraction of the
    hold-out that becomes the OOT part). Both splits stratify on the
    ``target`` column and use the same ``seed``.

    Along the way the function prints/displays the class distribution of the
    original data and of every partition, followed by the shape and first
    rows of each returned frame.

    Parameters
    ----------
    vegito : pandas.DataFrame
        Full dataset, including the ``target`` column.
    target : str
        Name of the column to stratify on and to separate out as ``y``.
    size : float, default 0.4
        ``test_size`` of the first split.
    OOT : float, default 0.5
        ``test_size`` of the second split (applied to the hold-out part).
    seed : int, default 903949505
        ``random_state`` used for both splits.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_X, train_y, test_X, test_y, oot_X, oot_y)``; each ``*_y`` is
        a single-column DataFrame holding ``target``.
    """
    goku, holdout = train_test_split(vegito, test_size=size, stratify=vegito[target], random_state=seed)
    vegita, cell = train_test_split(holdout, test_size=OOT, stratify=holdout[target], random_state=seed)

    heading_style = color.BOLD + color.UNDERLINE
    _show_target_distribution(vegito, target, heading_style + color.DARKCYAN + "Original distribution" + color.END)
    _show_target_distribution(goku, target, "\n" + "\n" + heading_style + color.WARNING + "Train distribution" + color.END)
    _show_target_distribution(vegita, target, "\n" + "\n" + heading_style + color.RED + "Test distribution" + color.END)
    _show_target_distribution(cell, target, "\n" + "\n" + heading_style + color.RED + "OOT distribution" + color.END)

    # Separate the independent variables (X) from the dependent variable (y) for every partition.
    goku_X = goku.drop(target, axis=1)
    vegita_X = vegita.drop(target, axis=1)
    cell_X = cell.drop(target, axis=1)
    goku_y = pd.DataFrame(goku[target])
    vegita_y = pd.DataFrame(vegita[target])
    cell_y = pd.DataFrame(cell[target])

    # The test partition gets its own "Test_*" headings (they used to repeat "Train_*").
    _show_shape_and_head(goku_X, "Train_x shape")
    _show_shape_and_head(goku_y, "Train_y shape")
    _show_shape_and_head(vegita_X, "Test_x shape")
    _show_shape_and_head(vegita_y, "Test_y shape")
    _show_shape_and_head(cell_X, "OOT_x shape")
    _show_shape_and_head(cell_y, "OOT_y shape")
    return goku_X, goku_y, vegita_X, vegita_y, cell_X, cell_y


def _show_target_distribution(frame, target, heading):
    """Print ``heading`` and display absolute and percentage counts of ``frame[target]``."""
    print(heading)
    counts = frame[target].value_counts(normalize=False)
    percentages = frame[target].value_counts(normalize=True)*100
    display(pd.concat([counts, percentages], axis=1, keys=['counts', '%']))


def _show_shape_and_head(frame, label):
    """Print ``label`` as a heading, report the shape of ``frame`` and display its first rows."""
    # DataFrame.shape is (rows, columns); unpack it so the labels cannot be mixed up.
    n_rows, n_columns = frame.shape
    print("\n" + "\n" + color.BOLD + color.UNDERLINE + color.BLUE + label + color.END)
    print("Number of columns are " + color.BOLD + str(n_columns) + color.END + " and number of rows are " + color.BOLD + str(n_rows) + color.END)
    display(frame.head())

In [1]:
# Fit an sklearn decision tree classifier with configurable max_depth, min_samples_leaf and min_samples_split
def decision_tree(X, Y, criterion= 'gini', max_depth=None, min_samples_leaf=1, min_samples_split=2, random_state=903949505):
    """Fit a ``DecisionTreeClassifier`` on ``X`` / ``Y`` and return it.

    Parameters
    ----------
    X, Y
        Training features and labels, passed straight to ``fit``.
    criterion : str, default 'gini'
        Split quality measure, forwarded to the classifier.
    max_depth : int or None, default None
        Forwarded to the classifier.
    min_samples_leaf : int or float, default 1
        Forwarded to the classifier.
    min_samples_split : int or float, default 2
        Forwarded to the classifier.
    random_state : int, default 903949505
        Forwarded to the classifier.

    Returns
    -------
    DecisionTreeClassifier
        The fitted model.
    """
    model = DecisionTreeClassifier(
        criterion=criterion,  # was accepted by this function but never reached the classifier
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        min_samples_split=min_samples_split,
        random_state=random_state,
    )
    model.fit(X, Y)
    return model

In [2]:
# Predict with a fitted model and calculate the accuracy and F1 score
def predict(model, X, Y):
    """Score a fitted model on ``X`` against the true labels ``Y``.

    The predictions are wrapped in a DataFrame that reuses the index of
    ``X``; accuracy and F1 are computed against ``Y`` and printed.

    Returns
    -------
    tuple
        ``(predictions, accuracy, f1)``.
    """
    predicted = pd.DataFrame(model.predict(X), index=X.index)
    accuracy, f1 = accuracy_score(Y, predicted), f1_score(Y, predicted)
    print(f"Accuracy: {accuracy}", f"F1 Score: {f1}", sep="\n")
    return predicted, accuracy, f1

In [None]:
# Build a KNN classifier on X and Y with the given distance metric and n_neighbors
def knn(X, Y, distance='euclidean', n_neighbors=5):
    """Fit a ``KNeighborsClassifier`` on ``X`` / ``Y`` and return it.

    ``distance`` is handed to the classifier as its ``metric`` and
    ``n_neighbors`` as its ``n_neighbors``.
    """
    settings = {"n_neighbors": n_neighbors, "metric": distance}
    classifier = KNeighborsClassifier(**settings)
    classifier.fit(X, Y)
    return classifier

In [None]:
# Build a gradient boosting classifier from sklearn
def gradient_boosting(X, Y, learning_rate=0.1, n_estimators=10000, subsample=1.0, max_depth=6, random_state=903949505):
    """Fit a ``GradientBoostingClassifier`` on ``X`` / ``Y`` and return it.

    Parameters
    ----------
    X, Y
        Training features and labels, passed straight to ``fit``.
    learning_rate : float, default 0.1
        Forwarded to the classifier.
    n_estimators : int, default 10000
        Forwarded to the classifier.
        NOTE(review): this default is large and may make fitting slow; it is
        kept unchanged for backward compatibility.
    subsample : float, default 1.0
        Forwarded to the classifier.
    max_depth : int, default 6
        Forwarded to the classifier.
    random_state : int, default 903949505
        Forwarded to the classifier so repeated fits are reproducible; the
        default is the seed the other model builders in this notebook use.

    Returns
    -------
    GradientBoostingClassifier
        The fitted model.
    """
    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        random_state=random_state,
    )
    model.fit(X, Y)
    return model

In [None]:
import xgboost as xgb
def xgboost(X, Y, learning_rate=0.1, n_estimators=10000, subsample=1.0, max_depth=6, random_state=903949505):
    """Fit an ``xgb.XGBClassifier`` with the ``binary:logistic`` objective and return it.

    Parameters
    ----------
    X, Y
        Training features and labels, passed straight to ``fit``.
        The fixed ``binary:logistic`` objective presumably expects a
        two-class target - confirm against the data being modelled.
    learning_rate : float, default 0.1
        Forwarded to the classifier.
    n_estimators : int, default 10000
        Forwarded to the classifier.
    subsample : float, default 1.0
        Forwarded to the classifier.
    max_depth : int, default 6
        Forwarded to the classifier.
    random_state : int, default 903949505
        Forwarded to the classifier; previously hard-coded to this value.

    Returns
    -------
    xgb.XGBClassifier
        The fitted model.
    """
    xgb_model = xgb.XGBClassifier(
        objective="binary:logistic",
        random_state=random_state,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        max_depth=max_depth,
        n_jobs=-1,  # replaces the deprecated ``nthread`` alias of the sklearn wrapper
    )
    xgb_model.fit(X, Y)
    return xgb_model