In [3]:
import pandas as pd
from IPython.display import display
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
# Seed NumPy's global random generator so repeated runs draw the same numbers.
np.random.seed(903949505)

# Display preferences for DataFrames rendered in the notebook:
# two digits of precision, every column shown, at most 15 rows.
pd.options.display.precision = 2
pd.options.display.max_columns = None
pd.options.display.max_rows = 15

In [6]:
class color:
    """ANSI escape sequences used to style text passed to ``print``.

    Prefix a string with one or more of these constants and append ``END``
    (or its alias ``ENDC``) to reset the styling afterwards.
    """

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Foreground colours, ordered by escape code.
    DARKCYAN = '\033[36m'
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'

    # Semantic aliases that reuse the colour codes above.
    WARNING = YELLOW
    FAIL = RED

    # Reset sequence; both spellings hold the same code.
    END = '\033[0m'
    ENDC = END

In [19]:
def eda(batmobile, iqr_multiplier=3):
    """Print a quick exploratory summary of a DataFrame.

    The sections are emitted in this order: shape, first five rows,
    ``describe()`` statistics, columns containing missing values, and a
    per-column count of outliers.

    Parameters
    ----------
    batmobile : pandas.DataFrame
        Frame to summarise.
    iqr_multiplier : int or float, default 3
        A value counts as an outlier when it lies more than
        ``iqr_multiplier`` times the inter-quartile range below the first
        quartile or above the third quartile.

    Returns
    -------
    None
        All results are shown through ``print`` and ``display``.
    """
    print(color.BOLD + color.UNDERLINE + color.GREEN + "Shape" + color.END)
    # DataFrame.shape is (rows, columns); label the two numbers accordingly.
    n_rows, n_cols = batmobile.shape
    print("Number of rows are " + color.BOLD + str(n_rows) + color.END + " and number of columns are " + color.BOLD + str(n_cols) + color.END + "\n")

    print(color.BOLD + color.UNDERLINE + color.DARKCYAN + "Sample rows - Top 5" + color.END)
    display(batmobile.head())

    print("\n" + "\n" + color.BOLD + color.UNDERLINE + color.WARNING + "EDA Statistics" + color.END)
    display(batmobile.describe())

    nan_count = batmobile.isna().sum()
    print("\n" + "\n" + color.BOLD + color.UNDERLINE + color.RED + "Missing values" + color.END)
    # Only list the columns that actually have missing values.
    display(nan_count[nan_count > 0])

    print("\n" + "\n" + color.BOLD + color.UNDERLINE + color.BLUE + "Count of Outliers" + color.END)
    # Quantiles and the </> comparisons are only defined for numeric data;
    # restricting to numeric columns keeps text/categorical columns from
    # raising a TypeError here.
    numeric = batmobile.select_dtypes(include="number")
    Q1 = numeric.quantile(0.25)
    Q3 = numeric.quantile(0.75)
    IQR = Q3 - Q1
    lower_fence = Q1 - iqr_multiplier * IQR
    upper_fence = Q3 + iqr_multiplier * IQR
    outliers = ((numeric < lower_fence) | (numeric > upper_fence)).sum()
    # Only list the columns that have at least one outlier.
    display(outliers[outliers > 0])

In [14]:
#split train and test dataset using sklearn
def train_test(vegito, size = 0.2, seed = 903949505):
    """Split a dataset into train and test parts with ``train_test_split``.

    No ``stratify`` argument is passed, so the split is not stratified.

    Parameters
    ----------
    vegito : array-like or pandas.DataFrame
        Dataset to split.
    size : float or int, default 0.2
        Forwarded to ``train_test_split`` as ``test_size``.
    seed : int, default 903949505
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(train, test)``.
    """
    goku, vegita = train_test_split(vegito, test_size=size, random_state=seed)
    return goku, vegita

In [24]:
#stratified sampling in sklearn
def _print_section_title(title, shade, leading_gap=True):
    """Print ``title`` bold and underlined in ``shade``, optionally preceded by two newlines."""
    gap = "\n" + "\n" if leading_gap else ""
    print(gap + color.BOLD + color.UNDERLINE + shade + title + color.END)


def _show_class_distribution(frame, target):
    """Display the absolute and percentage counts of each value in ``frame[target]``."""
    counts = frame[target].value_counts(normalize=False)
    percentages = frame[target].value_counts(normalize=True) * 100
    display(pd.concat([counts, percentages], axis=1, keys=['counts', '%']))


def _show_shape_and_head(frame):
    """Print the row and column counts of ``frame`` and display its first rows."""
    # DataFrame.shape is (rows, columns).
    n_rows, n_cols = frame.shape
    print("Number of rows are " + color.BOLD + str(n_rows) + color.END + " and number of columns are " + color.BOLD + str(n_cols) + color.END)
    display(frame.head())


def stratified_sampling(vegito, target, size = 0.4, seed = 903949505):
    """Split a frame into train/test parts stratified on ``target`` and report on the result.

    After splitting, the distribution of ``target`` is displayed for the
    original, train and test frames, followed by the shape and first rows of
    each of the four returned frames.

    Parameters
    ----------
    vegito : pandas.DataFrame
        Full dataset, including the target column.
    target : str
        Name of the column to stratify on; it is also the column separated
        out as the dependent variable.
    size : float or int, default 0.4
        Forwarded to ``train_test_split`` as ``test_size``.
    seed : int, default 903949505
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_X, train_y, test_X, test_y)``; the ``*_y`` frames contain
        only the target column.
    """
    # Split first so that nothing is printed if the split itself fails.
    goku, vegita = train_test_split(vegito, test_size=size, stratify=vegito[target], random_state=seed)

    _print_section_title("Original distribution", color.DARKCYAN, leading_gap=False)
    _show_class_distribution(vegito, target)
    _print_section_title("Train distribution", color.WARNING)
    _show_class_distribution(goku, target)
    _print_section_title("Test distribution", color.RED)
    _show_class_distribution(vegita, target)

    # Separate the independent variables (X) from the dependent variable (y).
    goku_X = goku.drop(target, axis=1)
    vegita_X = vegita.drop(target, axis=1)
    goku_y = pd.DataFrame(goku[target])
    vegita_y = pd.DataFrame(vegita[target])

    sections = (
        ("Train_x shape", goku_X),
        ("Train_y shape", goku_y),
        ("Test_x shape", vegita_X),
        ("Test_y shape", vegita_y),
    )
    for title, part in sections:
        _print_section_title(title, color.BLUE)
        _show_shape_and_head(part)

    return goku_X, goku_y, vegita_X, vegita_y

In [None]:
#fit a sklearn decision tree model on the training data
def decision_tree(goku_X, goku_y, vegita_X, vegita_y):
    """Fit a ``DecisionTreeClassifier`` with default settings and return it.

    Parameters
    ----------
    goku_X
        Training features passed to ``fit``.
    goku_y
        Training target passed to ``fit``.
    vegita_X, vegita_y
        Test features and target. They are accepted but not used here.

    Returns
    -------
    DecisionTreeClassifier
        The fitted classifier.
    """
    # fit() hands back the estimator it was called on, so the fitted model
    # can be returned directly.
    return DecisionTreeClassifier().fit(goku_X, goku_y)