### IMPORTING MODULES

In [None]:
# Basic data science packages
import numpy as np ### For numerical computation
import pandas as pd ### For working with data

# For Creating visualizations 
%matplotlib inline
from matplotlib import pyplot as plt 
import seaborn as sns

# To handle imbalanced data
from imblearn.over_sampling import SMOTE

# For creating training and test set
from sklearn.model_selection import train_test_split

# For column transformation
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.compose import make_column_transformer, make_column_selector

# To make pipeline (or automate all the model creation works)
from sklearn.pipeline import make_pipeline

# Machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# For Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# For evaluating model
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score

# Remove all kinds of warning
from warnings import filterwarnings 
filterwarnings('ignore')

In [None]:
# Type of graph that we want
# Newer matplotlib releases renamed the bundled seaborn styles to 'seaborn-v0_8-*'
# and dropped the old names, so use whichever spelling this installation offers.
plt.style.use('seaborn-whitegrid' if 'seaborn-whitegrid' in plt.style.available
              else 'seaborn-v0_8-whitegrid')

In [None]:
# Maximum number of columns and rows that it will show
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 10)

### LOADING THE DATA

In [None]:
# Loading the dataset
df = pd.read_csv('../input/income-adult/adult_data.csv')

In [None]:
# Showing the data
df.head()

In [None]:
# Shape of the data
df.shape

In [None]:
# Stripping the unnecessary spaces in the names of columns
df.columns = df.columns.str.strip()

In [None]:
# Total Description of the data
df.describe()

In [None]:
# Total information of data
df.info()

In [None]:
# Remove leading/trailing whitespace from the values of every text (object) column
text_columns = [name for name in df.columns if df[name].dtype == object]
for name in text_columns:
    df[name] = df[name].str.strip()

In [None]:
# Per-column overview: number of distinct values, the distinct values themselves,
# and a table with the absolute and relative frequency of every value
for column in df.columns:
    values = df[column]
    print('*' * 100)
    print(f'{column}:- {values.nunique()}\n{values.unique()}\n')
    frequency_table = pd.DataFrame({
        'count': values.value_counts(),
        '%': values.value_counts(normalize = True),
    })
    print(frequency_table)
    print('/' * 100 + ' \n\n')

In [None]:
# 'education-num' carries the same information as 'education', so it is removed
df.drop(columns = ['education-num'], inplace = True)

In [None]:
# Distribution of values in columns
# The plot type is chosen from the column dtype instead of relying on a bare
# `except:` (which would also hide unrelated errors and KeyboardInterrupt).
for i in df.columns:
    print(i)
    if pd.api.types.is_numeric_dtype(df[i]):
        # Numeric column: histogram of the raw values
        df[i].plot.hist(bins = 30)
    else:
        # Non-numeric column: one horizontal bar per distinct value
        counts = df[i].value_counts()
        plt.barh(counts.index, counts.values)
    plt.show()

### HANDLING MISSING VALUES (?)
In this dataset, missing values are not stored as NaN; they are stored as the string **'?'**.

In [None]:
# Collect (and print) the name of every column that contains the '?' placeholder
missing_val = []
for column in df.columns:
    has_placeholder = '?' in df[column].unique()
    if not has_placeholder:
        continue
    print(column)
    missing_val.append(column)

In [None]:
# Value frequencies of every column that contains the '?' placeholder
for column in missing_val:
    frequencies = df[column].value_counts()
    print(f'{column} :-')
    print('_' * 20)
    print(frequencies)
    print('*' * 20 + ' \n\n')

In [None]:
# Filling missing values of native-country
# The fill value is the most frequent *real* country: '?' is excluded before counting
# so the placeholder can never be chosen as its own replacement.
most_common_country = df.loc[df['native-country'] != '?', 'native-country'].value_counts().index[0]

# Assign the result back instead of calling replace(..., inplace=True) on a column
# selection, which is not guaranteed to modify df itself.
df['native-country'] = df['native-country'].replace('?', most_common_country)

In [None]:
# Column (native-country) after filling the missing values
df['native-country'].value_counts()

In [None]:
# Filling missing values of occupation
# Every '?' is replaced by a value drawn (without replacement) from the known
# occupations of the other rows.
occ_missing = df['occupation'] == '?'

# random_state pins the draw so the imputed values are the same on every run
replace_occ = df.loc[~occ_missing, 'occupation'].sample(int(occ_missing.sum()), random_state = 0)

# Re-label the sampled values with the labels of the rows they are going to fill,
# so that the assignment below aligns them with the '?' rows
replace_occ.index = df.index[occ_missing]

df.loc[occ_missing, 'occupation'] = replace_occ

In [None]:
# Column (occupation) after filling the missing values
df['occupation'].value_counts()

In [None]:
# Filling missing values of workclass
# Every '?' is replaced by a value drawn (without replacement) from the known
# workclasses of the other rows.
wkc_missing = df['workclass'] == '?'

# random_state pins the draw so the imputed values are the same on every run
replace_wkc = df.loc[~wkc_missing, 'workclass'].sample(int(wkc_missing.sum()), random_state = 0)

# Re-label the sampled values with the labels of the rows they are going to fill,
# so that the assignment below aligns them with the '?' rows
replace_wkc.index = df.index[wkc_missing]

df.loc[wkc_missing, 'workclass'] = replace_wkc

In [None]:
# Column (workclass) after filling the missing values
df['workclass'].value_counts()

### HANDLING OUTLIERS

In [None]:
# It is a class which will help in checking outliers in different columns of the data
# Here, It uses three techniques to find the outliers (we can use anyone of them)
# The techniques are:-
# * IQR
# * Z score
# * Standard Deviation

class Outliers:
    """Summary statistics of a single column of a DataFrame.

    The statistics are computed once, when the object is created, and stored as
    attributes: ``min``, ``max``, ``mean``, ``std``, ``median``, ``quantile_25``
    and ``quantile_75``. The frame and column name are kept as ``df`` and ``col``.
    """

    def __init__(self, df, col):
        column = df[col]

        self.df = df
        self.col = col
        self.min = column.min()
        self.max = column.max()
        self.mean = column.mean()
        self.std = column.std()
        self.median = column.median()
        self.quantile_25 = column.quantile(0.25)
        self.quantile_75 = column.quantile(0.75)

    @property
    def info(self):
        """Print the stored statistics, one per line (the property itself yields None)."""
        report = [
            f'{self.col}:- ',
            f'Minimum:- {self.min}',
            f'Maximum:- {self.max}',
            f'Mean:- {self.mean}',
            f'Median:- {self.median}',
            f'Standard Deviation:- {self.std}',
            f'First Quantile:- {self.quantile_25}',
            f'Third Quantile:- {self.quantile_75}',
        ]
        for line in report:
            print(line)
        
        
class IQR(Outliers):
    """Outlier detection with the 1.5 * IQR fences.

    A value is an outlier when it lies strictly below ``lower_bound`` or strictly
    above ``upper_bound``. ``removed_outliers`` keeps exactly the complementary
    rows, so a value sitting on a fence is kept (previously such rows were
    dropped without being reported as outliers; with an IQR of 0 that meant
    dropping every row).
    """

    def __init__(self, df, col):
        super().__init__(df, col)

        self.IQR = self.quantile_75 - self.quantile_25
        self.lower_bound = self.quantile_25 - (1.5 * self.IQR)
        self.upper_bound = self.quantile_75 + (1.5 * self.IQR)

    def iqr_outliers(self):
        """Return the outlying values of the column as a NumPy array."""
        values = self.df[self.col]
        outside = (values < self.lower_bound) | (values > self.upper_bound)
        return self.df.loc[outside, self.col].values

    def removed_outliers(self):
        """Return the rows of ``df`` whose value lies within the fences (inclusive).

        Rows where the column is NaN are not within the fences and are dropped.
        """
        inside = self.df[self.col].between(self.lower_bound, self.upper_bound)
        return self.df.loc[inside]
    

class Z_score(Outliers):
    """Outlier detection with the z-score: a value is an outlier when |z| > 3.

    ``z = (value - mean) / std`` uses the mean and standard deviation stored by
    ``Outliers.__init__``. The computation is done on the whole column at once
    instead of looping over the values and filtering the frame once per outlier.
    """

    def __init__(self, df, col):
        super().__init__(df, col)

    def _is_outlier(self):
        """Boolean Series aligned with ``df``: True where |z| > 3 (NaN compares False)."""
        z = (self.df[self.col] - self.mean) / self.std
        return z.abs() > 3

    def z_score_outliers(self):
        """Return the outlying values as a list, in row order (duplicates kept)."""
        return self.df.loc[self._is_outlier(), self.col].tolist()

    def removed_outliers(self):
        """Return the rows of ``df`` whose value is not an outlier."""
        return self.df.loc[~self._is_outlier()]
    
class StandardDeviation(Outliers):
    """Outlier detection with the three-standard-deviation rule.

    A value is an outlier when it lies strictly outside ``mean +/- 3 * std``.
    ``removed_outliers`` keeps exactly the complementary rows, so a value equal
    to one of the limits is kept (previously such rows were dropped without
    being reported as outliers).
    """

    def __init__(self, df, col):
        super().__init__(df, col)

    @property
    def std_calc(self):
        """Return ``(lower, upper)``: the mean minus / plus three standard deviations."""
        lower_std = self.mean - (3 * self.std)
        upper_std = self.mean + (3 * self.std)

        return lower_std, upper_std

    def std_outliers(self):
        """Return the outlying values of the column as a NumPy array."""
        lower_std, upper_std = self.std_calc
        values = self.df[self.col]
        return self.df.loc[(values < lower_std) | (values > upper_std), self.col].values

    def removed_outliers(self):
        """Return the rows of ``df`` whose value lies within the limits (inclusive).

        Rows where the column is NaN are not within the limits and are dropped.
        """
        lower_std, upper_std = self.std_calc
        return self.df.loc[self.df[self.col].between(lower_std, upper_std)]

In [None]:
# Outlier report for every non-object column: the summary statistics, then for each
# of the three techniques the outlying values and the shape the dataset would have
# after removing them

numeric_columns = [name for name in df.columns if df[name].dtype != object]

for i in numeric_columns:
    out = Outliers(df, i)
    iqr = IQR(df, i)
    z_score = Z_score(df, i)
    std = StandardDeviation(df, i)

    print(f'Column:- {i}\n')
    print('INFO:- \n')
    out.info

    print(f'\nOutlier with IQR:- {i}\n')
    print(iqr.iqr_outliers())
    iqr_shape = iqr.removed_outliers().shape
    print(f'----------> dataset shape after removing outliers with iqr:- {iqr_shape}\n')

    print(f'\nOutlier with Z_score:- {i}\n')
    print(z_score.z_score_outliers())
    z_shape = z_score.removed_outliers().shape
    print(f'----------> dataset shape after removing outliers with z_score:- {z_shape}\n')

    print(f'\nOutlier with Standard deviation:- {i}\n')
    print(std.std_outliers())
    std_shape = std.removed_outliers().shape
    print(f'----------> dataset shape after removing outliers with Standard Deviation:{std_shape}\n')
    print('*' * 100)

In [None]:
# Here, after looking at the outliers we came to the point where we will remove outliers of
# ---> age (z_score or Standard deviation)
# ---> fnlwgt (z_score or standard deviation)
# ---> hours per week (z_score or standard deviation)
# ---> capital gain (average of different groups)
# ---> capital loss (average of different groups)

In [None]:
# Drop the outlier rows of age, fnlwgt and hours-per-week, one column after the other;
# each step works on the frame left by the previous one
outlier_steps = (
    ('age', Z_score),
    ('fnlwgt', StandardDeviation),
    ('hours-per-week', Z_score),
)
for column, detector in outlier_steps:
    df = detector(df, column).removed_outliers()

In [None]:
# Removing rows left gaps in the row labels; renumber the rows 0..n-1 and discard
# the old labels (same result as reset_index + rename + drop, in a single call).
df = df.reset_index(drop = True)

In [None]:
# Working on outliers of capital-gain 1
# Build buckets of width 10000 -- [1, 10001), [10001, 20001), ... -- and store the
# mean capital-gain of the rows in each bucket under the key '<low> - <high>'.
# Rows with a capital-gain of 0 fall into no bucket because the first one starts at 1.
cap_gn = {}
for i in np.arange(1, 110000, 10000):
    in_bucket = (df['capital-gain'] >= i) & (df['capital-gain'] < i + 10000)
    bucket_mean = df.loc[in_bucket, 'capital-gain'].mean()

    # The mean of an empty bucket is NaN. `is np.nan` is an identity test that only
    # matches the np.nan singleton, so pd.isna is used to detect it and store 0.
    cap_gn[str(i) + ' - ' + str(i + 10000)] = 0 if pd.isna(bucket_mean) else bucket_mean

cap_gn

In [None]:
# Working on outliers of capital-gain 2
# Substituting mean values in capital-gain
# Bucket membership is decided from a snapshot of the original values and a whole
# bucket is overwritten at once. This replaces the row-by-row df.loc[i, ...] loop,
# which was slow and only worked when the row labels were exactly 0..n-1.
orig_gain = df['capital-gain'].copy()
new_gain = orig_gain.astype(float)  # the bucket means are floats

for j, k in cap_gn.items():
    # Keys look like '<low> - <high>'; the upper bound is exclusive
    t, r = (int(bound) for bound in j.split(' - '))
    new_gain[(orig_gain >= t) & (orig_gain < r)] = k

df['capital-gain'] = new_gain
    

In [None]:
# capital-gain after handling outliers
# Here, it still doesn't look good, but I'm going to keep it (because we can't always remove outliers)
# You can change/remove the outliers if you want


df['capital-gain'].value_counts()

In [None]:
# Same approach as for capital-gain
# Build buckets of width 1000 -- [1, 1001), [1001, 2001), ... -- and store the mean
# capital-loss of the rows in each bucket under the key '<low> - <high>'.
# Rows with a capital-loss of 0 fall into no bucket because the first one starts at 1.
cap_ls = {}
for i in np.arange(1, 6000, 1000):
    in_bucket = (df['capital-loss'] >= i) & (df['capital-loss'] < i + 1000)
    bucket_mean = df.loc[in_bucket, 'capital-loss'].mean()

    # The mean of an empty bucket is NaN. `is np.nan` is an identity test that only
    # matches the np.nan singleton, so pd.isna is used to detect it and store 0.
    cap_ls[str(i) + ' - ' + str(i + 1000)] = 0 if pd.isna(bucket_mean) else bucket_mean

cap_ls

In [None]:
# same as capital-gain: replace every capital-loss by the mean of its bucket
# Bucket membership is decided from a snapshot of the original values and a whole
# bucket is overwritten at once. This replaces the row-by-row df.loc[i, ...] loop,
# which was slow and only worked when the row labels were exactly 0..n-1.
orig_loss = df['capital-loss'].copy()
new_loss = orig_loss.astype(float)  # the bucket means are floats

for j, k in cap_ls.items():
    # Keys look like '<low> - <high>'; the upper bound is exclusive
    t, r = (int(bound) for bound in j.split(' - '))
    new_loss[(orig_loss >= t) & (orig_loss < r)] = k

df['capital-loss'] = new_loss
    

In [None]:
# Here also, it doesn't look good, but I'm going to keep it (because we can't always remove outliers)
# You can change/remove the outliers if you want

df['capital-loss'].value_counts()

### HANDLING IMBALANCED DATA

In [None]:
# Here, we can see that we have imbalanced data
pd.DataFrame({'count': df.salary.value_counts(), '%': df.salary.value_counts(normalize = True)})

In [None]:
# It will help in balancing imbalanced data
smote = SMOTE()

In [None]:
# The imbalanced data
df.head()

In [None]:
# Transforming columns into numerical from categorical values
# as SMOTE supports numerical and not categorical values
# And also scaling the existing numerical values
# The three fitted objects are kept so that the transformation can be inverted
# after the data has been balanced.

Ordenc = OrdinalEncoder()
labenc = LabelEncoder()
scale = StandardScaler()

num_enc = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
cat_enc = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
cat_enc_tar = ['salary']

df[cat_enc] = Ordenc.fit_transform(df[cat_enc])
# LabelEncoder works on a 1-D array of labels, so give it the target Series (not a
# one-column DataFrame) and assign the codes back to that single column.
df[cat_enc_tar[0]] = labenc.fit_transform(df[cat_enc_tar[0]])
df[num_enc] = scale.fit_transform(df[num_enc])

In [None]:
# Balancing the data
# Only the feature columns go into X. The previous `.loc[:, :'salary']` slice is
# label-based and therefore inclusive, so it also passed the target column to SMOTE
# as if it were a feature.
X_train, y_train = smote.fit_resample(df.drop(columns = 'salary'), df['salary'])

In [None]:
# Data before balanced
df.head()

In [None]:
# Wrap the resampled features and target in DataFrames: the features get every
# column name of df except the last one, the target is named 'salary'
X_train, y_train = (
    pd.DataFrame(X_train, columns = df.columns[:-1]),
    pd.DataFrame(y_train, columns = ['salary']),
)

In [None]:
# Rebuild a readable, balanced dataframe by undoing the three transformations
# NOTE(review): SMOTE interpolates between rows, so synthetic rows may carry
# non-integer category codes here -- confirm how inverse_transform maps them.

a = pd.DataFrame(Ordenc.inverse_transform(X_train[cat_enc]), columns = cat_enc)  # categorical features
c = pd.DataFrame(scale.inverse_transform(X_train[num_enc]), columns = num_enc)   # numerical features
b = pd.DataFrame(labenc.inverse_transform(y_train), columns = ['salary'])        # target

# Join on the row index: categorical columns first, then numerical, target last
df = (a.merge(c, left_index = True, right_index = True)
       .merge(b, left_index = True, right_index = True))

In [None]:
# Balanced data frame
df.head()

In [None]:
# Shape of the balanced dataframe
df.shape

In [None]:
# Balanced data for salary column
df.salary.value_counts()

In [None]:
# age cannot be of float type
df['age'] = df['age'].astype('int')

### EXPLORATORY DATA ANALYSIS

###### Correlation between columns

In [None]:
# Ordinal-encode the categorical columns (target included) so that a correlation
# matrix can be computed; the remaining columns are passed through unchanged
col = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'salary']

# Column names of the transformer output: the encoded columns come first,
# followed by the passed-through columns
total_col = col + ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']

trans_col = make_column_transformer(
    (OrdinalEncoder(), col),
    remainder = 'passthrough',
)

corr_df = pd.DataFrame(trans_col.fit_transform(df), columns = total_col)
corr_df

In [None]:
plt.figure(figsize = (12, 10))

sns.heatmap(corr_df.corr(), annot = True)
plt.show()

##### Salary distribution in each age group

In [None]:
# Number of people in each salary class, per 10-year age band
age_dis = pd.cut(df['age'], [10, 20, 30,40,50,60,70,80,90, 100])
temp_df = df.copy()
temp_df['age_dis'] = age_dis

# observed = True keeps only the age bands that actually occur in the data and
# fill_value = 0 turns a missing (band, salary) combination into a zero-length bar
viz_age_dis_sal = (temp_df.groupby(['age_dis', 'salary'], observed = True)[['salary']]
                   .count().unstack(fill_value = 0)['salary'])

age_d = np.arange(len(viz_age_dis_sal.index))
width = 0.3

plt.figure(figsize = (10, 6))

plt.barh(age_d - width/2, viz_age_dis_sal.loc[:, '<=50K'], height = width, label = '<=50K', alpha = 0.75, edgecolor = 'black')
plt.barh(age_d + width/2, viz_age_dis_sal.loc[:, '>50K'], height = width, label = '>50K', alpha = 0.75, edgecolor = 'black')

plt.title('Salary distribution in each age group\n', fontsize = 25)
plt.xlabel('\nFrequency', fontsize = 20)
plt.ylabel('Age group\n', fontsize = 20)

plt.legend(frameon = True, fontsize = 15, shadow = True)
# Tick positions and labels both come from the plotted table, so they always match
# the bars (the previous hard-coded 7 ticks only fitted one particular age range)
plt.yticks(age_d, viz_age_dis_sal.index.astype(str))
plt.show()

##### Popular Occupation

In [None]:
# Share of every occupation in the dataset, drawn as a pie chart
occ_gr = df['occupation'].value_counts()

# Every wedge is pulled out of the centre by the same small offset
wedge_offsets = np.full(len(occ_gr), 0.05)

plt.figure(figsize = (10, 10))
plt.pie(
    occ_gr.values,
    labels = occ_gr.index,
    explode = wedge_offsets,
    autopct = '%1.2f%%',
    shadow = True,
    wedgeprops = {'edgecolor': 'black'},
    textprops = {'fontsize': 15},
)
plt.title('Popular Occupations\n', fontsize = 35)
plt.show()

##### Salary as compared to the working hours

In [None]:
# Number of people in each salary class, per working-hour range (10 equal-width bins)
wk_hr_dis = pd.cut(df['hours-per-week'], 10)

temp_wk = df.copy()
temp_wk['work_hr_dis'] = wk_hr_dis

# observed = True keeps only the ranges that actually occur in the data and
# fill_value = 0 turns a missing (range, salary) combination into a zero-height bar
viz_wk_hr_sal = (temp_wk.groupby(['work_hr_dis', 'salary'], observed = True)[['salary']]
                 .count().unstack(fill_value = 0)['salary'])

plt.figure(figsize = (15, 7))

# Bar positions are derived from the plotted table itself; deriving them from
# wk_hr_dis.unique() could give a different length than the bar heights
hr_d = np.arange(len(viz_wk_hr_sal.index))
width = 0.4

plt.bar(hr_d - width / 2, viz_wk_hr_sal['<=50K'], width = width, alpha = 0.75, edgecolor = 'black', label = '<=50k')
plt.bar(hr_d + width / 2, viz_wk_hr_sal['>50K'], width = width, alpha = 0.75, edgecolor = 'black', label = '>50k')

plt.title('Salary as compared to the working hours\n', fontsize = 25)
plt.xlabel('\nWorking hour range', fontsize = 20)
plt.ylabel('Frequency\n', fontsize = 20)

plt.xticks(hr_d, viz_wk_hr_sal.index.astype(str))
plt.legend(frameon = True, shadow = True, fontsize = 15, loc = 'best')
plt.show()

##### Work Time according to Age

In [None]:
# Mean weekly working hours for every age, compared with the overall median
# (.mean() is used directly; passing np.mean to aggregate() is deprecated in pandas)
age_wk_hr = df.groupby('age')[['hours-per-week']].mean()
wk_hr_median = df['hours-per-week'].median()
mean_hours = age_wk_hr.iloc[:, 0]

plt.figure(figsize = (8, 5))

plt.plot(age_wk_hr.index, mean_hours, linewidth = 5, color = 'steelblue', alpha = 0.5)

# Green where an age works more than the median, red where it works less
plt.fill_between(age_wk_hr.index, mean_hours, wk_hr_median,
                 where = (mean_hours > wk_hr_median),
                 interpolate= True, alpha = 0.25, color = 'green')

plt.fill_between(age_wk_hr.index, mean_hours, wk_hr_median,
                 where = (mean_hours < wk_hr_median),
                 interpolate= True, alpha = 0.25, color = 'red')


plt.axhline(wk_hr_median, color = 'red', linewidth = 2,
            label = 'Working hour Median')

plt.title('Work Time according to Age\n', fontsize = 25)
plt.xlabel('\nAge', fontsize = 20)
plt.ylabel('Hours per Week\n', fontsize = 20)

plt.legend(loc = 'best', frameon = True, shadow = True, fontsize = 15)
plt.show()

##### Chance of earning a salary of more or less than 50K

In [None]:
# For every age, the share of people in each salary class
# Each indicator column is 1 for the rows of its own salary class and 0 otherwise,
# so its mean per age is the share of that class
salary_flags = {
    'salary_>50K': {'<=50K': 0, '>50K': 1},
    'salary_<=50K': {'<=50K': 1, '>50K': 0},
}

df_temp = df.copy()
for flag_name, encoding in salary_flags.items():
    df_temp[flag_name] = df_temp['salary'].map(encoding)

age_sal = df_temp.groupby('age')[['salary_<=50K', 'salary_>50K']].mean()

plt.figure(figsize = (10, 5))

curves = (
    ('salary_<=50K', 'Chance of earning less than 50K'),
    ('salary_>50K', 'Chance of earning more than 50K'),
)
for flag_name, legend_text in curves:
    plt.plot(age_sal.index, age_sal[flag_name], label = legend_text)

plt.title('Chance of earning salary more than less than 50K\n', fontsize = 25)
plt.xlabel('\nAge', fontsize = 20)
plt.ylabel('Chance of earning salary\n', fontsize = 20)

plt.legend(loc = 'best', frameon = True, shadow = True, fontsize = 15)
plt.show()

##### Age|Work Hour|Sex|Salary

In [None]:
# Scatter of age against weekly working hours; colour encodes sex, marker size salary
df_temp = df.copy()

df_temp['salary'] = df_temp['salary'].map({'<=50K': 0, '>50K': 1})

# Share of >50K earners for every (age, hours-per-week) pair; pairs without data get 0
# (.mean() is used directly; passing np.mean to aggregate() is deprecated in pandas)
age_wkhr_sal = df_temp.groupby(['age', 'hours-per-week'])[['salary']].mean().unstack().fillna(0)['salary']

plt.figure(figsize = (10, 7))

# Colour value: Female -> 1, Male -> 2. Marker size: 10 for <=50K, 20 for >50K
plt.scatter(df_temp['age'], df_temp['hours-per-week'], c = df_temp['sex'].map({'Female': 1, 'Male': 2}),
            s = df_temp['salary'].map({0: 1, 1: 2}) * 10, cmap = 'summer')

plt.title('Age|Work Hour|Sex|Salary\n', fontsize = 25)
plt.xlabel('\nAge', fontsize = 20)
plt.ylabel('Work Hour\n', fontsize = 20)

plt.colorbar()
plt.show()

##### Percent of people earning income more than 50K

In [None]:
# Share of people earning more than 50K for every marital status, smallest share first
df_temp = df.copy()

# 1 for '>50K', 0 for '<=50K', so the group mean is the share of >50K earners
df_temp['salary'] = df_temp['salary'].map({'<=50K': 0, '>50K': 1})

# (.mean() is used directly; passing np.mean to aggregate() is deprecated in pandas)
ms_sal = df_temp.groupby('marital-status')[['salary']].mean().sort_values(by = 'salary', ascending = True)

plt.figure(figsize = (10, 5))

plt.barh(ms_sal.index, ms_sal.iloc[:, 0], color = '#FFBAA0', edgecolor = 'black', alpha = 0.75)

plt.title('Income according to marital-status\n', fontsize = 25)
plt.xlabel('\nPercent of people earning income more than 50K', fontsize = 20)
plt.ylabel('Marital-Status\n', fontsize = 20)

plt.show()

##### Earnings of more than 50K according to gender

In [None]:
# Share of people earning more than 50K for each gender
df_temp = df.copy()

# 1 for '>50K', 0 for '<=50K', so the group mean is the share of >50K earners
df_temp['salary'] = df_temp['salary'].map({'<=50K': 0, '>50K': 1})

gen_sal = df_temp.groupby('sex')[['salary']].mean()

plt.figure(figsize = (8, 10))

plt.bar(gen_sal.index, gen_sal['salary'], color = '#CCFFA0', alpha = 0.75)

plt.title('Earning\'s more than 50K\n', fontsize = 25)
plt.xlabel('\nGender', fontsize = 20)
# Axis label spelling corrected ('Percentange' -> 'Percentage')
plt.ylabel('Percentage\n', fontsize = 20)

plt.show()

##### Salary according to Occupation

In [None]:
# Share of people earning more than 50K for every occupation, smallest share first
df_temp = df.copy()

# 1 for '>50K', 0 for '<=50K', so the group mean is the share of >50K earners.
# The mapping reads from df_temp, like the other plotting cells do.
df_temp['salary'] = df_temp['salary'].map({'<=50K': 0, '>50K': 1})

occ_sal = df_temp.groupby('occupation')[['salary']].mean().sort_values(by = 'salary', ascending = True)

plt.barh(occ_sal.index, occ_sal['salary'], alpha = 0.5, color = 'steelblue', edgecolor = 'black')

plt.title('Salary according to Occupation\n', fontsize = 25)
# Axis label spelling corrected ('Perecent' -> 'Percent')
plt.xlabel('\nPercent of people earning more than 50K', fontsize = 20)
plt.ylabel('Occupation\n', fontsize = 20)

plt.show()

### MODEL CREATION

In [None]:
class Tuning_types:
    """Namespace giving the two hyper-parameter search classes short names."""

    # Exhaustive search over a parameter grid
    grid = GridSearchCV
    # Randomised search over the parameter space
    random = RandomizedSearchCV
    
class Classifiers:
    """One estimator instance per supported algorithm, stored as class attributes."""

    # The solver and iteration limit of the logistic regression are set by hand
    # because of some errors on the author's system
    # (you can try doing it without setting them manually)
    log = LogisticRegression(max_iter = 1000, solver = 'liblinear')

    # Every other estimator uses its default settings
    dt, rf = DecisionTreeClassifier(), RandomForestClassifier()
    svc, gnb, knn = SVC(), GaussianNB(), KNeighborsClassifier()
    
    



class Model(object):
    """Shared data handling for the model classes.

    Splits a dataframe into features ``X`` and target ``y``, provides a
    stratified train/test split and builds the column transformer that scales
    the numerical columns and encodes the categorical ones.
    """

    # Name of the target column: the last column of the module-level ``df`` at the
    # moment this class body is executed
    target_col = df.columns[-1]
    # Fraction of the rows that goes into the test set
    test_size = 0.25

    def __init__(self, df):

        self.df = df
        self.X = df.drop(Model.target_col, axis = 1).copy()
        self.y = df[Model.target_col].copy()

    def _categorical_cols(self):
        """Return the object-dtype columns of ``df``, the last column excluded."""
        return [i for i in self.df.columns[:-1] if self.df[i].dtype == object]

    def desc_cols(self):
        """Split the categorical feature columns by their number of distinct values.

        Returns:
            ``(oe, ohe)``: columns with at most 7 distinct values (ordinal
            encoding) and columns with more than 7 (one-hot encoding).
        """
        oe = []
        ohe = []

        for i in self._categorical_cols():
            if self.df[i].nunique() <= 7:
                oe.append(i)
            else:
                ohe.append(i)

        return oe, ohe

    @property
    def train_test_set(self):
        """Return ``X_train, X_test, y_train, y_test`` from a stratified, seeded split."""
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, stratify = self.y,
                                                            test_size = Model.test_size, random_state = 0)

        return X_train, X_test, y_train, y_test

    @staticmethod
    def _one_hot_encoder():
        """Build a dense-output OneHotEncoder on both old and new scikit-learn.

        The dense-output switch is called ``sparse_output`` in newer releases and
        ``sparse`` in older ones; the name the installed version rejects raises
        TypeError, so the other one is tried.
        """
        try:
            return OneHotEncoder(handle_unknown = 'ignore', sparse_output = False)
        except TypeError:
            return OneHotEncoder(handle_unknown = 'ignore', sparse = False)

    def transform_columns(self, ss = None, oe = None, ohe = None):
        """Build the (unfitted) column transformer used in front of every classifier.

        Args:
            ss: Columns (or column selector) to standard-scale. ``None`` selects
                every int/float column.
            oe: Columns to ordinal-encode. ``None`` or an empty list means "decide
                automatically".
            ohe: Columns to one-hot-encode. ``None`` or an empty list means "decide
                automatically".

        When neither ``oe`` nor ``ohe`` is given, ``desc_cols`` decides. When only
        one of them is given, all remaining categorical columns go to the other
        encoder, so no categorical column is left unencoded or encoded twice.

        Returns:
            A ColumnTransformer; columns not listed anywhere are passed through.
        """
        # None replaces the former mutable default arguments ([]); an empty list is
        # still accepted and means the same thing
        oe = list(oe) if oe else []
        ohe = list(ohe) if ohe else []

        if not oe and not ohe:
            oe, ohe = self.desc_cols()
        elif not ohe:
            ohe = [c for c in self._categorical_cols() if c not in oe]
        elif not oe:
            oe = [c for c in self._categorical_cols() if c not in ohe]

        if ss is None:
            ss = make_column_selector(dtype_include = ['int', 'float'])

        trans_col = make_column_transformer((StandardScaler(), ss),
                                           (self._one_hot_encoder(), ohe),
                                           (OrdinalEncoder(), oe),
                                           remainder  = 'passthrough')

        return trans_col

    @property
    def target_col_trans_col(self):
        """Return a column transformer that label-encodes the target column.

        NOTE(review): LabelEncoder is designed for 1-D label arrays; confirm that
        this transformer can actually be fitted before relying on it.
        """
        return make_column_transformer((LabelEncoder(), Model.target_col),
                                       remainder = 'passthrough')






class Logistic_Regression(Model):
    """Logistic-regression pipeline and hyper-parameter search on top of ``Model``.

    The train/test split is computed once in ``__init__`` and stored as
    ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
    """

    def __init__(self, df = df):
        super().__init__(df)

        self.X_train, self.X_test, self.y_train, self.y_test = self.train_test_set
        self.clf_log = Classifiers().log
        self.tune = Tuning_types()

    def pipe_log(self, ss = None, oe = None, ohe = None):
        """Return an unfitted pipeline: column transformer followed by the classifier.

        ``ss``, ``oe`` and ``ohe`` are forwarded to ``transform_columns``. ``ss``
        used to be replaced by ``None`` here, so a custom choice of columns to
        scale was silently ignored.
        """
        # None replaces the former mutable default arguments; lists are forwarded
        oe = [] if oe is None else oe
        ohe = [] if ohe is None else ohe

        return make_pipeline(self.transform_columns(ss = ss, oe = oe, ohe = ohe), self.clf_log)

    def tune_params_log(self,
                    tuning = 'grid',
                    penalty = ['l1', 'l2'],
                    dual = [False],
                    tol = [0.0001, 0.001, 0.01, 0.1],
                    C = [1, 2, 4, 5],
                    fit_intercept = [True],
                    intercept_scaling = [1],
                    class_weight = [None],
                    random_state = [None],
                    multi_class = ['auto'],
                    verbose = [0],
                    warm_start = [False],
                    n_jobs = [None],
                    l1_ratio = [None],
                                   ):
        """Return an unfitted hyper-parameter search over the pipeline.

        Args:
            tuning: ``'grid'`` for GridSearchCV or ``'random'`` for RandomizedSearchCV.
            Every other argument is the list of candidate values for the
            LogisticRegression parameter of the same name (the default lists are
            only read, never modified).

        Returns:
            The search object (``cv = 5``, ``verbose = 10``), or an error message
            string when ``tuning`` is not one of the two supported values.
        """
        candidates = {
            'penalty': penalty,
            'dual': dual,
            'tol': tol,
            'C': C,
            'fit_intercept': fit_intercept,
            'intercept_scaling': intercept_scaling,
            'class_weight': class_weight,
            'random_state': random_state,
            'multi_class': multi_class,
            'verbose': verbose,
            'warm_start': warm_start,
            'n_jobs': n_jobs,
            'l1_ratio': l1_ratio,
        }
        # make_pipeline names the step after the lower-cased estimator class
        params = {'logisticregression__' + name: values for name, values in candidates.items()}

        searches = {'grid': self.tune.grid, 'random': self.tune.random}
        if tuning not in searches:
            return "ERROR: Invalid tuning type.\nSet tuning as in ['grid', 'random'] in the parameter."

        return searches[tuning](self.pipe_log(), params, cv = 5, verbose = 10)

            
            
class Decision_Tree(Model):
    """Decision-tree pipeline and hyper-parameter search on top of ``Model``.

    The train/test split is computed once in ``__init__`` and stored as
    ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
    """

    def __init__(self, df = df):
        super().__init__(df)

        self.X_train, self.X_test, self.y_train, self.y_test = self.train_test_set
        self.clf_dt = Classifiers().dt
        self.tune = Tuning_types()

    def pipe_dt(self, ss = None, oe = None, ohe = None):
        """Return an unfitted pipeline: column transformer followed by the classifier.

        ``ss``, ``oe`` and ``ohe`` are forwarded to ``transform_columns``. ``ss``
        used to be replaced by ``None`` here, so a custom choice of columns to
        scale was silently ignored.
        """
        # None replaces the former mutable default arguments; lists are forwarded
        oe = [] if oe is None else oe
        ohe = [] if ohe is None else ohe

        return make_pipeline(self.transform_columns(ss = ss, oe = oe, ohe = ohe), self.clf_dt)

    def tune_params_dt(self,
                    tuning = 'grid',
                    criterion = ['entropy', 'gini'],
                    splitter = ['best'],
                    max_depth = [None],
                    min_samples_split = np.arange(2, 11, 2),
                    min_samples_leaf = np.arange(1,6),
                    min_weight_fraction_leaf = [0.0],
                    max_features = [None],
                    random_state = [None],
                    max_leaf_nodes = [None],
                    min_impurity_decrease = [0.0],
                    min_impurity_split = [None],
                    class_weight = [None],
                    ccp_alpha = [0.0]):
        """Return an unfitted hyper-parameter search over the pipeline.

        Args:
            tuning: ``'grid'`` for GridSearchCV or ``'random'`` for RandomizedSearchCV.
            Every other argument is the collection of candidate values for the
            DecisionTreeClassifier parameter of the same name (the defaults are
            only read, never modified).

        Returns:
            The search object (``cv = 5``, ``verbose = 10``), or an error message
            string when ``tuning`` is not one of the two supported values.
        """
        candidates = {
            'criterion': criterion,
            'splitter': splitter,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'min_weight_fraction_leaf': min_weight_fraction_leaf,
            'max_features': max_features,
            'random_state': random_state,
            'max_leaf_nodes': max_leaf_nodes,
            'min_impurity_decrease': min_impurity_decrease,
            'min_impurity_split': min_impurity_split,
            'class_weight': class_weight,
            'ccp_alpha': ccp_alpha,
        }
        # make_pipeline names the step after the lower-cased estimator class
        params = {'decisiontreeclassifier__' + name: values for name, values in candidates.items()}

        searches = {'grid': self.tune.grid, 'random': self.tune.random}
        if tuning not in searches:
            return "ERROR: Invalid tuning type.\nSet tuning as in ['grid', 'random'] in the parameter."

        return searches[tuning](self.pipe_dt(), params, cv = 5, verbose = 10)




        
        
class Random_Forest(Model):
    """Random-forest pipeline and hyper-parameter search on top of ``Model``.

    The train/test split is computed once in ``__init__`` and stored as
    ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
    """

    def __init__(self, df = df):
        super().__init__(df)

        self.X_train, self.X_test, self.y_train, self.y_test = self.train_test_set
        self.clf_rf = Classifiers().rf
        self.tune = Tuning_types()

    def pipe_rf(self, ss = None, oe = None, ohe = None):
        """Return an unfitted pipeline: column transformer followed by the classifier.

        ``ss``, ``oe`` and ``ohe`` are forwarded to ``transform_columns``. ``ss``
        used to be replaced by ``None`` here, so a custom choice of columns to
        scale was silently ignored.
        """
        # None replaces the former mutable default arguments; lists are forwarded
        oe = [] if oe is None else oe
        ohe = [] if ohe is None else ohe

        return make_pipeline(self.transform_columns(ss = ss, oe = oe, ohe = ohe), self.clf_rf)

    def tune_params_rf(self,
                    tuning = 'grid',
                    n_estimators = [50, 100, 150],
                    criterion = ['entropy', 'gini'],
                    max_depth = [None],
                    min_samples_split = np.arange(2, 11, 2),
                    min_samples_leaf = [1],
                    min_weight_fraction_leaf = [0.0],
                    max_features = ['auto'],
                    max_leaf_nodes = [None],
                    min_impurity_decrease = [0.0],
                    min_impurity_split = [None],
                    bootstrap = [True],
                    oob_score = [False],
                    n_jobs = [None],
                    random_state = [None],
                    verbose = [0],
                    warm_start = [False],
                    class_weight = [None],
                    ccp_alpha = [0.0],
                    max_samples = [None]
                   ):
        """Return an unfitted hyper-parameter search over the pipeline.

        Args:
            tuning: ``'grid'`` for GridSearchCV or ``'random'`` for RandomizedSearchCV.
            Every other argument is the collection of candidate values for the
            RandomForestClassifier parameter of the same name (the defaults are
            only read, never modified).

        Returns:
            The search object (``cv = 5``, ``verbose = 10``), or an error message
            string when ``tuning`` is not one of the two supported values.
        """
        # (a stray, unused RandomForestClassifier() instantiation was removed here)
        candidates = {
            'n_estimators': n_estimators,
            'criterion': criterion,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'min_weight_fraction_leaf': min_weight_fraction_leaf,
            'max_features': max_features,
            'max_leaf_nodes': max_leaf_nodes,
            'min_impurity_decrease': min_impurity_decrease,
            'min_impurity_split': min_impurity_split,
            'bootstrap': bootstrap,
            'oob_score': oob_score,
            'n_jobs': n_jobs,
            'random_state': random_state,
            'verbose': verbose,
            'warm_start': warm_start,
            'class_weight': class_weight,
            'ccp_alpha': ccp_alpha,
            'max_samples': max_samples,
        }
        # make_pipeline names the step after the lower-cased estimator class
        params = {'randomforestclassifier__' + name: values for name, values in candidates.items()}

        searches = {'grid': self.tune.grid, 'random': self.tune.random}
        if tuning not in searches:
            return "ERROR: Invalid tuning type.\nSet tuning as in ['grid', 'random'] in the parameter."

        return searches[tuning](self.pipe_rf(), params, cv = 5, verbose = 10)
        
        
        
        
class Support_Vector_Machine(Model):
    """Support-vector-classifier pipeline and hyper-parameter search on top of ``Model``.

    The train/test split is computed once in ``__init__`` and stored as
    ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
    """

    def __init__(self, df = df):
        super().__init__(df)

        self.X_train, self.X_test, self.y_train, self.y_test = self.train_test_set
        self.clf_svc = Classifiers().svc
        self.tune = Tuning_types()

    def pipe_svc(self, ss = None, oe = None, ohe = None):
        """Return an unfitted pipeline: column transformer followed by the classifier.

        ``ss``, ``oe`` and ``ohe`` are forwarded to ``transform_columns``. ``ss``
        used to be replaced by ``None`` here, so a custom choice of columns to
        scale was silently ignored.
        """
        # None replaces the former mutable default arguments; lists are forwarded
        oe = [] if oe is None else oe
        ohe = [] if ohe is None else ohe

        return make_pipeline(self.transform_columns(ss = ss, oe = oe, ohe = ohe), self.clf_svc)

    def tune_params_svc(self,
                    tuning = 'grid',
                    C = [1.0],
                    kernel = ['linear', 'rbf', 'poly'],
                    degree = [3],
                    gamma = ['scale'],
                    coef0 = [0.0],
                    shrinking = [True],
                    probability = [False],
                    tol = [1e-3],
                    cache_size = [200],
                    class_weight = [None],
                    verbose = [False],
                    max_iter = [-1],
                    decision_function_shape = ['ovr'],
                    break_ties = [False],
                    random_state = [None]
                   ):
        """Return an unfitted hyper-parameter search over the pipeline.

        Args:
            tuning: ``'grid'`` for GridSearchCV or ``'random'`` for RandomizedSearchCV.
            Every other argument is the list of candidate values for the SVC
            parameter of the same name (the default lists are only read, never
            modified).

        Returns:
            The search object (``cv = 5``, ``verbose = 10``), or an error message
            string when ``tuning`` is not one of the two supported values.
        """
        candidates = {
            'C': C,
            'kernel': kernel,
            'degree': degree,
            'gamma': gamma,
            'coef0': coef0,
            'shrinking': shrinking,
            'probability': probability,
            'tol': tol,
            'cache_size': cache_size,
            'class_weight': class_weight,
            'verbose': verbose,
            'max_iter': max_iter,
            'decision_function_shape': decision_function_shape,
            'break_ties': break_ties,
            'random_state': random_state,
        }
        # make_pipeline names the step after the lower-cased estimator class
        params = {'svc__' + name: values for name, values in candidates.items()}

        searches = {'grid': self.tune.grid, 'random': self.tune.random}
        if tuning not in searches:
            return "ERROR: Invalid tuning type.\nSet tuning as in ['grid', 'random'] in the parameter."

        return searches[tuning](self.pipe_svc(), params, cv = 5, verbose = 10)
        
        
    
class Gaussian_Naive_Bayes(Model):
    """Pipeline builder and hyperparameter search for Gaussian Naive Bayes.

    Builds on ``Model``, which supplies ``train_test_set`` and
    ``transform_columns`` (both are used below).
    """

    def __init__(self, df = df):
        """Initialise ``Model`` and cache the data split, classifier and tuner.

        NOTE: the default value of ``df`` is the module-level ``df`` as it
        exists when this class is defined, not when the constructor is called.
        """
        super().__init__(df)

        self.X_train, self.X_test, self.y_train, self.y_test = self.train_test_set
        self.clf_gnb = Classifiers().gnb
        self.tune = Tuning_types()

    def pipe_gnb(self, ss = None, oe = [], ohe = []):
        """Build the preprocessing + Gaussian Naive Bayes pipeline.

        ``ss``, ``oe`` and ``ohe`` are forwarded as the keyword arguments of
        the same names to ``self.transform_columns``.
        NOTE(review): presumably the columns to scale / ordinal-encode /
        one-hot-encode -- confirm against transform_columns.

        Returns the object produced by ``make_pipeline`` for the column
        transformer followed by ``self.clf_gnb``.
        """
        # Forward the caller's `ss`. It used to be hard-coded to None here, so
        # the argument was accepted but never used.
        column_transformer = self.transform_columns(ss = ss, oe = oe, ohe = ohe)

        return make_pipeline(column_transformer, self.clf_gnb)

    def tune_params_gnb(self,
                        tuning = 'grid',
                        var_smoothing = [1e-10, 1e-09, 1e-8],
                        priors = [None],
                        ):
        """Create a hyperparameter search over the Gaussian Naive Bayes pipeline.

        ``var_smoothing`` and ``priors`` are lists of candidate values.
        ``tuning`` selects the search helper on ``self.tune``: ``'grid'`` or
        ``'random'``.

        Returns whatever the selected helper returns for ``self.pipe_gnb()``
        (called with cv = 5, verbose = 10), or an error string when ``tuning``
        is neither of the two options.
        """
        # Keys use the '<step name>__<argument>' form; the step is 'gaussiannb'.
        params = {
            'gaussiannb__var_smoothing': var_smoothing,
            'gaussiannb__priors': priors
        }

        if tuning == 'grid':
            return self.tune.grid(self.pipe_gnb(), params, cv = 5, verbose = 10)

        if tuning == 'random':
            return self.tune.random(self.pipe_gnb(), params, cv = 5, verbose = 10)

        return "ERROR: Invalid tuning type.\nSet tuning as in ['grid', 'random'] in the parameter."
        
        
        
        
class KNearest_Neighbor(Model):
    """Pipeline builder and hyperparameter search for k-nearest neighbours.

    Builds on ``Model``, which supplies ``train_test_set`` and
    ``transform_columns`` (both are used below).
    """

    def __init__(self, df = df):
        """Initialise ``Model`` and cache the data split, classifier and tuner.

        NOTE: the default value of ``df`` is the module-level ``df`` as it
        exists when this class is defined, not when the constructor is called.
        """
        super().__init__(df)

        self.X_train, self.X_test, self.y_train, self.y_test = self.train_test_set
        self.clf_knn = Classifiers().knn
        self.tune = Tuning_types()

    def pipe_knn(self, ss = None, oe = [], ohe = []):
        """Build the preprocessing + k-nearest-neighbours pipeline.

        ``ss``, ``oe`` and ``ohe`` are forwarded as the keyword arguments of
        the same names to ``self.transform_columns``.
        NOTE(review): presumably the columns to scale / ordinal-encode /
        one-hot-encode -- confirm against transform_columns.

        Returns the object produced by ``make_pipeline`` for the column
        transformer followed by ``self.clf_knn``.
        """
        # Forward the caller's `ss`. It used to be hard-coded to None here, so
        # the argument was accepted but never used.
        column_transformer = self.transform_columns(ss = ss, oe = oe, ohe = ohe)

        return make_pipeline(column_transformer, self.clf_knn)

    def tune_params_knn(self,
                        tuning = 'grid',
                        n_neighbors = np.arange(5, 31, 5),
                        weights = ['uniform'],
                        algorithm = ['auto'],
                        leaf_size = [30],
                        p = [1, 2],
                        metric = ['minkowski'],
                        metric_params = [None],
                        n_jobs = [None]
                       ):
        """Create a hyperparameter search over the k-nearest-neighbours pipeline.

        Every keyword other than ``tuning`` is a sequence of candidate values
        for the classifier argument of the same name. ``tuning`` selects the
        search helper on ``self.tune``: ``'grid'`` or ``'random'``.

        Returns whatever the selected helper returns for ``self.pipe_knn()``
        (called with cv = 5, verbose = 10), or an error string when ``tuning``
        is neither of the two options.
        """
        # Keys use the '<step name>__<argument>' form; the step is
        # 'kneighborsclassifier'.
        params = {
            'kneighborsclassifier__n_neighbors': n_neighbors,
            'kneighborsclassifier__weights': weights,
            'kneighborsclassifier__algorithm': algorithm,
            'kneighborsclassifier__leaf_size': leaf_size,
            'kneighborsclassifier__p': p,
            'kneighborsclassifier__metric': metric,
            'kneighborsclassifier__metric_params': metric_params,
            'kneighborsclassifier__n_jobs': n_jobs
        }

        if tuning == 'grid':
            return self.tune.grid(self.pipe_knn(), params, cv = 5, verbose = 10)

        if tuning == 'random':
            return self.tune.random(self.pipe_knn(), params, cv = 5, verbose = 10)

        return "ERROR: Invalid tuning type.\nSet tuning as in ['grid', 'random'] in the parameter."
        
        

class Evaluate(Logistic_Regression,
               Decision_Tree,
               Random_Forest,
               Support_Vector_Machine,
               Gaussian_Naive_Bayes,
               KNearest_Neighbor):
    """Compare each classifier before and after hyperparameter search.

    The ``pipe_*`` and ``tune_params_*`` methods used here are expected to come
    from the listed base classes. Each ``evaluate_*`` attribute is a property,
    so merely accessing it (``ev.evaluate_log``) fits two models and prints the
    report; the property value itself is ``None``.
    """

    # Names accepted by `evaluate_all`, mapped to the property that runs them.
    _EVALUATORS = {
        'logisticregression': 'evaluate_log',
        'decisiontreeclassifier': 'evaluate_dt',
        'randomforestclassifier': 'evaluate_rf',
        'supportvectorclassifier': 'evaluate_svc',
        'gaussiannaivebayes': 'evaluate_gnb',
        'knearestneighbor': 'evaluate_knn',
    }

    def __init__(self, df):
        """Run the base-class initialisers and cache the train/test split."""
        super().__init__(df)

        self.X_train, self.X_test, self.y_train, self.y_test = self.train_test_set

    def show(self, y_pred_nm_train, y_pred_nm_test, y_pred_tm_train, y_pred_tm_test):
        """Print accuracy, confusion matrices and classification reports.

        Parameters
        ----------
        y_pred_nm_train, y_pred_nm_test
            Predictions of the untuned ("normal") model on the training and
            testing sets.
        y_pred_tm_train, y_pred_tm_test
            Predictions of the tuned model on the training and testing sets.

        Returns
        -------
        None. The report is written to standard output.
        """
        accuracy_tr_nm = accuracy_score(self.y_train, y_pred_nm_train)
        accuracy_ts_nm = accuracy_score(self.y_test, y_pred_nm_test)

        accuracy_tr_tm = accuracy_score(self.y_train, y_pred_tm_train)
        accuracy_ts_tm = accuracy_score(self.y_test, y_pred_tm_test)

        acc = pd.DataFrame({'Training Set': [accuracy_tr_nm, accuracy_tr_tm],
                            'Testing Set': [accuracy_ts_nm, accuracy_ts_tm]},
                           index = ['Before parameter optimization',
                                    'After parameter optimization'])

        print('/' * 100, '\n')
        print('Acccuracy Score:- ')
        print('*' * 50)
        print(acc)
        print('*' * 50, '\n')

        train_preds = (y_pred_nm_train, y_pred_tm_train)
        test_preds = (y_pred_nm_test, y_pred_tm_test)

        # Both sections share one layout, so they go through the same printer.
        self._print_metric_section('Confusion Matrix', confusion_matrix,
                                   train_preds, test_preds)
        self._print_metric_section('Classification Report', classification_report,
                                   train_preds, test_preds)

        print('/' * 100)

    def _print_metric_section(self, title, metric, train_preds, test_preds):
        """Print ``metric`` before/after tuning for the training and testing sets."""
        # Compute all four values first so a failing metric prints no partial section.
        train_before, train_after = [metric(self.y_train, pred) for pred in train_preds]
        test_before, test_after = [metric(self.y_test, pred) for pred in test_preds]

        print(title + ':- ')
        print('*' * 50)
        print(title + ' (Training Set):- ')
        print('Before parameter tuning:- \n', train_before)
        print('After parameter tuning:- \n', train_after)
        print('\n')
        print(title + ' (Testing Set):- ')
        print('Before parameter tuning:- \n', test_before)
        print('After parameter tuning:- \n', test_after)
        print('*' * 50, '\n')

    def _evaluate(self, build_pipeline, build_search):
        """Fit the untuned pipeline and the parameter search, then print the report.

        ``build_pipeline`` and ``build_search`` are zero-argument callables
        (the bound ``pipe_*`` / ``tune_params_*`` methods). They are called
        here, in that order, so nothing is built until evaluation starts.
        """
        normal_model = build_pipeline().fit(self.X_train, self.y_train)
        tuned_model = build_search().fit(self.X_train, self.y_train)

        y_pred_nm_train = normal_model.predict(self.X_train)
        y_pred_nm_test = normal_model.predict(self.X_test)

        y_pred_tm_train = tuned_model.predict(self.X_train)
        y_pred_tm_test = tuned_model.predict(self.X_test)

        self.show(y_pred_nm_train, y_pred_nm_test, y_pred_tm_train, y_pred_tm_test)

    @property
    def evaluate_log(self):
        """Evaluate logistic regression on access; prints the report, yields None."""
        self._evaluate(self.pipe_log, self.tune_params_log)

    @property
    def evaluate_dt(self):
        """Evaluate the decision tree on access; prints the report, yields None."""
        self._evaluate(self.pipe_dt, self.tune_params_dt)

    @property
    def evaluate_rf(self):
        """Evaluate the random forest on access; prints the report, yields None."""
        self._evaluate(self.pipe_rf, self.tune_params_rf)

    @property
    def evaluate_svc(self):
        """Evaluate the support vector classifier on access; prints the report, yields None."""
        self._evaluate(self.pipe_svc, self.tune_params_svc)

    @property
    def evaluate_gnb(self):
        """Evaluate Gaussian Naive Bayes on access; prints the report, yields None."""
        self._evaluate(self.pipe_gnb, self.tune_params_gnb)

    @property
    def evaluate_knn(self):
        """Evaluate k-nearest neighbours on access; prints the report, yields None."""
        self._evaluate(self.pipe_knn, self.tune_params_knn)

    def evaluate_all(self, *models):
        """Evaluate every model named in ``models``, in the order given.

        Accepted names are the keys of ``_EVALUATORS``. An unrecognised name
        used to be skipped without any feedback; it is still skipped, but an
        error line is now printed so a misspelling is not mistaken for a
        completed evaluation.
        """
        for model_name in models:
            # Non-string entries can never match a key (and may be unhashable).
            property_name = (self._EVALUATORS.get(model_name)
                             if isinstance(model_name, str) else None)

            if property_name is None:
                print(f"ERROR: Unknown model {model_name!r}.\n"
                      f"Choose from {list(self._EVALUATORS)}.")
                continue

            # Reading the property is what triggers the evaluation.
            getattr(self, property_name)
            


In [None]:
# Build the evaluator: Evaluate.__init__ runs the base-class initialisers with `df`
# and unpacks `train_test_set` into X_train / X_test / y_train / y_test.
ev = Evaluate(df)

##### 1) Logistic Regression

In [None]:
# `evaluate_log` is a property: reading it fits the plain and the tuned pipeline and prints the metrics.
ev.evaluate_log

##### 2) Decision Tree

In [None]:
# `evaluate_dt` is a property: reading it fits the plain and the tuned pipeline and prints the metrics.
ev.evaluate_dt

##### 3) Random Forest 

In [None]:
# `evaluate_rf` is a property: reading it fits the plain and the tuned pipeline and prints the metrics.
ev.evaluate_rf

##### 4) Support Vector Classifier

In [None]:
# `evaluate_svc` is a property: reading it fits the plain and the tuned pipeline and prints the metrics.
ev.evaluate_svc

##### 5) Gaussian Naive Bayes

In [None]:
# `evaluate_gnb` is a property: reading it fits the plain and the tuned pipeline and prints the metrics.
ev.evaluate_gnb

##### 6) KNearest Neighbors

In [None]:
# `evaluate_knn` is a property: reading it fits the plain and the tuned pipeline and prints the metrics.
ev.evaluate_knn

In [None]:
# You can change or set different hyperparameters (mine are not necessarily the best ones).
# Changing the hyperparameters may give better results.

# I have kept them like this because of time and system constraints.