In [1]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport

# train_df = pd.read_csv(f'{INPUT_DIR}adult.data', header = None, names = COL_NAMES)
# test_df = pd.read_csv(f'{INPUT_DIR}adult.test', header = None, names = COL_NAMES)

In [37]:

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from config import MEAN_ENCODED_COLUMN, mean_encoding, TARGET_COLUMN
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer

class CustomMeanEncoder(BaseEstimator, TransformerMixin):
    
    def __init__(self, target_col = None, column_array=[], encoder = []):
        self.target_col = target_col
        self.mean_encoding = encoder
        self.column_array = column_array
        
    def fit(self, df):
        if self.mean_encoding != []:
            return self
        for i in self.column_array:
            self.mean_encoding.append(df.groupby([i])[self.target_col].mean().to_dict()[TARGET_COLUMN])   
        # for test set pipelining
        mean_encoding = self.mean_encoding
        return self
    
    def transform(self, df):
        for encodings,column in zip(self.mean_encoding,self.column_array):
            df[column+ MEAN_ENCODED_COLUMN] = df[column].apply(lambda value: encodings[value])
        return df

class DropColumns(BaseEstimator, TransformerMixin):
    
    def __init__(self,column_array=[]):
        self.column_array = column_array
        
    def fit(self, df, y = None):
        return self
    
    def transform(self, df):
        return df.drop(self.column_array, axis = 1)


class CustomPreprocessing(BaseEstimator, TransformerMixin):
    
    # use this for manual feature engineering

    def __init__(self,drop_=True):
        pass

    def fit(self, df, y = None):
        return self
    
    def transform(self, df):
        return self

    
class CustomOneHotEncoder(BaseEstimator, TransformerMixin):
    
    def __init__(self,column_array=[], column_array_drop_first=[]):
        self.column_array = column_array
        self.column_array_drop_first = column_array_drop_first
    
    def fit(self, df):
        return self
    
    def transform(self,df):
        df = pd.get_dummies(df, columns = self.column_array)
        if self.column_array_drop_first != [] :
            df = pd.get_dummies(df, columns = self.column_array_drop_first, drop_first = True)
        return df

class CustomNormalizer(BaseEstimator, TransformerMixin):
    
    def __init__(self, ntype = "sc", column_array = [],*args,**kwargs):
        self.column_array = column_array
        self.ntype = ntype
        if self.ntype == "pt":
            self.cn = PowerTransformer(*args,**kwargs)
        elif self.ntype == "mm":
            self.cn = MinMaxScaler()
        else:
            self.cn = StandardScaler()
    
    def fit(self, df):
        self.cn.fit(df[self.column_array])
        return self
        
    def transform(self, df):
        df[self.column_array] = self.cn.transform(df[self.column_array])
        return df

In [4]:
# train_raw_report = ProfileReport(test_df, explorative = True) 
# train_raw_report.to_file("train_raw_report.html")
# test_raw_report = ProfileReport(test_df, explorative = True) 
# test_raw_report.to_file("test_raw_report.html")

In [6]:
concat_df = pd.concat([train_df,test_df])
concat_df.drop(['education','capital-gain','capital-loss'],1, inplace = True)
concat_df = concat_df.replace([' ?'],value = np.nan).dropna()

concat_df = pd.get_dummies(concat_df, drop_first = False, columns = CAT_COL)
concat_df['is_test'] = concat_df['50k'].apply(lambda x : 1 if x in [' <=50K.', ' >50K.'] else 0)
concat_df['50k'] = concat_df['50k'].apply(lambda x: ' >50K' if x in [' >50K', ' >50K.'] else ' <=50K')

concat_df = pd.get_dummies(concat_df, drop_first = True, columns = ['50k'])

In [7]:
concat_df.reset_index(drop = True, inplace = True)

In [8]:
concat_copy = concat_df.copy()

In [22]:
standardize_list = list(concat_copy.columns)
standardize_list.remove('is_test')
standardize_list.remove('50k_ >50K')

In [23]:
from sklearn.preprocessing import StandardScaler

autoscaler = StandardScaler()
concat_copy[standardize_list] = autoscaler.fit_transform(concat_copy[standardize_list])

In [24]:
concat_copy

Unnamed: 0,age,fnlwgt,hours-per-week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,...,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia,is_test,50k_ >50K
0,0.034201,-1.062295,-0.078120,-0.179133,-0.271285,-1.671940,-0.194353,-0.302710,4.715765,-0.021554,...,-0.021035,-0.047312,-0.034896,-0.025332,-0.023985,0.308506,-0.042881,-0.022558,0,0
1,0.866417,-1.007438,-2.326738,-0.179133,-0.271285,-1.671940,-0.194353,3.303493,-0.212055,-0.021554,...,-0.021035,-0.047312,-0.034896,-0.025332,-0.023985,0.308506,-0.042881,-0.022558,0,0
2,-0.041455,0.245284,-0.078120,-0.179133,-0.271285,0.598108,-0.194353,-0.302710,-0.212055,-0.021554,...,-0.021035,-0.047312,-0.034896,-0.025332,-0.023985,0.308506,-0.042881,-0.022558,0,0
3,1.093385,0.425853,-0.078120,-0.179133,-0.271285,0.598108,-0.194353,-0.302710,-0.212055,-0.021554,...,-0.021035,-0.047312,-0.034896,-0.025332,-0.023985,0.308506,-0.042881,-0.022558,0,0
4,-0.798015,1.407393,-0.078120,-0.179133,-0.271285,0.598108,-0.194353,-0.302710,-0.212055,-0.021554,...,-0.021035,-0.047312,-0.034896,-0.025332,-0.023985,-3.241430,-0.042881,-0.022558,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45217,-0.419735,0.525154,-0.078120,-0.179133,-0.271285,0.598108,-0.194353,-0.302710,-0.212055,-0.021554,...,-0.021035,-0.047312,-0.034896,-0.025332,-0.023985,0.308506,-0.042881,-0.022558,1,0
45218,0.034201,0.243135,-0.411249,-0.179133,-0.271285,0.598108,-0.194353,-0.302710,-0.212055,-0.021554,...,-0.021035,-0.047312,-0.034896,-0.025332,-0.023985,0.308506,-0.042881,-0.022558,1,0
45219,-0.041455,1.753613,0.754701,-0.179133,-0.271285,0.598108,-0.194353,-0.302710,-0.212055,-0.021554,...,-0.021035,-0.047312,-0.034896,-0.025332,-0.023985,0.308506,-0.042881,-0.022558,1,0
45220,0.412481,-1.001947,-0.078120,-0.179133,-0.271285,0.598108,-0.194353,-0.302710,-0.212055,-0.021554,...,-0.021035,-0.047312,-0.034896,-0.025332,-0.023985,0.308506,-0.042881,-0.022558,1,0


In [26]:
TARGET_COL

'50k_ >50K'

In [27]:
TEST_IND = 30162

In [54]:
from sklearn.model_selection import StratifiedKFold, train_test_split
X_train, X_test, y_train, y_test = train_test_split(ddf.drop('target',1),ddf.target.values,test_size =0.26)

In [55]:
from sklearn.linear_model import LogisticRegression

In [56]:
X = np.array(X_train)

In [57]:
y = np.array(y_train)

In [58]:
from sklearn.svm import SVC

In [59]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

In [60]:
skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=True)
fold_no = 0
for train_index, test_index in skf.split(X, y):
    X_samp_train, X_samp_test = X[train_index], X[test_index]
    y_samp_train, y_samp_test = y[train_index], y[test_index]
    
    fold_no += 1
    print(f"Fold : {fold_no}          Model used : Logistic Regression")
    
    lc = LogisticRegression(max_iter = 20000, class_weight = "balanced").fit(X_samp_train,y_samp_train)
    
    print("Precision : {}".format(precision_score(y_samp_test, lc.predict(X_samp_test))))
    print("Recall : {}".format(recall_score(y_samp_test, lc.predict(X_samp_test))))
    print("f1_score : {}".format(f1_score(y_samp_test, lc.predict(X_samp_test))))
    array_ = confusion_matrix(y_samp_test, lc.predict(X_samp_test))
    print(array_)
#     df_cm = pd.DataFrame(array_, index = [i for i in ['1','0']], columns = [i for i in ['1','0']])
#     plt.figure(figsize = (10,7))
    #sn.heatmap(df_cm, annot=True)
    #plt.show()
    print("")
    print("")

Fold : 1          Model used : Logistic Regression
Precision : 0.566301703163017
Recall : 0.8357271095152603
f1_score : 0.6751269035532995
[[2637  713]
 [ 183  931]]


Fold : 2          Model used : Logistic Regression
Precision : 0.558600583090379
Recall : 0.8591928251121076
f1_score : 0.6770318021201414
[[2592  757]
 [ 157  958]]


Fold : 3          Model used : Logistic Regression
Precision : 0.5595947556615017
Recall : 0.842152466367713
f1_score : 0.6723952738990332
[[2610  739]
 [ 176  939]]


Fold : 4          Model used : Logistic Regression
Precision : 0.5631358467983244
Recall : 0.8439461883408071
f1_score : 0.6755204594400575
[[2619  730]
 [ 174  941]]


Fold : 5          Model used : Logistic Regression
Precision : 0.5357554786620531
Recall : 0.8339317773788151
f1_score : 0.6523876404494382
[[2544  805]
 [ 185  929]]




In [50]:
ddf = pd.read_csv('../input/processed/train.csv')

from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ("cn", CustomNormalizer(column_array = ['age', 'fnlwgt', 'hours-per-week'])),
    ("cme", CustomMeanEncoder(['target'], ['marital-status','education-num','occupation',
                                                    'workclass','relationship','sex','race','native-country'])),
    ("drop", DropColumns(['education', 'capital-gain', 'capital-loss']) ),
    ("ohe", CustomOneHotEncoder(['marital-status','education-num','occupation',
                                                    'workclass','relationship','race','native-country'], ['sex']))])
ddf = pipe.fit_transform(ddf)

Index(['age', 'fnlwgt', 'hours-per-week', 'target', 'marital-status_mean_enc',
       'marital-status_ Divorced', 'marital-status_ Married-AF-spouse',
       'marital-status_ Married-civ-spouse',
       'marital-status_ Married-spouse-absent',
       'marital-status_ Never-married',
       ...
       'native-country_ Puerto-Rico', 'native-country_ Scotland',
       'native-country_ South', 'native-country_ Taiwan',
       'native-country_ Thailand', 'native-country_ Trinadad&Tobago',
       'native-country_ United-States', 'native-country_ Vietnam',
       'native-country_ Yugoslavia', 'sex_ Male'],
      dtype='object', length=102)