In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.utils import resample
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')


# Other Libraries
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.metrics import classification_report_imbalanced
from imblearn.pipeline import Pipeline
from numpy import where

### Adult Dataset

#### Get Data remotely

In [2]:
# ! mkdir -p data/adult
# ! wget http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
# ! mv adult.data data/adult/adult1.csv
# ! sed  -i -e '1i"age","workclass","fnlwgt","education","education-num","marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country","salary"' data/adult/adult1.csv

In [3]:
dataset_adult = pd.read_csv('../data/adult.csv')

In [4]:
dataset_adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [5]:
dataset_adult.replace(' ?', np.NaN,inplace=True)
dataset_adult.dropna(axis=0,how='any',inplace=True)
dataset_adult = dataset_adult.drop('education', axis =1) #drop education col as its the same as education-num

In [6]:
dataset_adult.shape

(48842, 14)

In [7]:
# Create class for encoding
class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)
cat_col = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

In [8]:
dataset_adult.loc[dataset_adult["income"] == " <=50K", "income"] = 0
dataset_adult.loc[dataset_adult["income"] == " >50K", "income"] = 1

# data_adult_processed = MultiColumnLabelEncoder(columns = cat_col).fit_transform(dataset_adult)

In [9]:
# data_adult_processed.head()

In [10]:
# data_adult_processed.to_csv(r'data/process_adult.csv', index=False)
dataset_adult.to_csv(r'../data/process_adult.csv', index=False)

### Credit Dataset (Unbalanced)

#### Upload Data Locally

In [12]:
df_credit = pd.read_csv('../data/creditcard.csv')

In [13]:
# The classes are heavily skewed we need to solve this issue later.
print('No Frauds', round(df_credit['Class'].value_counts()[0]/len(df_credit) * 100,2), '% of the dataset')
print('Frauds', round(df_credit['Class'].value_counts()[1]/len(df_credit) * 100,2), '% of the dataset')

No Frauds 99.83 % of the dataset
Frauds 0.17 % of the dataset


In [14]:
# Since most of our data has already been scaled we should scale the columns that are left to scale (Amount and Time)
from sklearn.preprocessing import StandardScaler, RobustScaler

# RobustScaler is less prone to outliers.

std_scaler = StandardScaler()
rob_scaler = RobustScaler()

df_credit['scaled_amount'] = rob_scaler.fit_transform(df_credit['Amount'].values.reshape(-1,1))
df_credit['scaled_time'] = rob_scaler.fit_transform(df_credit['Time'].values.reshape(-1,1))

df_credit.drop(['Time','Amount'], axis=1, inplace=True)

In [15]:
scaled_amount = df_credit['scaled_amount']
scaled_time = df_credit['scaled_time'] 

df_credit.drop(['scaled_amount', 'scaled_time'], axis=1, inplace=True)
df_credit.insert(0, 'scaled_amount', scaled_amount)
df_credit.insert(1, 'scaled_time', scaled_time)

# Amount and Time are Scaled

df_credit.head()

Unnamed: 0,scaled_amount,scaled_time,V1,V2,V3,V4,V5,V6,V7,V8,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,1.783274,-0.994983,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0
1,-0.269825,-0.994983,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0
2,4.983721,-0.994972,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0
3,1.418291,-0.994972,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0
4,0.670579,-0.99496,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0


### Credit Dataset (Balanced)

In [16]:
df_credit_balance = df_credit.copy()

#### Without Undersampling

In [17]:
#get X and y
X = df_credit_balance.drop('Class', axis =1)
y = df_credit_balance['Class'].astype('int')
# summarize class distribution
counter = Counter(y)
print(counter)
# transform the dataset
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)
# summarize the new class distribution
counter = Counter(y)
print(counter)

Counter({0: 284315, 1: 492})
Counter({0: 284315, 1: 284315})


In [18]:
df_credit_bal = X
df_credit_bal['Class'] = y

#### With Undersampling (One we will use)

In [19]:
#get X and y
X = df_credit_balance.drop('Class', axis =1)
y = df_credit_balance['Class'].astype('int')

# summarize class distribution
counter = Counter(y)
print(counter)

#define pipleine
over = SMOTE(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# transform the dataset
X, y = pipeline.fit_resample(X, y)
# summarize the new class distribution
counter = Counter(y)
print(counter)


Counter({0: 284315, 1: 492})
Counter({0: 56862, 1: 28431})


In [20]:
df_credit_bal_under = X
df_credit_bal_under['Class'] = y

In [21]:
df_credit_bal_under.head()

Unnamed: 0,scaled_amount,scaled_time,V1,V2,V3,V4,V5,V6,V7,V8,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
80406,6.440578,-0.308051,0.377836,-2.475003,1.079772,0.229062,-2.479622,-0.06164,-0.764186,0.022007,...,0.988649,0.397625,0.349268,-0.433377,0.800701,0.222112,-0.189243,-0.016041,0.112632,0
102707,0.449242,-0.192272,-7.552255,4.713263,-2.727334,0.974608,-3.298558,0.114505,-2.064925,2.52676,...,-1.027275,0.457748,-0.002029,0.310889,0.164761,0.154349,-0.523684,-4.01796,0.077316,0
67064,0.300286,-0.379974,1.323398,-0.609295,0.810854,-0.628913,-1.319789,-0.843542,-0.723782,-0.157797,...,0.185091,0.109267,0.174762,0.026068,0.412914,0.266044,-0.33999,0.023604,0.03342,0
229862,1.123454,0.721237,2.106113,-1.772438,-0.951809,-1.593071,-1.411557,-0.369299,-1.214717,-0.046858,...,-0.29291,-0.095116,-0.012034,0.117059,-0.53148,-0.280462,-0.174319,-0.005162,-0.048103,0
118453,3.124153,-0.112877,-6.107283,1.555293,-2.828805,0.066361,-4.982163,-0.232961,-1.720388,3.445435,...,-1.356991,0.046359,-0.368387,-0.227321,0.546217,-0.814859,0.588336,-0.92204,-0.70084,0
