In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.utils import resample
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')


# Other Libraries
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.metrics import classification_report_imbalanced
from imblearn.pipeline import Pipeline
from numpy import where

### Adult Dataset

#### Get Data remotely

In [None]:
# Optional, disabled shell recipe: download the raw UCI "adult.data" file,
# move it to data/adult/adult1.csv and prepend a CSV header row with sed.
# NOTE(review): the header below names the last column "salary" and the file
# lands in data/adult/adult1.csv, whereas the following cells read
# '../data/adult.csv' and filter on an "income" column -- confirm which
# file/header is the intended input before re-enabling these lines.
# ! mkdir -p data/adult
# ! wget http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
# ! mv adult.data data/adult/adult1.csv
# ! sed  -i -e '1i"age","workclass","fnlwgt","education","education-num","marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country","salary"' data/adult/adult1.csv

In [None]:
# Read the Adult CSV into a DataFrame; the path is relative to the
# notebook's working directory.
dataset_adult = pd.read_csv('../data/adult.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
dataset_adult.head()

In [None]:
# Clean the Adult frame in one chained, re-bindable expression:
#   1. turn the ' ?' placeholder (note the leading space) into a real NaN,
#   2. drop every row that contains at least one missing value,
#   3. drop the 'education' column.
# `np.nan` is used instead of `np.NaN`: the capitalised alias was removed in
# NumPy 2.0 and raises AttributeError there.
dataset_adult = (
    dataset_adult
    .replace(' ?', np.nan)
    .dropna(axis=0, how='any')
    .drop('education', axis=1)  # original author's note: same information as education-num
)

In [None]:
# Show (rows, columns) after cleaning.
dataset_adult.shape

In [None]:
# Create class for encoding
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in one call.

    Every selected column is encoded independently with a brand-new
    ``LabelEncoder``. The encoders are discarded after use, so the learned
    mapping cannot be re-applied to other data or inverted.

    Parameters
    ----------
    columns : sequence of str, optional
        Names of the columns to encode. ``None`` (the default) encodes every
        column of the frame handed to ``transform``.
    """

    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        """Do nothing and return ``self``; encoders are fitted in ``transform``."""
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.

        Returns a copy of X; the input frame is left unmodified.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() is the
            # equivalent that exists in both old and new pandas versions.
            for colname,col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        """Convenience wrapper: ``fit`` followed by ``transform`` on X."""
        return self.fit(X,y).transform(X)
# Categorical columns of the Adult dataset intended for label encoding.
cat_col = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

In [None]:
# Recode the income labels as integers: " <=50K" -> 0 and " >50K" -> 1
# (both labels carry a leading space). Rows whose label matches neither
# string are left untouched.
income_codes = {" <=50K": 0, " >50K": 1}
for income_label, income_code in income_codes.items():
    dataset_adult.loc[dataset_adult["income"] == income_label, "income"] = income_code

# data_adult_processed = MultiColumnLabelEncoder(columns = cat_col).fit_transform(dataset_adult)

In [None]:
# data_adult_processed.head()

In [None]:
# data_adult_processed.to_csv(r'data/process_adult.csv', index=False)
# Write the cleaned frame to CSV without the index column. The label-encoding
# step is commented out in this notebook, so categorical columns are saved
# as they are in dataset_adult.
dataset_adult.to_csv(r'../data/process_adult.csv', index=False)

### Credit Dataset (Unbalanced)

#### Load Data from Local Disk

In [2]:
# Directory (relative path) that the credit-card CSV files below are read
# from and written to.
credit_card_data_dir = "../data/credit_card"

In [4]:
# Read creditcard.csv from the credit-card data directory into df_credit.
df_credit = pd.read_csv(credit_card_data_dir + '/creditcard.csv')

In [5]:
# The classes are heavily skewed; this has to be dealt with later.
# Count each class once, then report its share of all rows as a percentage.
class_counts = df_credit['Class'].value_counts()
n_rows = len(df_credit)
for class_label, class_id in (('No Frauds', 0), ('Frauds', 1)):
    print(class_label, round(class_counts[class_id]/n_rows * 100,2), '% of the dataset')

No Frauds 99.83 % of the dataset
Frauds 0.17 % of the dataset


In [6]:
# Since most of our data has already been scaled we should scale the columns that are left to scale (Amount and Time)
from sklearn.preprocessing import StandardScaler, RobustScaler

# RobustScaler is less prone to outliers.
# std_scaler is instantiated here but not used by the cells shown in this notebook.
std_scaler = StandardScaler()
rob_scaler = RobustScaler()

# The same RobustScaler instance is re-fitted on each raw column in turn
# (Amount first, then Time), so afterwards it holds the fit for Time.
for raw_col, scaled_col in (('Amount', 'scaled_amount'), ('Time', 'scaled_time')):
    column_2d = df_credit[raw_col].values.reshape(-1,1)
    df_credit[scaled_col] = rob_scaler.fit_transform(column_2d)

# The unscaled originals are removed from df_credit in place.
df_credit.drop(['Time','Amount'], axis=1, inplace=True)

In [7]:
# Move the two scaled columns to the front of the frame: pop() removes each
# column from df_credit in place and hands back its Series, which is then
# re-inserted at position 0 / 1.
scaled_amount = df_credit.pop('scaled_amount')
scaled_time = df_credit.pop('scaled_time')

for position, scaled_series in enumerate((scaled_amount, scaled_time)):
    df_credit.insert(position, scaled_series.name, scaled_series)

# Amount and Time are Scaled

df_credit.head()

Unnamed: 0,scaled_amount,scaled_time,V1,V2,V3,V4,V5,V6,V7,V8,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,1.783274,-0.994983,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0
1,-0.269825,-0.994983,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0
2,4.983721,-0.994972,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0
3,1.418291,-0.994972,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0
4,0.670579,-0.99496,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0


In [8]:
# Save the frame with the scaled amount/time columns, before any resampling,
# as the "unbalanced" dataset (index column omitted).
df_credit.to_csv(credit_card_data_dir + "/credit_card_unbalanced.csv", index = False)

### Credit Dataset (Balanced)

In [9]:
# Work on a copy so the resampling cells below start from a frame that is
# separate from df_credit.
df_credit_balance = df_credit.copy()

#### Without Undersampling

In [10]:
# Split the working copy into features (X) and integer target (y).
X = df_credit_balance.drop('Class', axis =1)
y = df_credit_balance['Class'].astype('int')
# summarize class distribution
counter = Counter(y)
print(counter)
# Oversample with SMOTE. SMOTE generates synthetic samples at random, so the
# seed is pinned to make the resampled data identical across runs
# (Restart Kernel -> Run All reproduces the same rows).
oversample = SMOTE(random_state=42)
X, y = oversample.fit_resample(X, y)
# summarize the new class distribution
counter = Counter(y)
print(counter)

Counter({0: 284315, 1: 492})
Counter({0: 284315, 1: 284315})


In [11]:
# Attach the resampled target to the resampled features. df_credit_bal is
# bound to the very same DataFrame object as X (no copy is made), so the
# 'Class' column is visible through both names.
X['Class'] = y
df_credit_bal = X

#### With Undersampling (One we will use)

In [12]:
# Split the working copy into features (X) and integer target (y).
X = df_credit_balance.drop('Class', axis =1)
y = df_credit_balance['Class'].astype('int')

# summarize class distribution
counter = Counter(y)
print(counter)

# Define the resampling pipeline: SMOTE oversampling followed by random
# undersampling, with the sampling_strategy values 0.1 and 0.5 respectively.
# Both steps draw at random, and the result of this cell is what gets written
# to credit_card_balanced.csv, so both seeds are pinned to keep the saved
# dataset reproducible across runs.
over = SMOTE(sampling_strategy=0.1, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# transform the dataset
X, y = pipeline.fit_resample(X, y)
# summarize the new class distribution
counter = Counter(y)
print(counter)


Counter({0: 284315, 1: 492})
Counter({0: 56862, 1: 28431})


In [13]:
# Attach the resampled target to the resampled features. df_credit_bal_under
# is bound to the very same DataFrame object as X (no copy is made), so the
# 'Class' column is visible through both names.
X['Class'] = y
df_credit_bal_under = X

In [14]:
# Preview the first rows of the over-/under-sampled frame.
df_credit_bal_under.head()

Unnamed: 0,scaled_amount,scaled_time,V1,V2,V3,V4,V5,V6,V7,V8,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
193410,1.259834,0.533759,1.809775,-0.56436,-3.154637,-0.580605,2.436341,3.252096,-0.396497,0.805999,...,0.096316,-0.187252,-0.626876,0.136507,0.589529,-0.066872,-0.078523,-0.00719,-0.010111,0
241716,-0.279746,0.780754,1.891586,-0.060788,-1.850108,0.190082,1.241229,1.183192,-0.159239,0.372883,...,-0.297931,-0.249715,-0.468267,0.461889,-0.869079,-0.56969,0.303931,0.007856,-0.045961,0
79659,-0.005589,-0.312245,-1.042461,-0.201261,1.722029,-2.601335,-1.429197,-0.35415,-0.955273,0.66557,...,-0.276842,0.015172,0.198574,-0.252566,-0.039808,0.419139,-0.162309,0.172853,0.032109,0
260267,-0.254454,0.878535,-0.336478,1.099309,-0.91764,-1.504294,1.786349,-0.750739,1.846176,-0.591555,...,0.314591,0.098397,0.879769,-0.315362,0.317187,-0.257885,0.018494,0.26423,-0.046048,0
255154,0.409558,0.85055,1.94317,-0.69836,-0.422191,0.373606,-0.558475,0.349388,-0.929695,0.229442,...,-0.136495,-0.021004,-0.097254,0.237225,0.005168,-0.478709,0.177865,-0.013951,-0.029558,0


In [15]:
# Save the over-/under-sampled frame as the "balanced" dataset
# (index column omitted).
df_credit_bal_under.to_csv(credit_card_data_dir + "/credit_card_balanced.csv", index = False)