In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import pickle
from sklearn.utils import resample
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
import math
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
from airsyn.structured_data.tabular.tabular_data_generator import TabularDataGenerator


# Classifier Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier



# Other Libraries
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.metrics import classification_report_imbalanced
from imblearn.pipeline import Pipeline
from numpy import where
from matplotlib import pyplot

### Adult Dataset

#### Get Data remotely

In [8]:
# Fetch the raw UCI Adult file and stage it as data/adult/adult1.csv.
! mkdir -p data/adult
! wget http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
! mv adult.data data/adult/adult1.csv
# Insert a header row with the column names as line 1 of the CSV (the file is edited in place).
! sed  -i -e '1i"age","workclass","fnlwgt","education","education-num","marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country","salary"' data/adult/adult1.csv

--2023-07-10 18:31:39--  http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
Resolving proxy.jpmchase.net (proxy.jpmchase.net)... 155.180.110.35
Connecting to proxy.jpmchase.net (proxy.jpmchase.net)|155.180.110.35|:10443... connected.
Proxy request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘adult.data’

    [                        <=>            ] 3,974,305    816KB/s   in 4.8s   

2023-07-10 18:31:46 (803 KB/s) - ‘adult.data’ saved [3974305]



In [38]:
# Load the Adult CSV staged by the download cell; its first line is the header row added there.
dataset_adult = pd.read_csv('data/adult/adult1.csv')

In [39]:
# Preview the first rows of the raw Adult frame.
dataset_adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [40]:
# Clean the Adult frame.
# - ' ?' (note the leading space) is the missing-value marker matched here; it is
#   turned into NaN so that dropna can remove every row with a missing field.
# - np.nan is used instead of np.NaN: the np.NaN alias was removed in NumPy 2.0.
# - The steps are chained into a new frame instead of mutating with inplace=True.
dataset_adult = (
    dataset_adult
    .replace(' ?', np.nan)
    .dropna(axis=0, how='any')
    # drop education col as its the same as education-num;
    # errors='ignore' keeps this cell re-runnable once the column is already gone
    .drop(columns='education', errors='ignore')
)

In [41]:
# (rows, columns) of the Adult frame after the cleaning cell.
dataset_adult.shape

(30162, 14)

In [42]:
# Create class for encoding
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns, each with its own ``LabelEncoder``.

    A fresh ``LabelEncoder`` is created and fitted per column on every
    ``transform`` call, so the instance keeps no fitted state: the codes
    produced depend only on the data passed to that call.
    """

    def __init__(self, columns=None):
        """Remember which columns to encode.

        Parameters
        ----------
        columns : iterable of column names, optional
            Columns to encode. ``None`` (the default) means every column.
        """
        self.columns = columns  # array of column names to encode

    def fit(self, X, y=None):
        """Do nothing and return ``self`` so calls can be chained."""
        return self  # not relevant here

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X. ``X`` itself is left untouched.
        """
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() yields the
            # same (column name, Series) pairs on both old and new versions.
            for colname, col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self, X, y=None):
        """Equivalent to ``fit(X, y)`` followed by ``transform(X)``."""
        return self.fit(X, y).transform(X)
# Names of the categorical Adult columns that get label-encoded.
cat_col = [
    'workclass',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

In [43]:
# Recode the salary labels in place: " <=50K" becomes 0 and " >50K" becomes 1
# (both labels are matched with their leading space).
for salary_label, salary_code in ((" <=50K", 0), (" >50K", 1)):
    has_label = dataset_adult["salary"] == salary_label
    dataset_adult.loc[has_label, "salary"] = salary_code

# Label-encode the columns listed in cat_col; fit_transform works on a copy,
# so dataset_adult keeps its string categories.
data_adult_processed = MultiColumnLabelEncoder(columns=cat_col).fit_transform(dataset_adult)

In [44]:
# Preview the encoded frame: the cat_col columns and salary now hold integer codes.
data_adult_processed.head()

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,5,77516,13,4,0,1,4,1,2174,0,40,38,0
1,50,4,83311,13,2,3,0,4,1,0,0,13,38,0
2,38,2,215646,9,0,5,1,4,1,0,0,40,38,0
3,53,2,234721,7,2,5,0,2,1,0,0,40,38,0
4,28,2,338409,13,2,9,5,2,0,0,0,40,4,0


In [37]:
# Persist the encoded Adult frame without the index column.
# NOTE(review): this cell's recorded execution count (37) is lower than those of the
# cells that build data_adult_processed (38-44) — re-run it after them so the file
# on disk matches the frame previewed above.
data_adult_processed.to_csv(r'data/adult/process_adult.csv', index=False)

### Credit Dataset (Unbalanced)

#### Load Data from a Local File

In [15]:
# Read the credit card dataset from a local CSV; the file must already exist at this path.
df_credit = pd.read_csv('data/CreditData/creditcard.csv')

In [17]:
# The classes are heavily skewed; this imbalance is addressed by the resampling cells later on.
# Count each class once, then report its share of all rows as a percentage.
class_counts = df_credit['Class'].value_counts()
n_rows = len(df_credit)
print('No Frauds', round(class_counts[0] / n_rows * 100, 2), '% of the dataset')
print('Frauds', round(class_counts[1] / n_rows * 100, 2), '% of the dataset')

No Frauds 99.83 % of the dataset
Frauds 0.17 % of the dataset


In [18]:
# Amount and Time are the two columns scaled in this cell; each gets a scaled
# copy, after which the raw columns are removed.
from sklearn.preprocessing import StandardScaler, RobustScaler

# RobustScaler is less prone to outliers.

std_scaler = StandardScaler()
rob_scaler = RobustScaler()

# fit_transform refits rob_scaler on every call, so one instance serves both columns.
for raw_col, scaled_col in (('Amount', 'scaled_amount'), ('Time', 'scaled_time')):
    column_2d = df_credit[raw_col].values.reshape(-1,1)
    df_credit[scaled_col] = rob_scaler.fit_transform(column_2d)

df_credit.drop(columns=['Time', 'Amount'], inplace=True)

In [19]:
# Move the two scaled columns to the front of the frame:
# take them out, then re-insert them at positions 0 and 1.
scaled_cols = ['scaled_amount', 'scaled_time']
scaled_amount, scaled_time = df_credit['scaled_amount'], df_credit['scaled_time']

df_credit.drop(columns=scaled_cols, inplace=True)
for position, series in enumerate((scaled_amount, scaled_time)):
    df_credit.insert(position, scaled_cols[position], series)

# Amount and Time are Scaled

df_credit.head()

Unnamed: 0,scaled_amount,scaled_time,V1,V2,V3,V4,V5,V6,V7,V8,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,1.783274,-0.994983,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0
1,-0.269825,-0.994983,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0
2,4.983721,-0.994972,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0
3,1.418291,-0.994972,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0
4,0.670579,-0.99496,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0


### Credit Dataset (Balanced)

In [20]:
# Work on a copy so the resampling cells below start from the same scaled frame
# and df_credit itself is not replaced.
df_credit_balance = df_credit.copy()

#### SMOTE Oversampling Only (No Undersampling)

In [21]:
# Split the frame into the feature columns (X) and the integer Class label (y).
X = df_credit_balance.drop('Class', axis =1)
y = df_credit_balance['Class'].astype('int')
# summarize class distribution
counter = Counter(y)
print(counter)
# transform the dataset
# SMOTE generates the synthetic minority rows at random; a fixed random_state
# makes the resampled data identical on every Restart & Run All.
oversample = SMOTE(random_state=42)
X, y = oversample.fit_resample(X, y)
# summarize the new class distribution
counter = Counter(y)
print(counter)

Counter({0: 284315, 1: 492})
Counter({0: 284315, 1: 284315})


In [22]:
# Build the balanced frame as a new object. Assigning `df_credit_bal = X` and then
# adding the column would also add 'Class' to X (same object), leaving the label
# inside the feature frame.
df_credit_bal = X.assign(Class=y)

#### SMOTE Oversampling + Random Undersampling (the variant we will use)

In [23]:
# Split the frame into the feature columns (X) and the integer Class label (y).
X = df_credit_balance.drop('Class', axis =1)
y = df_credit_balance['Class'].astype('int')

# summarize class distribution
counter = Counter(y)
print(counter)

# define pipeline: oversample the minority class first, then undersample the majority.
# Both samplers draw at random, so each gets a fixed random_state to make the
# resampled frame reproducible across runs.
over = SMOTE(sampling_strategy=0.1, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# transform the dataset
X, y = pipeline.fit_resample(X, y)
# summarize the new class distribution
counter = Counter(y)
print(counter)


Counter({0: 284315, 1: 492})
Counter({0: 56862, 1: 28431})


In [24]:
# Attach the resampled labels, then expose the frame under its final name.
# df_credit_bal_under and X are the same object, so X also carries the 'Class'
# column after this cell.
X['Class'] = y
df_credit_bal_under = X

In [25]:
# Preview the resampled (oversampled, then undersampled) credit frame.
df_credit_bal_under.head()

Unnamed: 0,scaled_amount,scaled_time,V1,V2,V3,V4,V5,V6,V7,V8,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,1.353734,-0.51267,1.032312,-0.586667,0.755738,0.673391,-0.852152,0.178366,-0.534427,0.117112,...,0.128262,-0.089203,-0.329437,-0.159572,-0.419311,0.288348,0.464377,-0.015733,0.035298,0
1,-0.158597,-0.236093,1.108539,0.247769,1.044287,2.414754,-0.607787,-0.016459,-0.445858,0.241531,...,-0.230454,0.175196,0.358236,-0.053834,0.292676,0.343305,0.084936,-0.003799,0.016838,0
2,-0.265633,0.717948,-0.06445,1.334184,-0.270863,-0.237119,0.859004,-1.354791,1.211153,-0.35405,...,0.125899,0.293984,1.1763,-0.271509,0.011916,-0.380263,-0.150342,0.486483,0.310489,0
3,-0.294977,1.016154,-0.659585,1.167179,-0.009046,-0.973386,0.687607,-0.453838,0.926882,-0.274608,...,-0.252975,0.006123,-0.295617,0.101158,-0.480826,-1.18686,-0.040645,-0.668903,0.201928,0
4,3.388528,0.019702,-3.840501,0.790885,-0.211265,0.746274,-3.137159,0.783779,-0.578036,1.940393,...,-0.752935,0.305909,0.554569,-0.444283,0.179683,0.219392,0.663086,-0.73226,-0.666694,0
