# Part 2 Data Preprocessing

# Import libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.over_sampling import RandomOverSampler, SMOTE

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer

# Read Data file

In [3]:
# Load the raw dataset from 'dataset.csv' (path is relative to the working directory).
df = pd.read_csv('dataset.csv') 

# Check the number of samples in the dataset

In [4]:
# Report the total row count, then the percentage share of each target class.
n_samples = len(df)
print("The Number of Samples in the dataset: ", n_samples)

for label, key in (('Class 0(FALSE) :', 0), ('Class 1(TRUE)  :', 1)):
    share = round(df['y'].value_counts()[key] / n_samples * 100, 2)
    print(label, share, '% of the dataset')

The Number of Samples in the dataset:  40000
Class 0(FALSE) : 88.72 % of the dataset
Class 1(TRUE)  : 11.28 % of the dataset


# Drop the rows with missing values

In [6]:
# Read the file with the placeholder string "unknown" parsed as NaN, then drop
# every row that is missing a value in any of the listed columns.
# A dropna on a previously loaded frame would be pointless here: `df` is
# replaced by this fresh read, so only the dropna after the read has an effect.
missing_values = ["unknown"]
df = pd.read_csv("dataset.csv", na_values = missing_values)
df = df.dropna(axis=0, subset=['x8', 'x2', 'x3', 'x4', 'x5', 'x6'])

# Split features and target data

In [8]:
# Separate the target column 'y' from the remaining feature columns.
y = df['y']
x = df.drop(columns=['y'])

# Split numerical features and categorical features

In [9]:
# Categorical features are picked by name; every other column goes to x_num.
x_cat = x[['x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8']]
x_num = x.drop(columns=x_cat.columns)

# Convert categorical features into numerical data using label encoder and one hot encoding

In [15]:
# x4 is label-encoded: each distinct value is mapped to an integer code.
# The 1-D result is wrapped in a single-column DataFrame so that it is 2-D
# when it is stacked with the other encoded features.
labelencoder = LabelEncoder()
x_cat_4 = pd.DataFrame(labelencoder.fit_transform(x_cat['x4']))

# Every other categorical column gets its own LabelBinarizer, so each fitted
# encoder stays available under its own name.
lb_encoder_2 = LabelBinarizer()
x_cat_2 = lb_encoder_2.fit_transform(x_cat['x2'])

lb_encoder_3 = LabelBinarizer()
x_cat_3 = lb_encoder_3.fit_transform(x_cat['x3'])

lb_encoder_5 = LabelBinarizer()
x_cat_5 = lb_encoder_5.fit_transform(x_cat['x5'])

lb_encoder_6 = LabelBinarizer()
x_cat_6 = lb_encoder_6.fit_transform(x_cat['x6'])

lb_encoder_7 = LabelBinarizer()
x_cat_7 = lb_encoder_7.fit_transform(x_cat['x7'])

lb_encoder_8 = LabelBinarizer()
x_cat_8 = lb_encoder_8.fit_transform(x_cat['x8'])

# Stack all processed categorical data back together

In [13]:
# Join the encoded categorical blocks side by side, in the order x2 .. x8.
x_cat_tr = np.concatenate(
    [x_cat_2, x_cat_3, x_cat_4, x_cat_5, x_cat_6, x_cat_7, x_cat_8],
    axis=1,
)

# stack numerical and categorical data

In [16]:
# Numerical columns first, followed by the encoded categorical columns.
x_tr = np.concatenate([x_num, x_cat_tr], axis=1)

# stack back the features and target data

In [17]:
# Turn the target into a one-column frame so it is 2-D, then append it as the
# last column of the transformed feature matrix.
y = pd.DataFrame(y)
df_tr = pd.DataFrame(np.concatenate([x_tr, y], axis=1))

# Separate the output (y) from input (X)
### The last column (label 38, i.e. the 39th column) holds the output y

In [18]:
# The target was appended as the last column of df_tr, so select it by
# position. A hard-coded label would silently point at the wrong column if
# the encoders ever produce a different number of feature columns.
x = df_tr.drop(columns=df_tr.columns[-1])
y = df_tr[df_tr.columns[-1]]


# Split test set and training set



In [19]:
# Hold out 20% of the rows as the test set and keep 80% for training.
# The random state is fixed at 30.
# NOTE(review): the split is not stratified although the classes are
# imbalanced (about 89% / 11% above) -- consider stratify=y.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=30,
)


# <--Data Preprocessing for the training set only -->

In [20]:
# Replace missing values with the most frequent value of each column.
# The imputer is fitted on the training features only.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = 'most_frequent')
x_train_tr = imputer.fit_transform(x_train)
x_train_tr_pd = pd.DataFrame(x_train_tr, columns = x_train.columns)

# Enforce the "no missing values after imputation" expectation. A bare
# expression in the middle of a cell is never displayed, so it checks nothing.
assert not x_train_tr_pd.isnull().values.any(), "missing values remain after imputation"

# Standardize every column to zero mean and unit standard deviation, with the
# statistics again taken from the training features only.
scaler = StandardScaler(copy=False)
x_train_tr = scaler.fit_transform(x_train_tr_pd)

# Combine the scaled features and the target (as the last column) into one frame.
y_train = pd.DataFrame(y_train)
df_train_tr = pd.DataFrame(np.hstack([x_train_tr, y_train]))

# Resampling method - Random Over Sampler

In [None]:
# Features are all columns but the last; the target is the last column, kept
# as a one-column DataFrame.
X_data = df_train_tr.iloc[:, :-1]
Y_data = df_train_tr.iloc[:, -1:]

# Randomly over-sample the training data (fixed seed for reproducibility).
ros = RandomOverSampler(random_state = 42)
X_res, y_res = ros.fit_resample(X_data, Y_data)

X_res = pd.DataFrame(X_res)
Y_res = pd.DataFrame(y_res)

# Y_res has exactly one column; address it by position so this cell does not
# depend on the feature count (previously the hard-coded label 38).
class_counts = Y_res.iloc[:, 0].value_counts()
n_resampled = len(Y_res)

print("After Over Sampling Of Minor Class Total Samples are :", n_resampled)
print('Class 0        :', round(class_counts[0]/n_resampled * 100, 2), '% of the dataset')
# NOTE(review): the earlier summary labels this class 'TRUE', not 'Fraud' --
# confirm the intended wording.
print('Class 1(Fraud) :', round(class_counts[1]/n_resampled * 100, 2), '% of the dataset')

In [23]:
# Rebuild one frame from the resampled features with the resampled target
# appended as the last column.
df_train_tr_ros = pd.DataFrame(np.concatenate([X_res, Y_res], axis=1))

In [24]:
# Separate the output (y, the class labels) from the input (x).
# The target is the last column of the resampled frame; selecting it by
# position avoids a hard-coded label that depends on the feature count.
x_train = df_train_tr_ros.drop(columns=df_train_tr_ros.columns[-1])
y_train = df_train_tr_ros[df_train_tr_ros.columns[-1]]
y_train

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
52853    1.0
52854    1.0
52855    1.0
52856    1.0
52857    1.0
Name: 38, Length: 52858, dtype: float64

# Continue in Part 3 