# Data Preprocessing

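This notebook walks through common data preprocessing steps on the Titanic dataset: removing irrelevant data, separating features from the response, encoding categorical features, splitting into training and test sets, and scaling numeric features.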

## Import libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn 

In [2]:
# Load the raw Titanic table; the path is relative to the notebook's working directory.
dataset_raw = pd.read_csv('../DataSets/Titanic.csv')

In [3]:
# Preview the first rows to see which columns the raw file provides.
dataset_raw.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Surname,Fsize,Family,FsizeD,Deck,Child,Mother
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,Braund,2,Braund_2,small,,Adult,Not Mother
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,Cumings,2,Cumings_2,small,C,Adult,Not Mother
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,Heikkinen,1,Heikkinen_1,singleton,,Adult,Not Mother
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,Futrelle,2,Futrelle_2,small,C,Adult,Not Mother
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,Allen,1,Allen_1,singleton,,Adult,Not Mother


## Remove irrelevant data

Removing irrelevant columns and the ones we will not use as features:

In [4]:
# Keep only the response (Survived) and the columns used as features below.
dataset = dataset_raw.drop(
    columns=[
        'PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Title',
        'Surname', 'Fsize', 'Family', 'FsizeD', 'Deck', 'Child', 'Mother',
    ]
)
# Fixed seed so the displayed sample is the same on every run.
dataset.sample(n=5, random_state=20)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
347,1,3,female,24.0,16.1,S
674,0,2,male,26.0,0.0,S
791,0,2,male,16.0,26.0,S
836,0,3,male,21.0,8.6625,S
56,1,2,female,21.0,10.5,S


Removing irrelevant rows (exact duplicates and records with a non-positive fare):

In [5]:
# Count rows that repeat an earlier row across all remaining columns.
dataset.duplicated().sum()

76

In [6]:
# Keep only the first occurrence of each repeated row, then confirm that no
# duplicates remain.
dataset = dataset.loc[~dataset.duplicated()]
dataset.duplicated().sum()

0

In [7]:
# Count tickets recorded with a zero or negative fare.
dataset['Fare'].le(0).sum()

15

In [8]:
# Keep every row whose fare is not <= 0. A boolean mask selects the rows
# directly instead of going through index labels, so it does not rely on the
# index being unique. Negating `<= 0` rather than testing `> 0` means rows
# with a missing fare are kept.
dataset = dataset.loc[~dataset['Fare'].le(0)]
# Sanity check: no zero or negative fares should remain.
dataset['Fare'].le(0).sum()

0

In [9]:
# Preview the table after the column and row filtering above.
dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0,3,male,22.0,7.25,S
1,1,1,female,38.0,71.2833,C
2,1,3,female,26.0,7.925,S
3,1,1,female,35.0,53.1,S
4,0,3,male,35.0,8.05,S


## Prepare features and response

Response:

In [10]:
# Response vector: the survival label, selected by column name so the cell
# keeps working if the column order of `dataset` ever changes.
y = dataset['Survived']
print(y.head(6))

0    0
1    1
2    1
3    1
4    0
5    0
Name: Survived, dtype: int64


Features:

In [11]:
# Feature matrix: every column except the response. `drop` returns a new
# DataFrame rather than a slice of `dataset`, so later column assignments on
# `X` do not trigger pandas' SettingWithCopyWarning.
X = dataset.drop(columns='Survived')
print(X.head(6))

   Pclass     Sex   Age     Fare Embarked
0       3    male  22.0   7.2500        S
1       1  female  38.0  71.2833        C
2       3  female  26.0   7.9250        S
3       1  female  35.0  53.1000        S
4       3    male  35.0   8.0500        S
5       3    male  21.0   8.4583        Q


## Data encoding

Label encoding for a binary (two-category) feature:

In [12]:
from sklearn.preprocessing import LabelEncoder

# Map the two Sex categories to integers. LabelEncoder numbers the classes in
# sorted order, so 'female' -> 0 and 'male' -> 1.
# NOTE: scikit-learn documents LabelEncoder as an encoder for target values;
# OrdinalEncoder is the feature-side counterpart if this is ever extended to
# several columns.
label_encoder = LabelEncoder()
# `assign` builds a new DataFrame instead of writing into `X` in place, which
# avoids SettingWithCopyWarning when `X` was derived by slicing `dataset`.
X = X.assign(Sex=label_encoder.fit_transform(X['Sex']))
print(X.head(6))

   Pclass  Sex   Age     Fare Embarked
0       3    1  22.0   7.2500        S
1       1    0  38.0  71.2833        C
2       3    0  26.0   7.9250        S
3       1    0  35.0  53.1000        S
4       3    1  35.0   8.0500        S
5       3    1  21.0   8.4583        Q


One-hot encoding for a categorical feature:

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode Embarked and pass the remaining columns through unchanged.
# The indicator columns come first in the output, followed by the passthrough
# columns in their original order (Pclass, Sex, Age, Fare).
# sparse_threshold=0 always yields a dense NumPy array: later cells treat the
# result as an ndarray (printing rows, slice assignment during scaling), so
# the output type should not depend on how sparse the data happens to be.
oh_encoder = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), ['Embarked'])],
    remainder='passthrough',
    sparse_threshold=0,
)
# From here on `X` is a NumPy array, no longer a DataFrame.
X = oh_encoder.fit_transform(X)
print(X[:6])

[[ 0.      0.      1.      3.      1.     22.      7.25  ]
 [ 1.      0.      0.      1.      0.     38.     71.2833]
 [ 0.      0.      1.      3.      0.     26.      7.925 ]
 [ 0.      0.      1.      1.      0.     35.     53.1   ]
 [ 0.      0.      1.      3.      1.     35.      8.05  ]
 [ 0.      1.      0.      3.      1.     21.      8.4583]]


## Data set split

In [14]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [15]:
# Report the shape of each split, in the order train/test features and then
# train/test response.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(600, 7)
(200, 7)
(600,)
(200,)


In [16]:
# First six training rows, all columns.
X_train[:6]

array([[  0.    ,   0.    ,   1.    ,   3.    ,   1.    ,  16.    ,
          9.2167],
       [  1.    ,   0.    ,   0.    ,   1.    ,   1.    ,  22.    ,
        135.6333],
       [  1.    ,   0.    ,   0.    ,   1.    ,   0.    ,  31.    ,
        113.275 ],
       [  0.    ,   0.    ,   1.    ,   2.    ,   0.    ,  50.    ,
         10.5   ],
       [  0.    ,   0.    ,   1.    ,   1.    ,   0.    ,  53.    ,
         51.4792],
       [  0.    ,   0.    ,   1.    ,   3.    ,   1.    ,  38.    ,
          7.05  ]])

## Scaling

In [17]:
# Columns 5 onward hold Age and Fare (after the three Embarked indicators,
# Pclass and Sex); shown here before scaling.
X_train[:6,5:]

array([[ 16.    ,   9.2167],
       [ 22.    , 135.6333],
       [ 31.    , 113.275 ],
       [ 50.    ,  10.5   ],
       [ 53.    ,  51.4792],
       [ 38.    ,   7.05  ]])

In [18]:
from sklearn.preprocessing import StandardScaler

# Standardise the Age and Fare columns (index 5 onward). The mean and standard
# deviation are learned from the training rows only and then applied to both
# splits, so the test rows never influence the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train[:, 5:])
X_train[:, 5:] = scaler.transform(X_train[:, 5:])
X_test[:, 5:] = scaler.transform(X_test[:, 5:])

In [19]:
# The same Age and Fare columns after standardisation.
X_train[:6,5:]

array([[-0.93078891, -0.5057674 ],
       [-0.52542492,  1.94331161],
       [ 0.08262106,  1.51016246],
       [ 1.36627369, -0.48090592],
       [ 1.56895568,  0.31298743],
       [ 0.55554571, -0.54774305]])

In [20]:
# Full feature rows after scaling: only the last two columns have changed.
print(X_train[:6])

[[ 0.          0.          1.          3.          1.         -0.93078891
  -0.5057674 ]
 [ 1.          0.          0.          1.          1.         -0.52542492
   1.94331161]
 [ 1.          0.          0.          1.          0.          0.08262106
   1.51016246]
 [ 0.          0.          1.          2.          0.          1.36627369
  -0.48090592]
 [ 0.          0.          1.          1.          0.          1.56895568
   0.31298743]
 [ 0.          0.          1.          3.          1.          0.55554571
  -0.54774305]]
