# Data Preprocessing tools

### Importing the libraries

In [3]:
import pandas as pd
import numpy as np


### Importing the dataset

In [4]:
# Load the raw table once and leave it untouched, so every step below can be
# re-run without first reloading the file.
titanic_raw = pd.read_csv('titanic_data.csv')


# Columns excluded from the model inputs. The drop returns a new frame instead
# of mutating in place; an in-place drop raises KeyError when the step is run
# a second time because the columns are already gone.
DROPPED_COLUMNS = ['deck', 'alive', 'embark_town', 'who', 'adult_male',
                   'parch', 'fare', 'embarked']
df = titanic_raw.drop(columns=DROPPED_COLUMNS)


# Feature matrix: every remaining column except the target.
# Target vector: the 'survived' column.
X = df.drop(columns=['survived']).values
y = df['survived'].values


### Taking care of the missing data

In [5]:
from sklearn.impute import SimpleImputer

# Replace missing entries in column 2 with that column's mean.
# The 2:3 slice (rather than plain index 2) keeps the selection two-dimensional,
# which is the shape the imputer operates on.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X[:, 2:3])
X[:, 2:3] = imputer.transform(X[:, 2:3])
print(X)


[[3 'male' 22.0 1 'Third' False]
 [1 'female' 38.0 1 'First' False]
 [3 'female' 26.0 0 'Third' True]
 ...
 [3 'female' 29.69869565217391 1 'Third' False]
 [1 'male' 26.0 0 'First' True]
 [3 'male' 32.0 0 'Third' True]]


### Encoding categorical data

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


# One-hot encode column 4 (string labels such as 'First' / 'Third') and pass
# every other column through untouched. ColumnTransformer emits the encoded
# columns first, followed by the passthrough columns in their original order.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [4])],
    remainder='passthrough',
)
encoded = ct.fit_transform(X)
X = np.array(encoded)

X.shape


(889, 8)

### Label-encoding binary features and the dependent variable

In [7]:
from sklearn.preprocessing import LabelEncoder

# Label-encode the feature columns that are still non-numeric after the one-hot
# step: column 4 ('male' / 'female' strings) and column 7 (booleans). Encoding
# in place keeps the matrix shape and every other column index unchanged.
# Each column gets its own encoder so its fitted classes_ remain available
# (e.g. for inverse_transform) instead of being overwritten by the next fit.
feature_encoders = {}
for col in (4, 7):
    feature_encoders[col] = LabelEncoder()
    X[:, col] = feature_encoders[col].fit_transform(X[:, col])

# `le` is reserved for the dependent variable only.
le = LabelEncoder()
y = le.fit_transform(y)

print(X)




[[0.0 0.0 1.0 ... 22.0 1 0]
 [1.0 0.0 0.0 ... 38.0 1 0]
 [0.0 0.0 1.0 ... 26.0 0 1]
 ...
 [0.0 0.0 1.0 ... 29.69869565217391 1 0]
 [1.0 0.0 0.0 ... 26.0 0 1]
 [0.0 0.0 1.0 ... 32.0 0 1]]


### Splitting data into the training set and the test set

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible between runs.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

X

array([[0.0, 0.0, 1.0, ..., 22.0, 1, 0],
       [1.0, 0.0, 0.0, ..., 38.0, 1, 0],
       [0.0, 0.0, 1.0, ..., 26.0, 0, 1],
       ...,
       [0.0, 0.0, 1.0, ..., 29.69869565217391, 1, 0],
       [1.0, 0.0, 0.0, ..., 26.0, 0, 1],
       [0.0, 0.0, 1.0, ..., 32.0, 0, 1]], dtype=object)

### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise column 5 only (the one-hot and 0/1 columns are left as they are).
# The scaler's mean and standard deviation are learned from the training split
# alone and then reused for the test split, so both splits share one scale and
# no test-set statistics leak into preprocessing.
sc = StandardScaler()
X_train[:, 5:6] = sc.fit_transform(X_train[:, 5:6])
X_test[:, 5:6] = sc.transform(X_test[:, 5:6])


array([[0.0, 0.0, 1.0, ..., -0.010408910095821986, 1, 0],
       [0.0, 0.0, 1.0, ..., -0.010408910095821986, 1, 0],
       [0.0, 0.0, 1.0, ..., 0.16703628173832194, 0, 1],
       ...,
       [1.0, 0.0, 0.0, ..., 1.4007379744450077, 0, 1],
       [0.0, 0.0, 1.0, ..., -0.6811336319975244, 2, 0],
       [1.0, 0.0, 0.0, ..., -0.6811336319975244, 0, 0]], dtype=object)