# Data Preprocessing

## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import sys
import numpy as np
# Raise NumPy's summarisation threshold to the largest integer so printed
# arrays are shown in full instead of being abbreviated with "...".
np.set_printoptions(threshold=sys.maxsize)

## Importing Dataset

In [None]:
# Read the training and test CSV files from the current working directory.
train = pd.read_csv(filepath_or_buffer='train.csv')
test = pd.read_csv(filepath_or_buffer='test.csv')

In [None]:
# Preview the first three rows of the training data.
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [None]:
# Preview the first three rows of the test data.
test.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


In [None]:
# Remove the identifier and free-text columns; they are not used as features.
train = train.drop(['PassengerId', 'Name', 'Cabin', 'Ticket'], axis='columns')

## Taking Care of Missing Data

In [None]:
# Count the distinct values in each remaining column.
train.nunique()

Survived      2
Pclass        3
Sex           2
Age          88
SibSp         7
Parch         7
Fare        248
Embarked      3
dtype: int64

In [None]:
# Column dtypes and non-null counts; used to spot columns with missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


Age and Embarked have missing values.

- Age: fill the missing values with the average age.
- Embarked: delete the rows where the value is missing.

In [None]:
# Discard the rows whose Embarked value is missing.
train = train.dropna(subset = ['Embarked'])

In [None]:
# Show the rows that still contain at least one missing value.
# Indexing with the element-wise mask `train.isna()` keeps every row and turns
# all non-missing cells into NaN, so it cannot be used to locate those rows;
# reduce the mask to one boolean per row first.
train[train.isna().any(axis=1)]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,,,,,,,,
4,,,,,,,,
...,...,...,...,...,...,...,...,...
886,,,,,,,,
887,,,,,,,,
888,,,,,,,,
889,,,,,,,,


In [None]:
from sklearn.impute import SimpleImputer

# Replace missing ages with the mean age of the training data.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
# The imputer expects a 2-D (n_samples, 1) input and returns the same shape;
# flatten the result so a plain 1-D column is written back.
train['Age'] = imputer.fit_transform(train['Age'].to_numpy().reshape(-1, 1)).ravel()

In [None]:
# Preview the first two rows after imputation.
train.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C


In [None]:
# From here on `train` is a NumPy array, so columns are addressed by position
# (0 = Survived, 1 = Pclass, 2 = Sex, ... in the order shown by train.info()).
train = train.to_numpy()

## Encoding Categorical Data

In [None]:
# Preview a few rows only; printing the whole array floods the notebook,
# especially with NumPy's print threshold raised earlier.
train[:5]

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the Sex column (position 2) as integers. The fitted encoder stays in
# `le`, so the same mapping remains available after this cell.
le = LabelEncoder()
train[:, 2] = le.fit_transform(train[:, 2])
# Preview a few rows instead of printing the entire array.
train[:5]

In [None]:
# Separate the label (column 0, Survived) from the feature columns.
survived, train = train[:, 0], train[:, 1:]

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the last column (Embarked), dropping the first category's
# indicator; every other column is passed through unchanged.
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(sparse_output=False, drop='first'), [-1]),
    ],
    remainder='passthrough',
)

In [None]:
# Fit the encoder on the training features and apply it; the one-hot columns
# come first in the result, followed by the passthrough columns.
transformed_train = ct.fit_transform(train)

## Splitting Dataset into training and test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation (80% stay in the training split).
# The fixed random_state makes the shuffle, and therefore the split, repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    transformed_train,
    survived,
    test_size=0.2,
    random_state=1,
)

## Feature Scaling

In [None]:
# Preview a few rows before scaling; printing the whole array floods the notebook.
x_train[:5]

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# Standardize only the last column (Fare).
# fit: learn the mean and standard deviation from the training split;
# transform: apply (x - mean) / std to the values.
x_train[:, -1:] = sc.fit_transform(x_train[:, -1:])
# The held-out split is transformed with the statistics learned from the
# training split; the scaler is deliberately not re-fitted on it.
x_test[:, -1:] = sc.transform(x_test[:, -1:])

In [None]:
from sklearn.preprocessing import StandardScaler
sc2 = StandardScaler()

# Standardize only the fourth column from the end (Age), using a separate
# scaler so its statistics are kept apart from the Fare scaler.
# fit: learn the mean and standard deviation from the training split;
# transform: apply (x - mean) / std to the values.
x_train[:, -4:-3] = sc2.fit_transform(x_train[:, -4:-3])
# The held-out split is transformed with the statistics learned from the
# training split; the scaler is deliberately not re-fitted on it.
x_test[:, -4:-3] = sc2.transform(x_test[:, -4:-3])

In [None]:
# Preview a few scaled rows; printing the whole array floods the notebook.
x_train[:5]

[[0.0 1.0 3 1 -0.013763343142133946 0 0 -0.5475598981576247]
 [0.0 1.0 3 1 -0.823296222307722 0 0 -0.4948313461898532]
 [1.0 0.0 3 1 -0.013763343142133946 0 0 -0.5550186710689468]
 [0.0 1.0 3 1 -0.013763343142133946 0 0 -0.5475598981576247]
 [0.0 0.0 1 1 2.14339305464793 0 2 2.068604700488532]
 [0.0 1.0 3 1 -1.051503089765849 4 1 0.23902986178219737]
 [1.0 0.0 3 0 -0.013763343142133946 0 0 -0.5580220702945723]
 [0.0 1.0 3 0 -0.013763343142133946 8 2 0.981488548663365]
 [1.0 0.0 3 0 -0.013763343142133946 1 0 -0.36233370419313093]
 [0.0 1.0 1 1 0.4698760266216648 0 0 -0.0916424039530736]
 [0.0 1.0 1 0 1.6109103639123001 1 0 1.190540549564258]
 [0.0 1.0 3 1 0.165600203344162 0 0 -0.5513937074340443]
 [0.0 1.0 1 1 1.5348414080929245 1 0 0.6421143811982184]
 [0.0 1.0 1 1 1.154496628996046 1 0 1.3276999246305565]
 [0.0 1.0 3 1 -0.5950893548495949 0 0 -0.5705577813008673]
 [0.0 1.0 2 1 -0.36688248739146784 1 0 -0.10127665229686438]
 [0.0 1.0 3 1 0.4698760266216648 1 1 -0.14727241858334947]
 [

# Model Selection

In [None]:
# Quick look at the feature matrix the models are trained on (first rows only).
x_train[:5]

[[0.0 1.0 3 ... 0 0 -0.5475598981576247]
 [0.0 1.0 3 ... 0 0 -0.4948313461898532]
 [1.0 0.0 3 ... 0 0 -0.5550186710689468]
 ...
 [0.0 0.0 3 ... 1 0 -0.38833498656199905]
 [0.0 1.0 2 ... 1 0 -0.10127665229686439]
 [0.0 1.0 3 ... 0 0 -0.5475598981576247]]


In [None]:
# Cast the training labels to integers before fitting the classifiers.
# NOTE(review): the labels were sliced from the mixed-type array built by
# to_numpy(), so they are presumably object dtype at this point — confirm.
y_train=y_train.astype('int')

In [None]:
# Cast the held-out labels to integers so they match the predictions' type.
y_test=y_test.astype('int')

## Decision Tree Classification

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree using the entropy split criterion; the seed is fixed so
# repeated runs build the same tree.
decision_tree_classifier = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
)
decision_tree_classifier.fit(x_train, y_train)

In [None]:
# The evaluation cells of the later models reuse these two imports.
from sklearn.metrics import confusion_matrix, accuracy_score
# Predict on the held-out split and report accuracy as the cell output.
y_pred_dtc = decision_tree_classifier.predict(x_test)
# cm = confusion_matrix(y_test, y_pred_dtc)
# print(cm)
accuracy_score(y_test, y_pred_dtc)

0.7584269662921348

In [None]:
# Sanity check of the container type holding the training labels.
type(y_train)

numpy.ndarray

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours vote; Minkowski distance with p=2 is Euclidean distance.
knn_classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
knn_classifier.fit(x_train, y_train)

In [None]:
# Predict on the held-out split and report accuracy as the cell output.
y_pred_knn = knn_classifier.predict(x_test)
# cm = confusion_matrix(y_test, y_pred_knn)
# print(cm)
accuracy_score(y_test, y_pred_knn)

0.8089887640449438

## Kernel SVM

In [None]:
from sklearn.svm import SVC

# Support vector classifier with an RBF (non-linear) kernel.
kernel_svm_classifier = SVC(
    kernel='rbf',
    random_state=0,
)
kernel_svm_classifier.fit(x_train, y_train)

In [None]:
# Predict on the held-out split and report accuracy as the cell output.
y_pred_ksvm = kernel_svm_classifier.predict(x_test)
# cm = confusion_matrix(y_test, y_pred_ksvm)
# print(cm)
accuracy_score(y_test, y_pred_ksvm)

0.8426966292134831

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with library-default settings and a fixed seed.
logistic_regression_classifier = LogisticRegression(
    random_state=0,
)
logistic_regression_classifier.fit(x_train, y_train)

In [None]:
# Predict on the held-out split and report accuracy as the cell output.
y_pred_lr = logistic_regression_classifier.predict(x_test)
# cm = confusion_matrix(y_test, y_pred_lr)
# print(cm)
accuracy_score(y_test, y_pred_lr)

0.8426966292134831

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with library-default settings.
naive_bayes_classifier = GaussianNB()
naive_bayes_classifier.fit(
    x_train,
    y_train,
)

In [None]:
# Predict on the held-out split and report accuracy as the cell output.
y_pred_nb = naive_bayes_classifier.predict(x_test)
# cm = confusion_matrix(y_test, y_pred_nb)
# print(cm)
accuracy_score(y_test, y_pred_nb)

0.8202247191011236

## Random Forest Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 50 trees using the entropy split criterion; the seed is fixed so
# repeated runs build the same forest.
random_forest_classifier = RandomForestClassifier(
    n_estimators=50,
    criterion='entropy',
    random_state=0,
)
random_forest_classifier.fit(x_train, y_train)

In [None]:
# Predict on the held-out split and report accuracy as the cell output.
y_pred_rfc = random_forest_classifier.predict(x_test)
# cm = confusion_matrix(y_test, y_pred_rfc)
# print(cm)
accuracy_score(y_test, y_pred_rfc)

0.8146067415730337

## SVM

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel.
svm_classifier = SVC(
    kernel='linear',
    random_state=0,
)
svm_classifier.fit(x_train, y_train)

In [None]:
# Predict on the held-out split, print the confusion matrix
# (rows: true class, columns: predicted class), then report accuracy.
y_pred_svm = svm_classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred_svm)
print(cm)
accuracy_score(y_test, y_pred_svm)

[[93 12]
 [18 55]]


0.8314606741573034

# Test Submission

In [None]:
# Reload the raw test file so the submission pipeline starts from unmodified data.
test = pd.read_csv('test.csv')

In [None]:
# Preview the first five rows of the test data.
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


In [None]:
# Remove the free-text columns; PassengerId is kept because the submission needs it.
test = test.drop(['Name', 'Cabin', 'Ticket'], axis='columns')

In [None]:
# Column dtypes and non-null counts; used to spot test columns with missing values.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Sex          418 non-null    object 
 3   Age          332 non-null    float64
 4   SibSp        418 non-null    int64  
 5   Parch        418 non-null    int64  
 6   Fare         417 non-null    float64
 7   Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 26.2+ KB


## Dealing with missing values

In [None]:
# Fill missing test ages with the mean learned from the training data, reusing
# the `imputer` fitted in the "Taking Care of Missing Data" section. Fitting a
# new imputer on the test set would fill the gaps with a different mean than
# the one the models were trained with.
# The imputer works on a 2-D (n_samples, 1) array; flatten the result so a
# plain 1-D column is written back.
test['Age'] = imputer.transform(test['Age'].to_numpy().reshape(-1, 1)).ravel()

In [None]:
# Fill missing test fares with the mean fare.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
# NOTE(review): the training data shows no missing Fare values, so no
# training-fitted Fare imputer exists; the mean is therefore taken from the
# test data itself — confirm this is acceptable.
imputer_fare = SimpleImputer(missing_values = np.nan, strategy = 'mean')
# The imputer works on a 2-D (n_samples, 1) array; flatten the result so a
# plain 1-D column is written back.
test['Fare'] = imputer_fare.fit_transform(test['Fare'].to_numpy().reshape(-1, 1)).ravel()

In [None]:
# Preview the test data after imputation.
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


In [None]:
# Keep the passenger ids as a one-column DataFrame for the submission file.
PassengerId = test[['PassengerId']]

In [None]:
# Keep a DataFrame copy of the cleaned test data before `test` is converted to
# a NumPy array.
# NOTE(review): test2 is not referenced again in the visible notebook.
test2 = test.copy()

In [None]:
# From here on `test` is a NumPy array addressed by position
# (0 = PassengerId, 1 = Pclass, 2 = Sex, ... in the order shown by test.info()).
test = test.to_numpy()

## Encoding Categorical

In [None]:
# Encode Sex (position 2 of the test array) with the LabelEncoder `le` that was
# fitted on the training data, so both sets share one string-to-integer mapping.
# A new encoder fitted on the test data would only agree with it by coincidence.
test[:, 2] = le.transform(test[:, 2])
# Preview a few rows instead of printing the entire array.
test[:5]

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Same configuration as the training transformer: one-hot encode the last
# column (Embarked), dropping the first category's indicator, and pass every
# other column through unchanged.
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(sparse_output=False, drop='first'), [-1]),
    ],
    remainder='passthrough',
)

In [None]:
# Fit the transformer on the test array and encode it; the one-hot columns come
# first, followed by the passthrough columns.
# NOTE(review): the encoder is fitted on the test data here, so the column
# layout only matches the training layout if the same Embarked categories
# occur in both sets — confirm.
transformed_test = ct.fit_transform(test)

## Feature Scaling

In [None]:
# Standardize the Fare column (last column) of the submission features with the
# scaler `sc` that was fitted on the training split in the "Feature Scaling"
# section. No new scaler is created or fitted here, because the models expect
# features scaled with the training mean and standard deviation.
# The transform is applied exactly once: running it a second time on values
# that are already standardized would distort them.
transformed_test[:, -1:] = sc.transform(transformed_test[:, -1:])

In [None]:
# Standardize the Age column (fourth from the end) of the submission features
# with the scaler `sc2` that was fitted on the training split in the
# "Feature Scaling" section. No new scaler is created or fitted here, because
# the models expect features scaled with the training mean and standard deviation.
# The transform is applied exactly once: running it a second time on values
# that are already standardized would distort them.
transformed_test[:, -4:-3] = sc2.transform(transformed_test[:, -4:-3])

In [None]:
# Preview a few encoded and scaled rows; printing the whole array floods the notebook.
transformed_test[:5]

In [None]:
# Returns a copy of transformed_test without column index 2; the result is only
# displayed, not stored.
# NOTE(review): index 2 is presumably PassengerId (the first passthrough column
# after two one-hot Embarked columns) — confirm.
np.delete(transformed_test, [2], 1)

(418, 8)

## Predictions

In [None]:
# Predict survival with the RBF-kernel SVM. Column index 2 is removed first so
# the feature matrix has the same number of columns the model was trained on.
# NOTE(review): index 2 is presumably PassengerId — confirm.
test_pred = kernel_svm_classifier.predict(np.delete(transformed_test, [2], 1))
# Preview the first predictions instead of printing all of them.
test_pred[:10]

In [None]:
# Place passenger ids and predictions side by side as two columns.
ans_array = np.concatenate(
    (PassengerId.values.reshape(-1, 1), test_pred.reshape(-1, 1)),
    axis=1,
)


In [None]:
# Wrap the two-column array in a DataFrame with the submission headers.
ans = pd.DataFrame(ans_array, columns = ['PassengerId','Survived'])
# Sanity check of the resulting object type.
type(ans)

pandas.core.frame.DataFrame

In [None]:
# Write the submission file to the working directory without the index column.
ans.to_csv('submission.csv', index=False)