### Titanic | ML Classification Prediction Algorithms with accuracy (79.2%)

### This notebook is divided into 3 parts:
> EDA

> Data Cleaning & Encoding

> ML Classification Prediction

In [None]:
import warnings

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Silence every warning for the rest of the session.
# NOTE(review): a blanket "ignore" filter would also hide any deprecation or
# convergence warnings emitted by later cells — consider narrowing it.
warnings.filterwarnings('ignore')

## EDA

In [None]:
# Load the training split, using the PassengerId column as the row index.
df_train = pd.read_csv("../input/titanic/train.csv",index_col="PassengerId")
df_train.head()

In [None]:
# Load the test split, using the PassengerId column as the row index.
df_test = pd.read_csv("../input/titanic/test.csv", index_col="PassengerId")
df_test.head()

In [None]:
# Distinct values present in the Parch column of the training frame.
df_train.Parch.unique()

In [None]:
# Column dtypes and non-null counts for the test frame.
df_test.info()

In [None]:
# Load gender_submission.csv (indexed by PassengerId) under the name Y_test.
# NOTE(review): presumably this file is Kaggle's sample submission rather than the
# true test labels — confirm before using it to score any predictions.
Y_test = pd.read_csv("../input/titanic/gender_submission.csv", index_col="PassengerId")
Y_test.head()

In [None]:
# Column dtypes and non-null counts for the training frame.
df_train.info()

In [None]:
# Number of missing values in each training column.
df_train.isnull().sum()

## Data Cleaning & Encoding

In [None]:
# Draw missingno's matrix plot for the training frame to see where values are missing.
msno.matrix(df_train)
plt.show()

In [None]:
# Passenger count per embarkation port (counted via the Name column), largest first.
port_counts = df_train.groupby("Embarked")[["Name"]].count()
port_counts.sort_values(by="Name", ascending=False)

In [None]:
# The plain "seaborn" style sheet was renamed to "seaborn-v0_8" in newer Matplotlib
# releases and the old name was later removed, so use whichever name is available.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(seaborn_style)

# Take bar labels and bar heights from the same value_counts() result so every bar
# is paired with its own port code (value_counts() leaves out missing values).
embarked_counts = df_train["Embarked"].value_counts()
plt.bar(list(embarked_counts.index), embarked_counts.values,
        color="#0b91a3", width=0.4, label="Embarked bar plot")
plt.xlabel("Embarked")
plt.ylabel("Number of passengers")
plt.show()

In [None]:
# The most frequent Embarked value in the training data ("S") is used to fill the
# missing entries in both frames. It is computed rather than hard-coded.
most_repeated = df_train["Embarked"].mode()[0]

# Assign the filled column back explicitly: an in-place replace() on the
# `df.Embarked` accessor is a chained operation that is not guaranteed to write
# through to the frame (it does not under pandas Copy-on-Write).
df_train["Embarked"] = df_train["Embarked"].fillna(most_repeated)
df_test["Embarked"] = df_test["Embarked"].fillna(most_repeated)
print("the number of null value in Embarked Column =",df_train.Embarked.isnull().sum())

In [None]:
# Transform the Embarked column to numeric codes.
Embarked_transform_dict = {"S":1, "C":2, "Q":3}
for frame in (df_train, df_test):
    # Map and assign back explicitly instead of a chained in-place replace(), which
    # is not guaranteed to modify the frame. The int64 cast fails loudly if any
    # value is missing or is not one of the three known port codes.
    frame["Embarked"] = frame["Embarked"].map(Embarked_transform_dict).astype("int64")
df_train.head(5)
# Embarked is now cleaned and encoded as integers.

In [None]:
# Count the missing Cabin entries in the training frame.
print("the number of null value in Cabin Column =", df_train.Cabin.isnull().sum())
# Note: 687 of the 891 Cabin values (77%) are null, so the column is dropped from the data.

In [None]:
# Remove the Cabin column from both frames, then display the training frame.
df_train = df_train.drop(columns=["Cabin"])
df_test = df_test.drop(columns=["Cabin"])

df_train

In [None]:
# Fare column: report the spread of values in the training frame.
print("Range of Fare column values = ", df_train.Fare.max() - df_train.Fare.min())
# In the training data the minimum is 0.0 and the maximum is 512.3292;
# a later cell groups the fares into 50-wide bands.

# Fill missing test fares with the test-set mean. The result is assigned back
# explicitly because an in-place replace() on the `df_test.Fare` accessor is a
# chained operation that is not guaranteed to modify the frame.
df_test["Fare"] = df_test["Fare"].fillna(df_test["Fare"].mean())
print("Range of Fare column values = ", df_test.Fare.max() - df_test.Fare.min())

In [None]:
# Cast Fare to int64 in both frames, then display the test frame.
for frame in (df_train, df_test):
    frame["Fare"] = frame["Fare"].astype("int64")

df_test

In [None]:
# Band edges -1, 50, 100, ..., 550 give eleven fare bands labelled 1..11.
bins_i = [-1] + list(range(50, 551, 50))
labels_i = list(range(1, 12))

# Store each passenger's fare band in a temporary "stage" column.
for frame in (df_train, df_test):
    frame["stage"] = pd.cut(frame["Fare"], bins=bins_i, labels=labels_i)

df_train.stage.unique()

In [None]:
# Overwrite Fare with the integer band number and discard the temporary column.
for frame in (df_train, df_test):
    frame["Fare"] = frame.pop("stage").astype("int64")

In [None]:
# Preview the training frame after Fare was replaced by its band number.
df_train.head()

In [None]:
# Distinct Fare values in the test frame at this point.
df_test.Fare.unique()

In [None]:
# Number of distinct Ticket values in the training frame.
len(df_train.Ticket.unique())

In [None]:
# Ticket is removed from both frames; it is not used as a feature.
df_train = df_train.drop(columns=["Ticket"])
df_test = df_test.drop(columns=["Ticket"])

In [None]:
# Preview the training frame after the Ticket column was dropped.
df_train.head()

In [None]:
# The plain "seaborn" style sheet was renamed to "seaborn-v0_8" in newer Matplotlib
# releases and the old name was later removed, so use whichever name is available.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(seaborn_style)
plt.figure(figsize=(4,4))

# Labels and heights come from the same value_counts() result so each bar is
# paired with its own category. The label now names the column actually plotted.
sex_counts = df_train["Sex"].value_counts()
plt.bar(list(sex_counts.index), sex_counts.values,
        color="#0b91a3", width=0.3, label="Sex bar plot")
plt.xlabel("Sex")
plt.ylabel("Number of passengers")
plt.show()

In [None]:
# Sex column: encode the two categories as integers.
Sex_dict = {"male":1, "female":2}
for frame in (df_train, df_test):
    # Map and assign back explicitly instead of a chained in-place replace(), which
    # is not guaranteed to modify the frame. The int64 cast fails loudly if a value
    # is missing or is not a key of Sex_dict.
    frame["Sex"] = frame["Sex"].map(Sex_dict).astype("int64")
df_train.head()

In [None]:
# Numeric code for each title group; anything that maps to none of them becomes 0.
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
# Infrequent titles that are folded into the single "Rare" group.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']

data = [df_train, df_test]
for dataset in data:
    # Extract the word that precedes a "." in the name. A raw string is used so
    # that "\." is not treated as an (invalid) string escape; the pattern itself
    # is unchanged.
    extracted = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Collapse rare titles and alternative spellings onto the common groups.
    normalized = (
        extracted
        .replace(rare_titles, 'Rare')
        .replace(['Mlle', 'Ms'], 'Miss')
        .replace('Mme', 'Mrs')
    )
    # Convert titles into numbers; unmapped or missing titles fall back to 0.
    dataset['Title'] = normalized.map(titles).fillna(0).astype("int64")

# Name has served its purpose once Title exists.
df_train = df_train.drop(['Name'], axis=1)
df_test = df_test.drop(['Name'], axis=1)
df_train

In [None]:
# Re-check the missing-value count of each training column.
df_train.isnull().sum()

In [None]:
# Work on an independent copy of the rows with a known Age, so that assigning to
# its columns neither triggers SettingWithCopyWarning nor touches df_train.
df_Age_train = df_train.loc[df_train["Age"].notna()].copy()
df_Age_train["Age"] = df_Age_train["Age"].astype("float64")

# Distribution of the known ages.
plt.hist(df_Age_train.Age,bins=20,color="#2d6bad")
plt.show()


In [None]:
# Standardise Age: subtract the mean and divide by the standard deviation.
age_mean = df_Age_train["Age"].mean()
age_std = df_Age_train["Age"].std()
df_Age_train["Age"] = (df_Age_train["Age"] - age_mean) / age_std
df_Age_train

In [None]:
# Correlation between Survived and the standardised Age (Series.corr with its default method).
df_Age_train.Survived.corr(df_Age_train.Age)
# Age is dropped in the next step because its correlation with Survived is very small.

In [None]:
# Remove the Age column from both frames, then display the training frame.
df_train = df_train.drop(columns=["Age"])
df_test = df_test.drop(columns=["Age"])
df_train

In [None]:
# Display the test frame after the Age column has been removed.
df_test

In [None]:
# FamilySize = siblings/spouses + parents/children + the passenger themselves.
data = [df_train, df_test]
for dataset in data:
    dataset["FamilySize"] = dataset["SibSp"].add(dataset["Parch"]).add(1)

In [None]:

# IsAlone is 1 when the passenger's FamilySize is exactly 1, otherwise 0.
for dataset in data:
    travels_alone = dataset["FamilySize"].eq(1)
    dataset["IsAlone"] = travels_alone.astype("int64")

# Mean of Survived for each IsAlone value in the training frame.
survival_by_alone = df_train[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()
print (survival_by_alone)

In [None]:
# Annotated heatmap of the pairwise correlations between the training columns.
colormap = plt.cm.RdBu
figure = plt.figure(figsize=(12, 12))
correlations = df_train.corr()
sns.heatmap(
    correlations,
    annot=True,
    cmap=colormap,
    square=True,
    vmax=1.0,
    linewidths=0.1,
    linecolor='white',
)
plt.title("Correlations", size=15)
plt.xlabel("Features")
plt.ylabel("Features")
plt.show()

In [None]:
# Feature columns fed to every model, and the training target.
columns = ["Pclass", "Sex", "Fare", "Embarked", "Title", "IsAlone"]
X_train = df_train.loc[:, columns]
Y_train = df_train.loc[:, "Survived"]
len(Y_train)

In [None]:
# Select the same feature columns from the test frame and show its row count.
X_test = df_test[columns]
len(X_test)

In [None]:
# Row count of Y_test, for comparison with the X_test row count.
len(Y_test)

## ML Classification Prediction

### 1- SGDClassifier (66%)

In [None]:
# Linear classifier trained with stochastic gradient descent (fixed seed).
sgd_clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)
sgd_clf.fit(X_train, Y_train)
Y_pred_SGD = sgd_clf.predict(X_test)

# Accuracy on the training data itself, as a percentage.
sgd_train_accuracy = sgd_clf.score(X_train, Y_train) * 100
print("the train score of SGD = ",round(sgd_train_accuracy, 2),"%")

### 2- Random Forest (78%)

In [None]:
# Random forest. random_state is fixed so that the fitted trees, the printed
# score and the predictions are the same on every run.
random_forest = RandomForestClassifier(n_estimators=40, min_samples_leaf=2, max_features=0.1,
                                       n_jobs=-1, random_state=42)
random_forest.fit(X_train, Y_train)
Y_pred_Random = random_forest.predict(X_test)
# Accuracy on the training data itself, as a percentage.
print("the train score of random_forest = ",round(random_forest.score(X_train, Y_train) *100, 2),"%")

### 3- Logistic Regression (76.6%)

In [None]:
# Logistic regression using the liblinear solver.
logistic_regression = LogisticRegression(max_iter=1000, solver='liblinear')
logistic_regression.fit(X_train, Y_train)
Y_pred_Logistic = logistic_regression.predict(X_test)

# Accuracy on the training data itself, as a percentage.
logistic_train_accuracy = logistic_regression.score(X_train, Y_train) * 100
print("the train score of logistic_regression = ",round(logistic_train_accuracy, 2),"%")

### 4- Decision Tree (78%)

In [None]:
# Decision tree with a fixed seed.
tree = DecisionTreeClassifier(random_state=25)
tree.fit(X_train, Y_train)
Y_pred_Tree= tree.predict(X_test)
# This is accuracy on the training data, so the message says so, in line with the
# other model cells (previously it read "the score of prediction").
print("the train score of decision_tree = ",round(tree.score(X_train, Y_train) * 100,2), "%")

In [None]:
# Mean accuracy of the decision tree under 100-fold cross-validation on the training data.
scores = cross_val_score(estimator=tree, X=X_train, y=Y_train, cv=100, scoring="accuracy")
np.mean(scores)

### 5- SVM (76.55%)

In [None]:

# Support vector classifier with a linear kernel.
clf = svm.SVC(kernel = 'linear')
clf.fit(X_train, Y_train)
Y_predict_svm = clf.predict(X_test)
# This is accuracy on the training data, so the message says so, in line with the
# other model cells (previously it read "the score of prediction").
print("the train score of SVM = ",round(clf.score(X_train, Y_train) * 100,2), "%")

### 6- KNeighbors (79.18%)

In [None]:
# k-nearest-neighbours classifier with k = 5.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, Y_train)
Y_pred_KNN= knn.predict(X_test)
# This is accuracy on the training data, so the message says so, in line with the
# other model cells (previously it read "the score of prediction").
print("the train score of KNeighbors = ",round(knn.score(X_train, Y_train) * 100,2), "%")

In [None]:
# Build the submission from the KNN predictions. PassengerId is taken from
# X_test's index (the PassengerId column of test.csv) instead of assuming the ids
# are the consecutive range starting at 892, so each prediction stays paired with
# the passenger it was made for.
output_csv = {"PassengerId": X_test.index.to_list(), "Survived": Y_pred_KNN}
Y_pre = pd.DataFrame(output_csv).set_index("PassengerId")
Y_pre.to_csv("/kaggle/working/submission.csv")

### 7- MLPClassifier (77.5%)

In [None]:
# Small multi-layer perceptron (hidden layers of 3 and 2 units) trained with L-BFGS.
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(3, 2), random_state=1)
clf.fit(X_train, Y_train)
Y_pred_clf= clf.predict(X_test)
# This is accuracy on the training data, so the message says so, in line with the
# other model cells (previously it read "the score of prediction").
print("the train score of MLPClassifier = ",round(clf.score(X_train, Y_train) * 100,2), "%")

### 8- GaussianNB (74.6%)

In [None]:
# Gaussian naive Bayes with default settings.
gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred_gaussian = gaussian.predict(X_test)

# Accuracy on the training data itself, as a percentage.
gaussian_train_accuracy = gaussian.score(X_train, Y_train) * 100
print("the train score for Gaussian = ", round(gaussian_train_accuracy, 2), "%")

### 9- Perceptron (78.2%)

In [None]:
# Perceptron with default settings.
perceptron = Perceptron()
perceptron.fit(X_train, Y_train)
Y_pred_perceptron = perceptron.predict(X_test)

# Accuracy on the training data itself, as a percentage.
perceptron_train_accuracy = perceptron.score(X_train, Y_train) * 100
print("the train score for Perceptron = ",round(perceptron_train_accuracy, 2), "%")

### 10- XGBoost

In [None]:
# Gradient-boosted trees (xgboost is imported as `xgb` in the notebook's import cell).
xgb_classifer = xgb.XGBClassifier(objective='binary:logistic',
                                  eval_metric= 'logloss',
                                  max_depth=3)

xgb_classifer.fit(X_train, Y_train)
Y_pred_xgb = xgb_classifer.predict(X_test)
# Accuracy on the training data. The message previously said "Perceptron", a
# copy-paste slip from the cell above.
print("the train score for XGBoost = ",round(xgb_classifer.score(X_train, Y_train) * 100, 2), "%")

In [None]:
# Build the submission from the XGBoost predictions. PassengerId is taken from
# X_test's index (the PassengerId column of test.csv) instead of assuming the ids
# are the consecutive range starting at 892, so each prediction stays paired with
# the passenger it was made for.
output_csv = {"PassengerId": X_test.index.to_list(), "Survived": Y_pred_xgb}
Y_pre = pd.DataFrame(output_csv).set_index("PassengerId")
Y_pre.to_csv("/kaggle/working/xgb_submission.csv")

In [None]:
# Summary table of the scores quoted in the section headings, best first.
# These figures are typed in by hand, not computed in this notebook, and the
# XGBoost model is not listed.
model = [
    "SGDClassifier",
    "Random Forest",
    "Logistic Regression",
    "Decision Tree",
    "SVM",
    "KNeighbors",
    "MlPClassifier",
    "GaussianNB",
    "Perceptron",
]
score = [66, 78, 76.6, 78, 76.55, 79.18, 77.5, 74.6, 78.2]
data_dict = dict(models=model, test_score=score)

# Number the rows from 1 instead of 0.
data_score = pd.DataFrame(data_dict, index=pd.RangeIndex(start=1, stop=len(model) + 1))
data_score.sort_values(by="test_score", ascending=False)

### Thanks for reading my notebook :)