In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Print the full path of every file found under the input directory.
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import re
import math
import matplotlib.pyplot as plt
# Folder that holds the competition CSV files.
filepath = "/kaggle/input/titanic/"

# Load gender_submission.csv and preview its first rows.
sample = pd.read_csv(f"{filepath}gender_submission.csv")
sample.head()

In [None]:
# Load test.csv from the data folder and display it.
test_set = pd.read_csv(f"{filepath}test.csv")
test_set

In [None]:
# Load train.csv from the data folder and display it.
training_set = pd.read_csv(f"{filepath}train.csv")
training_set

In [None]:
def multiple_replace(letters, text):
  """Replace every occurrence of each key of ``letters`` in ``text`` in one pass.

  Args:
    letters: Mapping from a substring to its replacement string.
    text: String to rewrite.

  Returns:
    A copy of ``text`` in which every matched key is replaced by its mapped
    value. ``text`` is returned unchanged when ``letters`` is empty.
  """
  # An empty mapping would compile to a pattern that matches the empty string
  # everywhere, and the dictionary lookup for "" would then raise KeyError.
  if not letters:
    return text

  # Longer keys go first so that a key which is a prefix of another key cannot
  # shadow it (regex alternation takes the first alternative that matches).
  ordered_keys = sorted(letters, key=len, reverse=True)
  regex = re.compile("|".join(map(re.escape, ordered_keys)))

  # For each match, look up the corresponding replacement in the dictionary.
  return regex.sub(lambda mo: letters[mo.group(0)], text)

def preprocess(keys, training_set):
    """Turn a raw passenger table into an all-numeric feature table.

    The input frame is copied first, so the DataFrame passed in is left
    untouched and the function can be re-run on the same raw data.

    Transformations applied:
      * ``Age`` / ``Fare``: missing values are filled with the mean of that
        column in the frame being processed.
      * ``Sex``: ``female`` -> 0, ``male`` -> 1.
      * ``Embarked``: ``S`` -> 0, ``Q`` -> 1, ``C`` -> 2, anything else
        (including missing) -> 3.
      * ``Ticket``: every non-digit character is removed and the rest is
        parsed as a number; tickets without any digit become 0.
      * ``Cabin``: missing -> 0.0; otherwise the text is rewritten with
        ``keys`` through ``multiple_replace`` and converted to ``float``.
      * ``PassengerId`` and ``Name`` are dropped.

    Args:
        keys: Mapping from cabin substrings to the digit strings that replace
            them; passed to ``multiple_replace``.
        training_set: DataFrame holding the columns listed above. Any index
            is accepted.

    Returns:
        A new DataFrame with the same rows and index as ``training_set`` and
        the transformed columns.

    Raises:
        ValueError: If a cabin string is still not numeric after the
            replacement with ``keys``.
    """
    df = training_set.copy()

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection, which may operate on a temporary copy.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

    df['Sex'] = df['Sex'].map({'female': 0, 'male': 1})
    # Values missing from the mapping become NaN and are then coded as 3.
    df['Embarked'] = df['Embarked'].map({'S': 0, 'Q': 1, 'C': 2}).fillna(3)

    # Vectorised digit extraction: independent of the index labels and done in
    # a single pass over the column.
    ticket_digits = df['Ticket'].astype(str).str.replace('[^0-9]', '', regex=True)
    df['Ticket'] = pd.to_numeric(ticket_digits.replace('', '0'))

    def _encode_cabin(cabin):
        # Encode one cabin entry as a float; non-string (missing) entries are 0.0.
        if not isinstance(cabin, str):
            return 0.0
        digits = multiple_replace(keys, cabin)
        return float(digits) if digits else 0.0

    df['Cabin'] = df['Cabin'].map(_encode_cabin).astype(float)

    return df.drop(columns=['PassengerId', 'Name'])



In [None]:
# Keep the test-set passenger IDs: preprocess() drops that column, but the
# submission file needs it.
ids = test_set['PassengerId']

# Cabin encoding table: spaces are removed, the letters A-G become the digits
# 1-7 and the letter T becomes 0.
letters = {" ": ""}
letters.update({deck: str(code) for code, deck in enumerate("ABCDEFG", start=1)})
letters["T"] = "0"

training_set = preprocess(letters, training_set)
test_set = preprocess(letters, test_set)

In [None]:
# Display the training frame to check the result of preprocessing.
training_set

In [None]:
# Correlation heatmap, used only as a hint at which features may matter.
sns.set()
plt.figure(figsize=(15, 15))
correlation = training_set.corr()
autoMatrix = sns.heatmap(
    correlation,
    annot=True,
    center=0,
    vmin=-1,
    vmax=1,
    cmap=sns.diverging_palette(0, 200, n=300),
)

In [None]:
# Count the missing values left in each column of the training frame.
print(training_set.isna().sum())

In [None]:
# Count the missing values left in each column of the test frame.
print(test_set.isna().sum())

In [None]:
# Separate the 'Survived' label from the training features; the test frame is
# used as-is.
y_train = training_set['Survived']
x_train = training_set.drop(columns=['Survived'])
x_test = test_set

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import GridSearchCV
# Show the dimensions of the training features, training labels and test features.
x_train.shape, y_train.shape, x_test.shape

In [None]:
# Fit each scaler on the training features only and reuse the fitted
# statistics for the test features. Re-fitting on the test set would scale the
# two sets differently, so a model trained on one would see shifted inputs on
# the other.
scaler = StandardScaler()
min_max = MinMaxScaler()
x_train_std = scaler.fit_transform(x_train)
x_test_std = scaler.transform(x_test)
x_train_norm = min_max.fit_transform(x_train)
x_test_norm = min_max.transform(x_test)

In [None]:
# Multi-layer perceptron tuned with an exhaustive grid search (6-fold CV) on
# the min-max scaled training features.
# NOTE(review): scikit-learn documents learning_rate as only used with
# solver='sgd', so 'adaptive' likely has no effect here - confirm.
model = MLPClassifier(random_state = 3, early_stopping=True, verbose=2, learning_rate='adaptive', solver='adam')
# Each hidden_layer_sizes entry is a tuple of layer widths; a single layer
# needs the trailing comma, because (10) is just the integer 10.
parameters = {"batch_size": [1, 10, 25],
             "hidden_layer_sizes" : [(10,), (10, 2), (10,10)],
             }
mlp = GridSearchCV(model, parameters, cv=6)
mlp.fit(x_train_norm, y_train)
print("Best values", mlp.best_params_)

In [None]:
from sklearn.metrics import classification_report
# Score the tuned MLP on the same rows it was fitted on, so this report
# describes training performance rather than performance on unseen data.
trainPred = mlp.predict(x_train_norm)
# Display names for the two label values in the report.
target_names = ['0', '1']
print("                         MLP Training Report", '\n')
print(classification_report(y_train, trainPred, target_names=target_names))

In [None]:
from sklearn.svm import SVC
# Hyper-parameter grid for the support vector classifier.
parameters = dict(
    C=[0.01, 0.1, 1, 1.5, 2, 5, 10],
    gamma=[0.01, 0.1, 0.2, 0.3, 0.5, 0.75],
    kernel=['rbf', 'sigmoid'],
)
model = SVC()

# Exhaustive search over the grid with 10-fold cross-validation on the
# standardised training features.
svm = GridSearchCV(model, parameters, cv=10)
svm.fit(x_train_std, y_train)
print("Best values", svm.best_params_)

In [None]:
from sklearn.metrics import classification_report
# Score the tuned SVM on the same rows it was fitted on, so this report
# describes training performance rather than performance on unseen data.
trainPred = svm.predict(x_train_std)
# Display names for the two label values in the report.
target_names = ['0', '1']
print("                         SVM Training Report", '\n')
print(classification_report(y_train, trainPred, target_names=target_names))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, truncnorm, randint
# Search space for the randomized search: randint(low, high) draws integers
# from low (inclusive) up to high (exclusive).
parameters = {"n_estimators" : randint(25,200),
             "max_depth" : randint(5,30),
              "max_leaf_nodes": randint(20,100),
             "min_samples_leaf": randint(1, 5)}
# Seed both the forest and the parameter sampling so that re-running the
# notebook picks the same hyper-parameters and yields the same predictions.
rf = RandomForestClassifier(n_jobs = -1, random_state = 3)
forest = RandomizedSearchCV(rf, param_distributions=parameters, n_iter=50, cv=7, random_state=3)
forest.fit(x_train, y_train)
print("Best values", forest.best_params_)

In [None]:
from sklearn.metrics import classification_report
# Score the tuned random forest on the same rows it was fitted on, so this
# report describes training performance rather than performance on unseen data.
trainPred = forest.predict(x_train)
# Display names for the two label values in the report.
target_names = ['0', '1']
print("                         RF Training Report", '\n')
print(classification_report(y_train, trainPred, target_names=target_names))

In [None]:
# Make predictions for the test data set with the tuned random forest.
# To switch models, pair each one with the features it was fitted on:
# mlp with x_test_norm, svm with x_test_std, forest with x_test.
predictions = forest.predict(x_test)
predictions

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
# Confusion matrix of the random forest on the training data
# (rows are the true classes, columns the predicted classes).
y_pred = forest.predict(x_train)
# SVM alternative: y_pred = svm.predict(x_train_std)
cm = confusion_matrix(y_train, y_pred)
cm

In [None]:
# Pair each saved test PassengerId with its predicted 'Survived' value and
# preview the first rows.
submission = pd.DataFrame({'PassengerId':ids,'Survived':predictions})
submission.head()

In [None]:
# Write the submission table to a CSV file without the index column.
filename = 'Titanic Predictions 1.csv'
submission.to_csv(filename, index=False)
print(f'Saved file: {filename}')