In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Build every file path found beneath the Kaggle input directory, then list them one per line
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### Imports

First we import the libraries we need (more will be added as they become necessary).

In [None]:
# imports 
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC


#### Data visualization
First we are going to load the data and take a quick look to get a sense of what we are dealing with.


In [None]:
# Locations of the Titanic competition files inside the Kaggle input directory
train_path = "/kaggle/input/titanic/train.csv"
test_path  = "/kaggle/input/titanic/test.csv"

# Read both CSV files into DataFrames
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# train_data = train.copy()
# test_data = test.copy()

In [None]:
# Preview the first five rows of the training data (head() defaults to 5 rows)
train.head()

In [None]:
# Overview of the training data: dtypes, missing values and cardinality
separator = '--------'

print('General information')
train.info()

# Percentage of missing values per column, highest first
missing_counts = train.isna().sum()
na_percentage = (missing_counts / len(train) * 100).sort_values(ascending=False)
print(separator)
print('Percentage of NA per property sorted')
print(separator)
print(na_percentage)

# Number of distinct values per column, lowest first
unique_counts = train.nunique().sort_values()
print(separator)
print('Unique values for duplications and other useful info')
print(separator)
print(unique_counts)

Now let's look at some plots that will help us figure out some relationships in the data.


In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
# Non-numeric columns (e.g. Name, Sex, Ticket) are excluded explicitly:
# DataFrame.corr() raises on them in pandas >= 2.0 instead of silently skipping them.
numeric_train = train.select_dtypes(include='number')
plt.figure(figsize=(12,12))
sns.heatmap(numeric_train.corr(), annot=True, cmap='Blues')
plt.show()

In [None]:
# Each row of the frame is one horizontal line; highlighted cells mark missing values
missing_mask = train.isna()
ax = sns.heatmap(missing_mask, yticklabels=False, cbar=False)
ax.set_title('Missing values distribution')
plt.show()

In [None]:
# Survival counts per passenger class
class_labels = ['class1', 'class2', 'class3']
ax = sns.countplot(data=train, x='Pclass', hue='Survived')
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(class_labels)
ax.legend(['Deceased', 'Survived'])
plt.show()

### Data cleaning

So we have the following situation:

#### Missing values:
* Case 1: **'Cabin'** has 77% missing values. With more than 3/4 of the data missing, any values we filled in ourselves would not be trustworthy, so the fairest way to proceed is to drop this column.

* Case 2: **'Age'** has 20% missing values. With 20% missing we should fill the gaps using a strategy that keeps the imputed values as close as possible to what the real ones would be.

* Case 3: **'Embarked'** has 0.2% missing values. With less than 0.5% missing we can take a different strategy, since these rows will have almost no effect on the results. In this case we will drop the rows where this property is not present.

#### Categorical values
We also have categorical variables that need to be encoded or dropped
* Case 1: **'Sex'** As it only has 2 possible values, we can encode it manually or with a label encoder.

* Case 2: **'Name'** This property doesn't give useful info, so dropping it is the best option.

* Case 3: **'Ticket'** This property doesn't give useful info. Dropping it is the best option too.

* Case 4: **'Cabin'** Dropped because 77% of its values are missing; it also doesn't look very useful at first sight. With fewer missing values it could be useful (e.g. "travellers on the stern side of the boat survived more than travellers on the bow side"), but 77.1% missing is too much.

* Case 5: **'Embarked'** has 3 possible values. I could use one-hot encoding, but for now I feel more confident doing it by hand (considering this is my first attempt on Kaggle).

In [None]:
# Inspect the distinct Embarked values (and their frequencies) before encoding them by hand
train.Embarked.value_counts()

In [None]:
# Inspect how Fare is distributed across 5 equal-width bins
train.Fare.value_counts(bins=5)

In [None]:
def cleanData(data):
    """Clean a Titanic passenger DataFrame in place and return it.

    Steps, in order:
      1. Drop the 'Cabin', 'Name' and 'Ticket' columns (skipped when already absent).
      2. Fill missing 'Age' and 'Fare' values with the median of the matching
         ('Pclass', 'Sex') group.
      3. Drop the rows whose 'Embarked' value is missing.
      4. Encode 'Sex' (male=0, female=1) and 'Embarked' (S=0, C=1, Q=2).
         Values outside these mappings are left unchanged.

    The function can be called again on an already-cleaned frame without
    raising and without changing it further.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least 'Pclass', 'Sex', 'Age', 'Fare' and 'Embarked'.

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in, modified in place.
    """
    sex_codes = {'male': 0, 'female': 1}
    embarked_codes = {'S': 0, 'C': 1, 'Q': 2}

    # Data missing and categorical to drop.
    # errors='ignore' keeps a second call on the same frame from raising KeyError.
    data.drop(columns=['Cabin', 'Name', 'Ticket'], errors='ignore', inplace=True)

    # Data missing Case 2 ('Age') and 'Fare' (missing in test):
    # impute with the median of passengers sharing the same class and sex.
    for column in ('Age', 'Fare'):
        group_median = data.groupby(['Pclass', 'Sex'])[column].transform('median')
        data[column] = data[column].fillna(group_median)

    # Data missing Case 3
    data.dropna(axis=0, subset=['Embarked'], inplace=True)

    # Categorical data: assign the encoded column back to the frame.
    # An in-place replace on the `data[col]` accessor may act on a temporary
    # object and never reach the frame (pandas Copy-on-Write).
    data['Sex'] = data['Sex'].map(lambda value: sex_codes.get(value, value))
    data['Embarked'] = data['Embarked'].map(lambda value: embarked_codes.get(value, value))

    return data

In [None]:
# Run the shared cleaning routine on the training set first, then on the test set
clean_train, clean_test = (cleanData(frame) for frame in (train, test))

#### Check cleaning

After cleaning data we have to check that all is going well

In [None]:
# Print the column summary of each cleaned frame to confirm no unexpected nulls or dtypes remain
for cleaned_frame in (clean_train, clean_test):
    cleaned_frame.info()

#### Modeling

With the data cleaned we proceed to train and test models.

In [None]:
# Set X and y from the cleaned training frame.
# clean_train is referenced explicitly instead of `train`, so this cell does not
# depend on cleanData having modified `train` in place as a side effect.
y = clean_train['Survived']
X = pd.get_dummies(clean_train.drop('Survived', axis=1))

# # Polynomial features
# features = PolynomialFeatures(degree=2)
# X = features.fit_transform(X)

# # Standard Scaler
# sc = StandardScaler()
# X = sc.fit_transform(X)

# Hold out 20% of the rows for validation (fixed seed for a reproducible split)
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.2, random_state=42)


In [None]:
def fitAndPredict(model):
    """Fit `model` on the training split and return its accuracy on the validation split.

    Reads the notebook-level X_train / y_train for fitting and
    X_val / y_val for scoring; the model passed in is fitted in place.
    """
    model.fit(X_train, y_train)
    return accuracy_score(y_val, model.predict(X_val))

In [None]:
# Let's try some models.
# The randomized estimators share the seed already used by model1, so the
# accuracies printed below are reproducible from one run to the next.
model1 = LogisticRegression(solver='liblinear', random_state=42)
model2 = GradientBoostingClassifier(random_state=42)
model3 = RandomForestClassifier(n_estimators=50, random_state=42)
model4 = SGDClassifier(random_state=42)
# SVC is left unseeded: per the scikit-learn docs its random_state is ignored
# unless probability=True.
model5 = SVC()

models = [model1, model2, model3, model4, model5]
for i, model in enumerate(models, start=1):
    print("Model ", i,":", model)
    print("ACC: ", fitAndPredict(model))

In [None]:
# Since gradient boosting is the best of the models tried, let's tune it a bit.
# random_state is fixed because max_features=5 makes each split consider a random
# subset of the features; without a seed the accuracy changes on every run.
model = GradientBoostingClassifier(
    min_samples_split=20,
    min_samples_leaf=60,
    max_depth=4,
    max_features=5,
    random_state=42,
)
fitAndPredict(model)

In [None]:
# Polynomial features
# id = clean_test['PassengerId']
# features = PolynomialFeatures(degree=2)
# clean_test = features.fit_transform(clean_test)

# # Standard Scaler
# sc = StandardScaler()
# clean_test = sc.fit_transform(clean_test)

In [None]:
# Deliver (after submitting the tuned gradient boosting model it seemed to score
# lower than the default one, so the default model2 is used here)
predict = model2.predict(clean_test)

# One row per test passenger: the id column plus the predicted label
output = clean_test[['PassengerId']].assign(Survived=predict)
output.to_csv('my_submission.csv', index=False)
print("Submission saved")


---