In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read both CSV files, then stack them (train rows first) into one frame
# so that every feature-engineering step is applied to both at once.
train_d, test_d = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
df = pd.concat([train_d, test_d], ignore_index=True, sort=False)

In [None]:
# Heatmap of the null mask: each highlighted cell is a missing value.
missing_ax = sns.heatmap(df.isnull(), cbar=False)
missing_ax.set_title('Missing values')

In [None]:
df.nunique(axis=0, dropna=True)  # number of distinct non-null values in each column

In [None]:
# New column: family size on board, i.e. Parch plus SibSp.
df['Family'] = df['Parch'].add(df['SibSp'])

In [None]:
# Flag passengers travelling without any family members.
# (Original author's hypothesis: a passenger who is alone has a better
# chance of surviving - not verified in this notebook.)
df['Is_Alone'] = df['Family'].eq(0)

In [None]:
# New column: bucket the ticket fare into four labelled price bands.
# - include_lowest=True keeps a fare of exactly 0 in the 'Low' band
#   (pd.cut bins are right-closed, so 0 would otherwise become NaN).
# - The last edge is open-ended (np.inf) so fares above 120 are labelled
#   'High' instead of silently turning into NaN.
# Missing fares stay NaN.
df['Fare_Category'] = pd.cut(
    df['Fare'],
    bins=[0, 7.90, 14.45, 31.28, np.inf],
    labels=['Low', 'Mid', 'High_Mid', 'High'],
    include_lowest=True,
)

In [None]:
# Fill missing embarkation ports with the most frequent port.
# Assign the result back instead of calling fillna(inplace=True) on the
# attribute accessor: the chained in-place form emits warnings on recent
# pandas and does not modify `df` at all when Copy-on-Write is enabled.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

In [None]:
# Replace missing cabin entries with the literal placeholder 'NA'.
df['Cabin'] = df['Cabin'].fillna('NA')

In [None]:
# Salutation = the text between the first comma and the following period
# of the Name field, with surrounding whitespace removed.
df['Salutation'] = [
    full_name.split(',')[1].split('.')[0].strip()
    for full_name in df.Name
]

In [None]:
# Group passengers by (Sex, Pclass); used below for group-wise Age statistics.
grp_by = df.groupby(by=['Sex', 'Pclass'])

In [None]:
# Impute missing ages with the median age of passengers that share the same
# (Sex, Pclass) group. transform('median') returns a Series aligned with df's
# index, so it can be passed straight to fillna. The result is assigned back;
# previously the group-wise result was computed and then thrown away.
df['Age'] = df['Age'].fillna(grp_by['Age'].transform('median'))

# Fallback for any group that has no known age at all: overall median.
# Note the call `median()` - passing the bare method object is not a valid
# fill value.
df['Age'] = df['Age'].fillna(df['Age'].median())

In [None]:
# Distribution of the known (non-null) ages, 30 bins.
# NOTE(review): newer seaborn releases flag distplot as deprecated - confirm
# the installed version before switching to histplot/displot.
known_ages = df['Age'].dropna()
sns.distplot(known_ages, bins=30, color='darkgreen')

In [None]:
# One age KDE curve per passenger class (training data only).
# The legend entries follow the loop order.
for pclass in (1, 2, 3):
    train_d.loc[train_d.Pclass == pclass, 'Age'].plot(kind="kde")
plt.title("Age density in classes")
plt.legend(("1st", "2nd", "3rd"))

In [None]:
# Passenger-class KDE curve for each gender, drawn from the training data.
# Both the values and the boolean mask come from `train_d`: the previous
# version referenced an undefined name (`td`) and mixed it with the combined
# frame, whose index does not match the training frame.
# The legend entries follow the loop order.
for gender in ("male", "female"):
    train_d.loc[train_d.Sex == gender, 'Pclass'].plot(kind="kde")
plt.title("Training Data - Gender density in classes")
plt.legend(("Male", "Female"))

In [None]:
# Replace the Sex column with integer codes produced by a LabelEncoder.
sex_encoder = LabelEncoder()
df['Sex'] = sex_encoder.fit_transform(df['Sex'])

In [None]:
# One-hot encode the embarkation port and attach the indicator columns to df.
# get_dummies is a top-level pandas function (DataFrame has no such method),
# and its result must be stored - it does not modify df by itself.
# drop_first=True drops the first category to avoid a redundant column.
embarked_dummies = pd.get_dummies(df['Embarked'], prefix="Emb", drop_first=True)
df = df.join(embarked_dummies)
df.head()

In [None]:
# Drop the columns that are not used as model features.
# errors='ignore' is required because some of the listed columns
# ('Deck', 'Age_Range') are not created by any visible cell of this notebook;
# without it the drop raises KeyError. The result is assigned back rather
# than using inplace=True.
df = df.drop(
    columns=['Pclass', 'Fare', 'Cabin', 'Fare_Category', 'Name', 'Salutation',
             'Deck', 'Ticket', 'Embarked', 'Age_Range', 'SibSp', 'Parch', 'Age'],
    errors='ignore',
)

In [None]:
# random forest

# Build the training matrices from the rows that came from train.csv.
# The frames were concatenated with ignore_index=True and train first, so
# the training rows are exactly those whose index is < len(train_d).
# NOTE: the names are historical and read backwards - `label_train` holds the
# feature matrix (X) and `feature_train` holds the Survived target (y).
# NOTE(review): PassengerId is still part of the feature matrix because the
# prediction cell passes it to clf.predict as well - consider removing it
# from both places.
train_rows = df.loc[df.index < len(train_d)]
label_train = train_rows.drop(columns=['Survived'])
feature_train = train_rows['Survived']

clf = RandomForestClassifier(criterion='entropy',
                             n_estimators=700,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             # 'sqrt' is what 'auto' meant for classifiers;
                             # 'auto' is rejected by recent scikit-learn.
                             max_features='sqrt',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)

# Hold out 20% of the training rows; the seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    label_train, feature_train, test_size=0.2, random_state=1)

clf.fit(x_train, np.ravel(y_train))

# Accuracy on the held-out 20%, as a percentage.
holdout_accuracy = clf.score(x_test, y_test) * 100
print(f"Accuracy: {holdout_accuracy:.2f}%")

In [None]:
# 10-fold cross-validated accuracy of the forest on the training split,
# reported as the mean over folds in percent.
result_rf = cross_val_score(estimator=clf, X=x_train, y=y_train,
                            scoring='accuracy', cv=10)
mean_accuracy_pct = round(result_rf.mean() * 100, 2)
print('Score for random forest is : ', mean_accuracy_pct)

In [None]:
# Out-of-fold predictions (10-fold CV) on the training split, shown as an
# annotated confusion-matrix heatmap.
y_pred = cross_val_predict(estimator=clf, X=x_train, y=y_train, cv=10)
rf_confusion = confusion_matrix(y_train, y_pred)
sns.heatmap(rf_confusion, annot=True, fmt='3.0f', cmap="summer")
plt.title('Confusion_matrix for RF', y=1.05, size=15)

In [None]:
# Rows to predict: the ones that came from test.csv. The frames were
# concatenated with ignore_index=True and train first, so the test rows are
# exactly those whose index is >= len(train_d). The target column is removed
# (errors='ignore' in case it is absent) so only features reach the model.
X_to_be_predicted = (
    df.loc[df.index >= len(train_d)]
      .drop(columns=['Survived'], errors='ignore')
)

result = clf.predict(X_to_be_predicted)

# Submission frame: one integer Survived prediction per PassengerId.
submission = pd.DataFrame({'PassengerId': X_to_be_predicted['PassengerId'],
                           'Survived': result.astype(int)})
print(submission.shape)

In [None]:
# Write the submission frame to CSV (no index column) and confirm on stdout.
filename = 'titanic_predict.csv'
submission.to_csv(path_or_buf=filename, index=False)
print('File ' + filename + ' is saved !')