In [None]:
import numpy as np
import pandas as pd
from sklearn import linear_model
import matplotlib.pyplot as plt
import seaborn as sns

data = pd.read_csv('../input/titanic/train.csv')
test = pd.read_csv('../input/titanic/test.csv')

In [None]:
data.head()


In [None]:
def comp_plot(col):
    """Compare the relative frequency of a column's values in train vs. test.

    Draws two bar charts side by side (shared y axis) of the normalized value
    counts of ``col`` in the module-level ``data`` (train) and ``test`` frames,
    and prints the same proportions as text.

    Parameters
    ----------
    col : str
        Name of a column present in both ``data`` and ``test``.

    Returns
    -------
    int
        Always 0 (kept so existing callers see the same return value).
    """
    # normalize expects a bool; the string 'True' only worked because it is truthy.
    train_share = data[col].value_counts(normalize=True)
    test_share = test[col].value_counts(normalize=True)

    fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True)

    train_share.plot(kind='bar', figsize=(15,5), ax=ax1)
    ax1.set_title('Train set')
    test_share.plot(kind='bar', figsize=(15,5), ax=ax2)
    ax2.set_title('Test set')

    print('For Train data set', '\n', train_share, '\n')
    print('For Test data set', '\n', test_share)

    return 0

comp_plot('Sex')

In [None]:
comp_plot('Pclass')

In [None]:
# Share of each Survived value in the training set, printed and plotted.
# normalize takes a bool; the string 'True' only worked because it is truthy.
Sur = data.Survived.value_counts(normalize=True)
print(Sur)
Sur.plot(kind='bar')

#only 38% of people survived

In [None]:
# Side-by-side Age histograms for the train and test sets on a shared y axis.
fig, (ax1, ax2)  = plt.subplots(1, 2, sharey=True, figsize=(15,5))
data.hist(column='Age', ax=ax1)
ax1.set_title('Histogram of Age-Train set')
test.hist(column='Age', ax=ax2)
ax2.set_title('Histogram of Age-Test set')

In [None]:
# Side-by-side Fare histograms for the train and test sets on a shared y axis.
fig, (ax1, ax2)  = plt.subplots(1, 2, sharey=True, figsize=(15,5))
data.hist(column='Fare', ax=ax1)
ax1.set_title('Histogram of Fare-Train set')
test.hist(column='Fare', ax=ax2)
ax2.set_title('Histogram of Fare-Test set')

In [None]:
# Left: Age against Pclass, coloured by Survived.
# Right: Fare against Pclass, with Survived shown by both colour and marker size.
fig, (ax1, ax2)  = plt.subplots(1, 2, figsize=(15,5))
sns.scatterplot(x='Pclass', y='Age', hue='Survived', data=data, ax=ax1)
sns.scatterplot(x='Pclass', y='Fare', hue='Survived', size='Survived', data=data, ax=ax2)

In [None]:
# Fare (left) and Age (right) plotted against the Survived label of the training set.
fig, (ax1, ax2)  = plt.subplots(1, 2, figsize=(15,5))
sns.scatterplot(x='Survived', y='Fare', data=data, ax=ax1)
sns.scatterplot(x='Survived', y='Age', data=data, ax=ax2)

In [None]:
# Per-sex share of each Survived value: one row per Sex, one column per Survived value.
# normalize takes a bool; the string 'True' only worked because it is truthy.
df_sur_gen = data.groupby(['Sex'])['Survived'].value_counts(normalize=True).unstack('Survived')
print(df_sur_gen)
df_sur_gen.plot(kind='bar', stacked=True)

In [None]:
# Per-class share of each Survived value: one row per Pclass, one column per Survived value.
df_class = data.groupby(['Pclass'])['Survived'].value_counts(normalize=True).unstack('Survived')
print(df_class)
df_class.plot(kind='bar', stacked=True)
# Observed survival rates from the table above:
# 62% of class 1 passengers survived
# 47% of class 2 passengers survived
# 24% of class 3 passengers survived

In [None]:
data.isnull().sum()

In [None]:
test.isnull().sum()

In [None]:
# Mean Age per (Sex, Pclass) group of the training set, stored in the global `mean_age`
# that fill_Ages reads when it imputes a missing age.
mean_age = data.groupby(['Sex','Pclass'])['Age'].mean()
# Shown as a flat table for inspection only; the result is not assigned.
mean_age.reset_index(name = 'm_Age')

In [None]:
def fill_Ages(row, group_means=None):
    """Return the row's age, imputing a missing one from its (Sex, Pclass) group mean.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Must provide the keys 'Age', 'Sex' and 'Pclass'.
    group_means : pandas.Series, optional
        Mean ages indexed by (Sex, Pclass). When omitted, the module-level
        ``mean_age`` is used, so ``frame.apply(fill_Ages, axis=1)`` keeps working.
        Passing it explicitly avoids depending on whichever frame last rebound
        that global.

    Returns
    -------
    The original 'Age' when present, otherwise the matching group mean.
    """
    if not pd.isnull(row['Age']):
        return row['Age']
    if group_means is None:
        group_means = mean_age
    return group_means[row['Sex'], row['Pclass']]

data['Age'] =data.apply(fill_Ages, axis=1)

In [None]:
# Rebind the global `mean_age` that fill_Ages looks up, this time from the test set
# grouped by (Sex, Pclass), so missing test ages are imputed from test-set group means.
# NOTE: after this cell `mean_age` no longer holds the training-set means.
mean_age = test.groupby(['Sex','Pclass'])['Age'].mean()

test['Age'] = test.apply(fill_Ages, axis=1)

# Assign the filled column back instead of calling fillna(inplace=True) on `test.Fare`:
# the chained in-place form acts on a temporary under pandas Copy-on-Write and leaves
# the frame unchanged.
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

In [None]:
# Mark missing cabins with the placeholder 'Unknown'. The result is assigned back
# because fillna(inplace=True) on an attribute-accessed column is a chained in-place
# call that does not update the frame under pandas Copy-on-Write.
data['Cabin'] = data['Cabin'].fillna('Unknown')
test['Cabin'] = test['Cabin'].fillna('Unknown')

In [None]:
data.Embarked.value_counts()

In [None]:
# Fill missing embarkation ports with 'S'; assigned back rather than using the chained
# fillna(inplace=True), which does not update the frame under pandas Copy-on-Write.
data['Embarked'] = data['Embarked'].fillna('S')

In [None]:
# The title is the text between the comma and the first period of Name.
title_pattern = r',\s*([^\.]*)\s*\.'

for frame in (data, test):
    frame['Title'] = frame['Name'].str.extract(title_pattern, expand=False)

data['Title'].value_counts()

In [None]:
# Title group codes: 1 = noble, 2 = Mrs, 3 = Miss, 4 = Mr, 5 = workers.
title_groups = {
    1: ['Lady', 'Master', 'the Countess', 'Jonkheer', 'Sir', 'Don', 'Dr'],
    2: ['Mrs', 'Mme'],
    3: ['Miss', 'Mlle', 'Ms'],
    4: ['Mr'],
    5: ['Capt', 'Col', 'Major', 'Rev'],
}
# Flatten to the title -> code lookup used by .map(); titles not listed become NaN.
title_map = {title: code for code, titles in title_groups.items() for title in titles}

for frame in (data, test):
    frame['Title'] = frame['Title'].map(title_map)
    # Name has served its purpose once the title is extracted.
    frame.drop(columns='Name', inplace=True)

In [None]:
# fare <= 50           - 1
# 50 < fare <= 100     - 2
# 100 < fare <= 150    - 3
# otherwise            - 4

def Fare_group(fare):
    """Bucket a fare into an ordinal group code.

    Returns 1 for fares up to and including 50, 2 up to 100, 3 up to 150, and 4
    for anything else (including NaN, for which every comparison is false).
    """
    for code, upper_bound in enumerate((50, 100, 150), start=1):
        if fare <= upper_bound:
            return code
    return 4


data['Fare Group'] = data.Fare.map(Fare_group)
#data.drop('Fare', axis=1, inplace=True)

test['Fare Group'] = test.Fare.map(Fare_group)  
#test.drop('Fare', axis=1, inplace=True)

In [None]:
# age <= 10         - 1
# 10 < age <= 20    - 2
# 20 < age <= 40    - 3
# otherwise         - 4

def Age_group(age):
    """Bucket an age into an ordinal group code.

    Returns 1 for ages up to and including 10, 2 up to 20, 3 up to 40, and 4
    for anything else (including NaN, for which every comparison is false).
    """
    if age <= 10:
        return 1
    if age <= 20:
        return 2
    if age <= 40:
        return 3
    return 4

data['Age Group'] = data.Age.map(Age_group)
#data.drop('Age', axis=1, inplace=True)

test['Age Group'] = test.Age.map(Age_group)  
#test.drop('Age', axis=1, inplace=True)

In [None]:
# Encode Sex as an integer: 'female' -> 1, every other value -> 2.
for frame in (data, test):
    frame['Sex'] = frame['Sex'].apply(lambda sex: 1 if sex == 'female' else 2)

In [None]:
def _embarked_code(port):
    """Map an embarkation port to its code: 'S' -> 1, 'C' -> 2, anything else -> 3."""
    if port == 'S':
        return 1
    if port == 'C':
        return 2
    return 3

for frame in (data, test):
    frame['Embarked'] = frame['Embarked'].apply(_embarked_code)

In [None]:
# Number of family members aboard = SibSp + Parch; the two source columns and
# Ticket are dropped afterwards.
for frame in (data, test):
    frame['No_fam_mem'] = frame['SibSp'] + frame['Parch']
    frame.drop(columns=['SibSp', 'Parch', 'Ticket'], inplace=True)

In [None]:
data.No_fam_mem.value_counts()

In [None]:
# travelling alone - 1
# small fam - 2
# large fam - 3
def fam_type(fam_size):
    """Classify a family-member count.

    Returns 1 when the count is exactly 0 (travelling alone), 2 when it is at
    most 5 (small family), and 3 otherwise (large family).
    """
    if fam_size == 0:
        return 1
    return 2 if fam_size <= 5 else 3

# Feature columns shared by both frames, in display order; train also keeps the label.
feature_order = ['PassengerId', 'Pclass', 'Title', 'Sex', 'Age', 'Age Group', 'Fam size',
                 'Fare', 'Fare Group', 'Embarked', 'Cabin']

# Replace the raw family-member count with its category code.
data['Fam size'] = data['No_fam_mem'].map(fam_type)
test['Fam size'] = test['No_fam_mem'].map(fam_type)

data = data.drop(columns='No_fam_mem')[feature_order + ['Survived']]
test = test.drop(columns='No_fam_mem')[feature_order]

In [None]:
data.Cabin.value_counts()         #letter of cabin represent the deck

In [None]:
# Keep only the first character of each Cabin value.
for frame in (data, test):
    frame['Cabin'] = frame['Cabin'].map(lambda cabin: cabin[0])

data.Cabin.value_counts()

In [None]:
# Deck codes: Unknown ('U') -> 1, then A..G -> 2..8 and T -> 9.
deck_map = dict(zip('UABCDEFGT', range(1, 10)))

# Derive the numeric Deck column from the cabin letter, then drop Cabin.
for frame in (data, test):
    frame['Deck'] = frame['Cabin'].map(deck_map)
    frame.drop(columns='Cabin', inplace=True)

In [None]:
data.isnull().sum()

In [None]:
test.isnull().sum()

In [None]:
# Titles absent from title_map come out of .map() as NaN; code them as 3 (the Miss group).
# Assigned back rather than using the chained fillna(inplace=True), which does not
# update the frame under pandas Copy-on-Write.
test['Title'] = test['Title'].fillna(3)

In [None]:
# Pairwise product features: (new column, left factor, right factor).
interaction_specs = [
    ('class_age', 'Pclass', 'Age'),
    ('class_title', 'Pclass', 'Title'),
    ('class_gen', 'Pclass', 'Sex'),
    ('fam_fare', 'Fam size', 'Fare'),
    ('em_fare', 'Embarked', 'Fare'),
    ('title_age', 'Title', 'Age'),
]

# Build the same interaction columns, in the same order, on train and test.
for frame in (data, test):
    for new_col, left, right in interaction_specs:
        frame[new_col] = frame[left] * frame[right]

In [None]:
data.head()

In [None]:
# Drop the raw Age and Fare columns from both frames.
for frame in (data, test):
    frame.drop(columns=['Age', 'Fare'], inplace=True)

In [None]:
cols = data.columns.tolist()
print(cols)

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
col_lst = [ 'Pclass', 'Title', 'Age Group', 'Fam size', 'Fare Group', 'Embarked',
           'Deck', 'class_age', 'class_title', 'class_gen', 'fam_fare', 'em_fare', 'title_age']

# Learn each column's min/max from the training set only ...
data[col_lst] = scaler.fit_transform(data[col_lst])
# ... and apply that same mapping to the test set. Refitting on test (fit_transform)
# would scale the two frames differently, so equal raw values would reach the model
# as different numbers. Test values outside the training range may fall outside [0, 1].
test[col_lst] = scaler.transform(test[col_lst])

In [None]:
# Model input columns, in the order used for both frames.
model_features = ['Pclass', 'Title', 'Sex', 'Age Group', 'Fam size', 'Fare Group', 'Embarked', 'Deck',
                  'class_age', 'class_title', 'class_gen', 'fam_fare', 'em_fare', 'title_age']

# Train keeps the label last; test keeps PassengerId first.
data = data[model_features + ['Survived']]
test = test[['PassengerId'] + model_features]

In [None]:
# Correlation heatmap of all remaining columns, showing only the lower triangle
# (diagonal included); the upper triangle is blanked out with NaN.
corr_mat = data.corr()
# Built-in bool replaces the np.bool alias, which was removed in NumPy 1.24.
lower_mask = np.tril(np.ones(corr_mat.shape, dtype=bool))
df_lt = corr_mat.where(lower_mask)
plt.subplots(figsize=(15,10))
sns.heatmap(df_lt, annot=True, cmap="Spectral", fmt='.2g')

# ************************************

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score
# GridSearchCV     - searches for the best hyperparameters
# StratifiedKFold  - builds folds that preserve the class proportions

# 10 shuffled, stratified folds with a fixed seed; used by cv_score.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

# Feature-only copy of the test set for prediction; `test` itself keeps PassengerId.
test_2 = test.drop('PassengerId', axis=1).copy()

# Label vector and feature matrix of the training set.
target = data['Survived']
train = data.drop('Survived', axis=1)

In [None]:
# Hold out 30% of the labelled training data for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.3, random_state=1)
# train_test_split(X, Y, test_size, random_state)
#   X, Y      - the features and labels being split
#   test_size - size of the held-out set
# Display the four resulting shapes as a sanity check.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# model.fit() trains the model on the given data.
# If y_test holds the true labels for X_test, model.score(X_test, y_test) compares the model's predictions against those labels.
# model.score(X_train, y_train) measures accuracy on the training data; it says nothing about performance on the test data.
# model.predict(X_test) predicts labels for the test set.
# model.score(X_test, y_test) gives the same value as accuracy_score(y_test, model.predict(X_test)).

In [None]:
def model_score(model, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test):
    """Fit `model` on the training split and return its hold-out score in percent.

    The split arguments default to the X_train / X_test / y_train / y_test objects
    that exist when this function is defined. `model` is fitted in place.
    """
    model.fit(X_train, y_train)
    holdout_pct = model.score(X_test, y_test) * 100
    return holdout_pct

In [None]:
def cv_score(model):
    """Return the mean cross-validated accuracy of `model`, in percent.

    Evaluates on the module-level `train` / `target` using the folds defined by `kf`.
    """
    fold_accuracies = cross_val_score(model, train, target, cv=kf, scoring='accuracy')
    mean_accuracy = fold_accuracies.mean()
    return mean_accuracy*100

In [None]:

# For each model family print two numbers: mean 10-fold CV accuracy on the full
# training data (cv_score) and accuracy on the 30% hold-out split (model_score).
print('Cross val score for LR  : ', cv_score(LogisticRegression()))
print('LR Score : ', model_score(LogisticRegression()),'\n')

# max_features='sqrt' is what 'auto' meant for RandomForestClassifier; 'auto' was
# removed in scikit-learn 1.3 and now raises an error.
# NOTE(review): the CV and hold-out forests use different max_depth / min_samples_split,
# so the two RF numbers do not describe the same configuration - confirm this is intended.
print('Cross val score for RF  : ', cv_score(RandomForestClassifier(n_estimators=100, max_depth=7, min_samples_split=2, 
                                                    min_samples_leaf=6, max_features='sqrt', random_state=1)))
print('RF Score : ', model_score(RandomForestClassifier(n_estimators=100, max_depth=15, min_samples_split=6, min_samples_leaf=6, 
                                    max_features='sqrt', random_state=1)), '\n')

print('Cross val score for SVC : ', cv_score(SVC(C=30)))
print('SVC Score : ', model_score(SVC(C=30)), '\n')

print('Cross val score for KNN : ', cv_score(KNeighborsClassifier(n_neighbors=50)))
print('KNN Score : ', model_score(KNeighborsClassifier(n_neighbors=50)), '\n')

print('Cross val score for DT  : ', cv_score(DecisionTreeClassifier(max_depth=12, min_samples_split=2, random_state=1)))
print('DT Score : ', model_score(DecisionTreeClassifier(max_depth=12, min_samples_split=2, random_state=1)), '\n')

#Cross val score for LR  :  81.48189762796505
#LR Score :  77.98507462686567 

#Cross val score for RF  :  83.2808988764045
#RF Score :  77.61194029850746 

#Cross val score for SVC :  81.82272159800249
#SVC Score :  77.98507462686567 

#Cross val score for KNN :  80.13732833957553
#KNN Score :  75.74626865671642 

#Cross val score for DT  :  82.61173533083645
#DT Score :  79.1044776119403

In [None]:
from xgboost import XGBClassifier, plot_importance

# early_stopping_rounds is an estimator parameter in current XGBoost; passing it to
# fit() was deprecated and is no longer accepted, so it is set on the constructor.
xgbc = XGBClassifier(max_depth=15, min_child_weight=1, n_estimators=500, random_state=42, learning_rate=0.01,
                     eval_metric=["error", "logloss"], early_stopping_rounds=15)
# Both splits are tracked so train and hold-out curves can be plotted afterwards.
# NOTE(review): the hold-out split also drives early stopping, so scores reported on
# it later are optimistic.
xgbc.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=True)
# verbose=True prints the evaluation metrics for each boosting round

# Clear early stopping again so a later xgbc.fit(...) without an eval_set still runs,
# as it did when the option was passed only to the fit call above.
xgbc.set_params(early_stopping_rounds=None)
# verbose=True print val_error and logloss for each iteration

In [None]:
y_pred_xgbc = xgbc.predict(X_test)

In [None]:
# Accuracy of the fitted booster on the training split and on the hold-out split.
xgbc_score_train = xgbc.score(X_train, y_train)
xgbc_score_test = accuracy_score(y_test,y_pred_xgbc)    # equivalently xgbc.score(X_test, y_test)

print("Train Prediction Score",xgbc_score_train*100)
print("Test Prediction Score",xgbc_score_test*100)

In [None]:
# Per-round evaluation history: validation_0 is the first eval_set entry (train),
# validation_1 the second (hold-out).
results = xgbc.evals_result()
epochs = len(results['validation_0']['error'])
x_axis = range(0, epochs)

# One figure per tracked metric: (results key, y-axis label, figure title).
curve_specs = (
    ('logloss', 'Log Loss', 'XGBoost Log Loss'),
    ('error', 'Classification Error', 'XGBoost Classification Error'),
)
for metric, y_label, plot_title in curve_specs:
    fig, ax = plt.subplots()
    ax.plot(x_axis, results['validation_0'][metric], label='Train')
    ax.plot(x_axis, results['validation_1'][metric], label='Test')
    ax.legend()
    plt.ylabel(y_label)
    plt.title(plot_title)
    plt.show()

In [None]:
plot_importance(xgbc)
plt.show()

In [None]:
# Refit the same XGBClassifier instance on the full labelled training data (no eval_set),
# replacing the model fitted on the hold-out split above.
xgbc.fit(train, target)

# Predicted labels for the test-set features (PassengerId already removed in test_2).
prediction_xgbc = xgbc.predict(test_2)

In [None]:
# max_features='sqrt' is what 'auto' meant for RandomForestClassifier; 'auto' was
# removed in scikit-learn 1.3 and now raises an error.
model=RandomForestClassifier(n_estimators=100, max_depth=7, min_samples_split=2,min_samples_leaf=6, max_features='sqrt', random_state=1)
model.fit(train, target)

# NOTE: despite the `_dt` suffix these are random-forest predictions; the name is
# kept unchanged in case other cells refer to it.
pred_dt = model.predict(test_2)

In [None]:
# Fit an SVC on the full labelled training data and predict the test set.
# NOTE(review): this uses C=20, while the SVC scored in the comparison cell used C=30 - confirm which is intended.
model = SVC( C=20)
model.fit(train, target)

pred_svc = model.predict(test_2)

In [None]:
# Build the submission from the SVC predictions, one row per test PassengerId,
# and write it to sample_submission.csv without the index column.
sub = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived':pred_svc})
sub.to_csv('sample_submission.csv', index=False)