In [None]:
from IPython.core.display import HTML

%matplotlib inline
import pandas as pd
pd.options.display.max_columns = 100
from matplotlib import pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
import numpy as np

pd.options.display.max_rows = 100

# Load the raw training set for a first exploratory look.
data = pd.read_csv(r'/home/manuwas/titanic/train.csv')
data.head()

# Fill missing ages with the median age.
# The result is assigned back rather than calling fillna(inplace=True) on the
# column selection: the in-place form operates on an intermediate object and
# is not guaranteed to update `data` (it is a no-op under pandas Copy-on-Write).
data['Age'] = data['Age'].fillna(data['Age'].median())

# Passenger counts per sex, split by outcome (one row per outcome).
survived_sex = data.loc[data['Survived'] == 1, 'Sex'].value_counts()
dead_sex = data.loc[data['Survived'] == 0, 'Sex'].value_counts()
df = pd.DataFrame([survived_sex, dead_sex])
df.index = ['Survived', 'Dead']

In [None]:
# Feature engineering .. 

def status(feature):
    """Print a one-line confirmation that `feature` has been processed."""
    message_parts = ['Processing', feature, ': ok']
    # print joins its arguments with single spaces, exactly like the
    # multi-argument call form.
    print(*message_parts)

In [None]:
def get_combined_data(train_path=r'/home/manuwas/titanic/train.csv',
                      test_path=r'/home/manuwas/titanic/test.csv'):
    """Load the train and test CSVs and stack them into one feature frame.

    Parameters
    ----------
    train_path : str
        Path of the training CSV; it must contain a 'Survived' column.
    test_path : str
        Path of the test CSV.

    Returns
    -------
    pandas.DataFrame
        Training rows (without 'Survived') followed by the test rows, with a
        fresh 0..n-1 index.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # The target must not leak into the feature frame. `axis` is passed by
    # keyword because the positional form was removed in pandas 2.0.
    train = train.drop('Survived', axis=1)

    # DataFrame.append was removed in pandas 2.0; concat with
    # ignore_index=True also replaces the reset_index / drop('index') pair.
    combined = pd.concat([train, test], ignore_index=True)

    return combined

In [None]:
# Build the merged train+test feature frame. `combined` is the module-level
# frame that the feature-engineering functions below read and update through
# `global combined`.
combined = get_combined_data()
combined.shape

In [None]:
# Preview the first five rows of the combined frame.
combined.head(5)

In [None]:
def get_titles():
    """Add an aggregated 'Title' column to the global `combined` frame.

    The raw title is the text between the first comma and the following
    period of each 'Name' value. It is then mapped to one of six groups
    (Officer, Royalty, Mr, Mrs, Miss, Master); a raw title that is not listed
    below maps to NaN.
    """
    global combined

    def extract_title(name):
        # 'Surname, Title. Given names' -> 'Title'
        after_surname = name.split(',')[1]
        return after_surname.split('.')[0].strip()

    raw_titles = combined['Name'].map(extract_title)

    # aggregated title -> raw titles that belong to it
    title_groups = {
        "Officer": ["Capt", "Col", "Major", "Dr", "Rev"],
        "Royalty": ["Jonkheer", "Don", "Sir", "the Countess", "Dona", "Lady"],
        "Mrs": ["Mme", "Ms", "Mrs"],
        "Miss": ["Mlle", "Miss"],
        "Mr": ["Mr"],
        "Master": ["Master"],
    }
    title_lookup = {
        raw: group
        for group, raws in title_groups.items()
        for raw in raws
    }

    combined['Title'] = raw_titles.map(title_lookup)

In [None]:
# Add the aggregated Title column to `combined`, then preview the result.
get_titles()
combined.head(5)

In [None]:
# Median of every numeric column per (Sex, Pclass, Title) group.
grouped = combined.groupby(['Sex','Pclass','Title'])
# numeric_only=True: `combined` still holds text columns at this point, and
# pandas >= 2.0 raises on them instead of silently dropping them.
grouped.median(numeric_only=True)

In [None]:
# This function drops the Name column since we won't be using it anymore because we created a Title column.
#Then we encode the title values using a dummy encoding.

def process_names():
    """Drop 'Name' and replace 'Title' with one dummy column per title.

    Works on the global `combined` frame: 'Name' is removed in place, then
    `combined` is rebound to a new frame that carries the 'Title_*' dummy
    columns and no 'Title' column.
    """
    global combined

    # Name is no longer needed once Title has been extracted from it.
    del combined['Name']

    # One indicator column per distinct title, appended on the right;
    # the original Title column is dropped from the new frame.
    title_indicators = pd.get_dummies(combined['Title'], prefix='Title')
    combined = pd.concat([combined, title_indicators], axis=1).drop('Title', axis=1)

    status('names')
# Apply the name/title encoding to `combined` and preview the result.
process_names()
combined.head()

In [None]:
#Process fare

def process_fares():
    """Replace missing 'Fare' values in the global `combined` frame with the mean fare."""
    global combined

    # Assign the filled column back. Calling fillna(inplace=True) on
    # `combined.Fare` acts on an intermediate Series and is not guaranteed to
    # update `combined` (it is a no-op under pandas Copy-on-Write).
    combined['Fare'] = combined['Fare'].fillna(combined['Fare'].mean())

    status('fare')
    
# Fill the missing fare in `combined`.
process_fares()

In [None]:
# This function replaces the missing values of Embarked with the most frequent Embarked value, then dummy-encodes the column.
def process_embarked():
    """Fill missing 'Embarked' values with 'S' and dummy-encode the column.

    Rebinds the global `combined` frame to a new frame in which 'Embarked' is
    replaced by one 'Embarked_*' indicator column per port.
    """
    global combined

    # Fill on a local Series instead of fillna(inplace=True) on
    # `combined.Embarked`: the in-place form acts on an intermediate object
    # and is not guaranteed to update `combined`.
    embarked = combined['Embarked'].fillna('S')

    # dummy encoding; the indicators replace the original column
    embarked_dummies = pd.get_dummies(embarked, prefix='Embarked')
    combined = pd.concat([combined.drop('Embarked', axis=1), embarked_dummies], axis=1)

    status('embarked')

# Fill and dummy-encode Embarked in `combined`.
process_embarked()

In [None]:
# This function replaces NaN values with U (for Unknown). It then maps each Cabin value to its first letter.
# Finally it encodes the cabin letters using dummy encoding.

def process_cabin():
    """Reduce 'Cabin' to its first letter and dummy-encode it.

    Missing cabins become 'U' (unknown). Rebinds the global `combined` frame
    to a new frame in which 'Cabin' is replaced by one 'Cabin_*' indicator
    column per letter.
    """
    global combined

    # Fill on a local Series instead of fillna(inplace=True) on
    # `combined.Cabin`: the in-place form acts on an intermediate object and
    # is not guaranteed to update `combined`.
    cabin_letter = combined['Cabin'].fillna('U').map(lambda c: c[0])

    # dummy encoding; the indicators replace the original column
    cabin_dummies = pd.get_dummies(cabin_letter, prefix='Cabin')
    combined = pd.concat([combined.drop('Cabin', axis=1), cabin_dummies], axis=1)

    status('cabin')
    
# Encode Cabin in `combined`, then list the resulting columns and dtypes.
process_cabin()
combined.info()

In [None]:
def process_sex():
    """Recode 'Sex' in the global `combined` frame as 1 (male) / 0 (female).

    Any value other than 'male' or 'female' becomes NaN.
    """
    global combined

    sex_codes = {'male':1,'female':0}
    encoded = combined['Sex'].map(sex_codes)
    combined['Sex'] = encoded

    status('sex')
# Recode Sex in `combined` to 1/0.
process_sex()

In [None]:
def process_pclass():
    """Replace 'Pclass' with one 'Pclass_*' dummy column per class.

    Rebinds the global `combined` frame to the new, wider frame.
    """
    global combined

    class_indicators = pd.get_dummies(combined['Pclass'], prefix="Pclass")

    # Append the indicators on the right, then discard the source column.
    widened = pd.concat([combined, class_indicators], axis=1)
    combined = widened.drop('Pclass', axis=1)

    status('pclass')
    
# Dummy-encode Pclass in `combined`.
process_pclass()

In [None]:
# This function preprocesses the tickets by first extracting the ticket prefix. When no prefix can be extracted it returns XXX.
# It then encodes the prefixes using dummy encoding.

def process_ticket():
    """Reduce 'Ticket' to its prefix and dummy-encode it.

    The prefix is the first whitespace-separated token that is not purely
    digits, after '.' and '/' have been removed; tickets without such a token
    get 'XXX'. 'Ticket' is overwritten in place with the prefix, then the
    global `combined` frame is rebound to a new frame in which 'Ticket' is
    replaced by one 'Ticket_*' indicator column per prefix.
    """
    global combined

    def ticket_prefix(ticket):
        # strip the punctuation, then return the first non-numeric token
        cleaned = ticket.replace('.','').replace('/','')
        for token in cleaned.split():
            token = token.strip()
            if not token.isdigit():
                return token
        return 'XXX'

    combined['Ticket'] = combined['Ticket'].map(ticket_prefix)

    prefix_indicators = pd.get_dummies(combined['Ticket'], prefix='Ticket')
    combined = pd.concat([combined, prefix_indicators], axis=1).drop('Ticket', axis=1)

    status('ticket')
    
# Encode ticket prefixes in `combined`.
process_ticket()

In [None]:
#Processing family
#This part includes creating new variables based on the size of the family (the size is by the way, another variable we create).
#This creation of new variables is done under a realistic assumption: Large families are grouped together, hence they are more likely to get rescued than people traveling alone.

def process_family():
    """Add family-size features to the global `combined` frame.

    New columns:
      FamilySize  - Parch + SibSp + 1 (the +1 counts the passenger)
      Singleton   - 1 when FamilySize == 1, else 0
      SmallFamily - 1 when 2 <= FamilySize <= 4, else 0
      LargeFamily - 1 when FamilySize >= 5, else 0
    """
    global combined

    combined['FamilySize'] = combined['Parch'] + combined['SibSp'] + 1

    # indicator column -> membership test on the family size
    size_bands = [
        ('Singleton', lambda size: size == 1),
        ('SmallFamily', lambda size: 2 <= size <= 4),
        ('LargeFamily', lambda size: 5 <= size),
    ]
    for column, in_band in size_bands:
        combined[column] = combined['FamilySize'].map(
            lambda size, in_band=in_band: 1 if in_band(size) else 0
        )

    status('family')

# This function introduces 4 new features:
# FamilySize : the total number of relatives including the passenger (him/her)self.
# Singleton : a boolean variable that describes families of size = 1
# SmallFamily : a boolean variable that describes families of 2 <= size <= 4
# LargeFamily : a boolean variable that describes families of size >= 5

# Add the family-size features to `combined`.
process_family()
# Only the last expression of a cell is displayed, so the bare
# `combined.shape` below produces no output; `combined.head()` is what shows.
combined.shape
combined.head()

In [None]:
#All None Ages
# Rows of `combined` whose Age is missing.
nan_age = combined[combined['Age'].isnull()] 

# Rows of `combined` with a known Age, followed by their summary statistics.
combined_agep = combined[combined['Age'].notnull()]
combined_agep.describe()

In [None]:
import copy

# Split for an age-imputation model: rows with a known Age are the training
# set (Age is the target), rows with a missing Age are what it would predict.
targets = copy.deepcopy(combined_agep['Age'])

# drop() without inplace returns new frames. The previous in-place drops
# mutated `combined_agep` and `nan_age`, which are filtered selections of
# `combined` (the pattern behind pandas' SettingWithCopyWarning).
train = combined_agep.drop('Age', axis=1)
test = nan_age.drop('Age', axis=1)

In [None]:
# from sklearn.naive_bayes import GaussianNB
# gnb = GaussianNB()

# gnbfit = gnb.fit(train.astype(int), targets.astype(int))
# predicted = gnbfit.predict(test.astype(int))
# print type(predicted)
# print len(predicted)
# print predicted
# new_df = pd.Series.to_frame(answers)

In [None]:
# from sklearn.neural_network import MLPClassifier
# clf = MLPClassifier(hidden_layer_sizes=(15,), random_state=1, max_iter=1, warm_start=True)
# for i in range(2000000):
#     clf.fit(train.astype(int), targets.astype(int))
                    
# predicted = clf.predict(test.astype(int))

In [None]:

#df_predicted = pd.DataFrame(np.array(predicted[:]), index = test.index,  columns = ["Predicted_Age"])
# df_predicted.describe()
#df_predicted

In [None]:
# Impute missing ages with the median age.
# The filled column is assigned back: fillna(inplace=True) on `combined.Age`
# acts on an intermediate Series and is not guaranteed to update `combined`
# (it is a no-op under pandas Copy-on-Write).
combined['Age'] = combined['Age'].fillna(combined['Age'].median())
combined.head()

In [None]:
#scale all features

def scale_all_features():
    """Divide every column of the global `combined` frame by its maximum.

    'PassengerId' is left untouched. The scaled columns are stored as floats.
    A column whose maximum is 0 is left unchanged instead of becoming NaN.
    """
    global combined

    features = [column for column in combined.columns if column != 'PassengerId']

    # Cast to float first: recent pandas returns bool columns from
    # get_dummies, and dividing a bool column by its bool maximum is rejected.
    values = combined[features].astype(float)

    # Dividing by a zero maximum would give 0/0 = NaN; use 1 as the divisor
    # for such columns so they pass through unchanged.
    divisors = values.max().replace(0, 1)
    combined[features] = values / divisors

    print ('Features scaled successfully !')
    
# Scale every feature column of `combined` (all but PassengerId) by its maximum.
scale_all_features()

#  Modeling 

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score

In [None]:
#To evaluate our model we'll be using a 5-fold cross validation with the Accuracy metric.
#To do that, we'll define a small scoring function.

def compute_score(clf, X, y,scoring='accuracy'):
    """Return the mean 5-fold cross-validation score of `clf` on (X, y).

    `scoring` is forwarded unchanged to cross_val_score.
    """
    return np.mean(cross_val_score(clf, X, y, scoring=scoring, cv=5))

In [None]:
#Recovering the train set and the test set from the combined dataset is an easy task.
# train0 = pd.read_csv('/home/rahulmanuwas/Dropbox/Kaggle/train.csv')

def recover_train_test_target(train_path='/home/manuwas/titanic/train.csv'):
    """Split the global `combined` frame back into train and test parts.

    Parameters
    ----------
    train_path : str
        Path of the training CSV; its 'Survived' column supplies the targets,
        and its row count is where `combined` is split.

    Returns
    -------
    (train, test, targets)
        `train` is the first len(targets) rows of `combined`, `test` is the
        remaining rows, and `targets` is the 'Survived' Series.
    """
    global combined

    targets = pd.read_csv(train_path, usecols=['Survived'])['Survived']

    # Split by position at the number of training rows instead of the
    # hard-coded 890/891 labels. `.ix` was removed in pandas 1.0; `.iloc[:n]`
    # selects the same rows as the inclusive label slice `.ix[0:n-1]` did.
    n_train = len(targets)
    train = combined.iloc[:n_train]
    test = combined.iloc[n_train:]

    return train,test,targets

# Split `combined` back into the modelling train/test frames and load the
# Survived targets. This rebinds the `train`, `test` and `targets` names that
# the age-imputation cells above also assigned.
train,test,targets = recover_train_test_target()

In [None]:
#Feature Selection
#In fact, feature selection comes with many benefits:
#It decreases redundancy among the data
#It speeds up the training process
#It reduces overfitting
#Tree-based estimators can be used to compute feature importances, which in turn can be used to discard irrelevant features.

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
# Fit an extra-trees ensemble whose feature importances drive the feature
# selection below. random_state is fixed so the importances, and therefore
# the set of selected features, are the same on every run.
clf = ExtraTreesClassifier(n_estimators=200, random_state=42)
clf = clf.fit(train, targets)

In [None]:
# One row per training column with the importance assigned by the fitted
# extra-trees model, shown from most to least important.
features = pd.DataFrame({
    'feature': train.columns,
    'importance': clf.feature_importances_,
})

features.sort_values(by='importance', ascending=False)

# As you may notice, Title_Mr, Age, Fare, and Sex carry great importance.
# PassengerId also receives a high importance.
# Let's now transform our train set and test set into more compact datasets.

In [None]:
# Wrap the already-fitted extra-trees model (prefit=True, so it is not
# refitted) as a feature selector and reduce the training set with it.
model = SelectFromModel(clf, prefit=True)
train_new = model.transform(train)
train_new.shape

In [None]:
# Apply the same selector to the test set.
test_new = model.transform(test)
test_new.shape

In [None]:
# Hyperparameters tuning
# Random Forest
# sklearn.cross_validation and sklearn.grid_search were deprecated in
# scikit-learn 0.18 and removed in 0.20; model_selection is their replacement.
# The import is local to this cell so the names used here are the
# model_selection versions regardless of what earlier cells imported.
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# random_state is fixed so the search gives the same result on every run.
forest = RandomForestClassifier(max_features='sqrt', random_state=42)

parameter_grid = {
                 'max_depth' : [4,5,6,7,8],
                 'n_estimators': [200,210,240,250],
                 'criterion': ['gini','entropy']
                 }

# The model_selection splitter takes only the number of folds; the labels are
# supplied by GridSearchCV.fit, which stratifies on `targets`.
cross_validation = StratifiedKFold(n_splits=5)

grid_search = GridSearchCV(forest,
                           param_grid=parameter_grid,
                           cv=cross_validation)

grid_search.fit(train_new, targets)

print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

In [None]:
# Now that the model is built by scanning several combinations of the hyperparameters, we can generate an output file to submit on Kaggle.

# Predict with the tuned forest and write the two-column submission file.
output = grid_search.predict(test_new).astype(int)
df_output = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': output,
})
df_output.to_csv('/home/manuwas/result_a_RF.csv',
                 columns=['PassengerId','Survived'],
                 index=False)

In [None]:
# from sklearn.neural_network import MLPClassifier
# clf = MLPClassifier(hidden_layer_sizes=(15,), random_state=1, max_iter=10, warm_start=True)
# for i in range(20000):
#     clf.fit(train_new, targets)
                    
# output = clf.predict(test_new).astype(int)

# df_output = pd.DataFrame()
# df_output['PassengerId'] = test['PassengerId']
# df_output['Survived'] = output
# df_output[['PassengerId','Survived']].to_csv('/home/manuwas/result_a_ANN.csv',index=False)

In [None]:
#Logistic Regression

# import the class
from sklearn.linear_model import LogisticRegression
# Logistic regression with default parameters, fitted on the selected
# training features (fit returns the estimator itself).
logreg = LogisticRegression().fit(train_new, targets)

# Predict survival for the selected test features.
output = logreg.predict(test_new).astype(int)

# Write the two-column submission file.
df_output = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': output,
})
df_output.to_csv('/home/manuwas/result_a_LR.csv',
                 columns=['PassengerId','Survived'],
                 index=False)