# Titanic Kaggle dataset

### First goal - Get a generalized prediction as fast as possible

In [342]:
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import roc_auc_score

import pandas as pd

# Load the Kaggle Titanic training and test sets from CSV files located
# relative to the current working directory.
X = pd.read_csv("./train.csv")
test = pd.read_csv('test.csv')

In [343]:
# Summary statistics of the training frame.
# NOTE: only the last expression of a cell is rendered, so as written the
# results of the two bare expressions below are computed and then discarded.
X.describe()

# The Age column has many missing values; the affected rows can be listed
# with the following expression.
X[X['Age'].isnull()]

# We can do a lot of things to fix this missing data problem.
# 1. Predict the age using some machine learning algorithm (Best case)
# 2. or we can fill the values with a mean/avg value (Simple)
# 3. Omit these observations from our dataset, but we already have very few of them and doing this will leave us with a very
#    small dataset

# Here in this project for the sake of simplicity we will fill the null values with the mean age value

# Instead of filling with avg of all people on the ship we can take avg age on the basis of title in the name

# Extract each passenger's title: the first run of letters that is immediately
# followed by a period in Name (e.g. "Horgan, Mr. John" -> "Mr").
# - The pattern is a raw string so that "\." is a regex escape rather than an
#   invalid Python string escape.
# - expand=False makes str.extract return a Series instead of a one-column frame.
# - The extraction is vectorised, so it runs once per frame; no loop is needed.
X['Initial'] = X['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
test['Initial'] = test['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Replace with common initials.
# One mapping is shared by both frames; keys that do not occur in a frame are
# simply ignored by replace().
title_map = {
    'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
    'Dr': 'Mr', 'Major': 'Mr', 'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr',
    'Lady': 'Mrs', 'Countess': 'Mrs', 'Dona': 'Mrs',
    'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other',
}
# Assign the result back instead of calling replace(inplace=True) on a selected
# column: in-place mutation of a selected column is not guaranteed to reach the
# parent frame (it does not under pandas copy-on-write).
X['Initial'] = X['Initial'].replace(title_map)
test['Initial'] = test['Initial'].replace(title_map)
X.groupby('Initial')['Age'].mean()

## Fill missing ages with one fixed value per title group.
## (Per the original note, these constants are the ceiling of each group's mean age.)
train_age_by_title = {'Mr': 33, 'Mrs': 36, 'Master': 5, 'Miss': 22, 'Other': 46}
for title, fill_age in train_age_by_title.items():
    needs_age = X.Age.isnull() & (X.Initial == title)
    X.loc[needs_age, 'Age'] = fill_age

test.groupby('Initial')['Age'].mean()
# The test frame uses its own set of per-title fill values.
test_age_by_title = {'Mr': 33, 'Mrs': 39, 'Master': 8, 'Miss': 22, 'Other': 43}
for title, fill_age in test_age_by_title.items():
    needs_age = test.Age.isnull() & (test.Initial == title)
    test.loc[needs_age, 'Age'] = fill_age

# X['Age_band']=0
# X.loc[X['Age']<=16,'Age_band']=0
# X.loc[(X['Age']>16)&(X['Age']<=32),'Age_band']=1
# X.loc[(X['Age']>32)&(X['Age']<=48),'Age_band']=2
# X.loc[(X['Age']>48)&(X['Age']<=64),'Age_band']=3
# X.loc[X['Age']>64,'Age_band']=4

# test['Age_band']=0
# test.loc[test['Age']<=16,'Age_band']=0
# test.loc[(test['Age']>16)&(test['Age']<=32),'Age_band']=1
# test.loc[(test['Age']>32)&(test['Age']<=48),'Age_band']=2
# test.loc[(test['Age']>48)&(test['Age']<=64),'Age_band']=3
# test.loc[test['Age']>64,'Age_band']=4




In [344]:
# Collect the names of the numeric columns, leaving the categorical/text ones out.
# select_dtypes(include="number") keeps only numeric dtypes, whereas comparing
# dtypes against "object" would also let through any non-object, non-numeric
# dtype (e.g. a dedicated string or categorical dtype).
numeric_variables = list(X.select_dtypes(include="number").columns)

X[numeric_variables].head()
# Fill missing Fare values in the test frame with the test frame's mean fare.
# The result is assigned back to the column: fillna(inplace=True) on a selected
# column is not guaranteed to modify the parent frame.
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

In [345]:
def describe_categorical(X):
    """Render summary statistics for the object-dtype columns of ``X``.

    Picks the columns whose dtype compares equal to "object", runs
    ``describe()`` on that sub-frame and shows the result as an HTML table
    through IPython's display machinery. Nothing is returned.
    """
    from IPython.display import HTML, display

    is_object_column = X.dtypes == "object"
    object_columns = X.columns[is_object_column]
    summary_table = X[object_columns].describe()
    display(HTML(summary_table.to_html()))

In [346]:
# Show the summary table for the object-dtype columns of the training frame.
describe_categorical(X)

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Initial
count,891,891,891,204,889,891
unique,891,2,681,147,3,5
top,"Horgan, Mr. John",male,CA. 2343,G6,S,Mr
freq,1,577,7,4,644,529


In [347]:
# Remove the free-text / helper columns from both frames before modelling.
X = X.drop(columns=['Name', 'Ticket', 'Cabin', 'Initial'])
test = test.drop(columns=['Name', 'Ticket', 'Cabin', 'Initial'])


In [348]:
# Discard the training rows that have no Embarked value, then split the
# Survived column off X to serve as the prediction target.
X = X.dropna(subset=['Embarked'])
y = X.pop('Survived')

In [349]:
categorical_variables = ['Sex','Embarked']

# One-hot encode each categorical column in both frames, replacing the original
# column with its indicator columns.
for variable in categorical_variables:
    # Give missing categories an explicit "Missing" label before encoding.
    # The result is assigned back to the column: fillna(inplace=True) on a
    # selected column is not guaranteed to modify the parent frame.
    X[variable] = X[variable].fillna("Missing")
    dummies = pd.get_dummies(X[variable], prefix=variable)
    X = pd.concat([X, dummies], axis=1).drop(columns=[variable])

    ## For submission testing
    test[variable] = test[variable].fillna("Missing")
    # Align the test indicators with the training ones so both frames end up
    # with the same indicator columns in the same order, even when a category
    # occurs in only one of them: categories absent from test become all-zero
    # columns, categories unseen in training are dropped.
    test_dummies = pd.get_dummies(test[variable], prefix=variable).reindex(
        columns=dummies.columns, fill_value=0
    )
    test = pd.concat([test, test_dummies], axis=1).drop(columns=[variable])
    


In [350]:
# Fit a 100-tree random forest regressor on the training features with
# out-of-bag scoring enabled, using all cores and a fixed seed.
model = RandomForestRegressor(
    n_estimators=100,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)

model.fit(X, y)

# ROC AUC of the out-of-bag predictions against the training target.
oob_auc = roc_auc_score(y, model.oob_prediction_)
print(oob_auc)


0.859570877531


### Important Features

In [321]:
# Importance score of each feature in the fitted forest
# (presumably in the column order of X used for fitting — confirm).
model.feature_importances_

array([ 0.19309497,  0.10094418,  0.1660558 ,  0.04320487,  0.01367438,
        0.15939857,  0.14644416,  0.14929701,  0.00885668,  0.00595304,
        0.01307635])

In [322]:
# Creating submission file
# The regressor returns a continuous score per passenger; scores >= 0.5 are
# labelled 1 and the rest 0. The threshold is applied to the whole array at
# once and cast to int so the file holds 0/1 rather than 0.0/1.0.
raw_scores = model.predict(test)
pred = pd.DataFrame(
    {"Survived": (raw_scores >= 0.5).astype(int)},
    index=test.index,  # share test's index so the concat below pairs rows correctly
)
S = pd.concat([test['PassengerId'], pred], axis=1)

# index=False keeps the row index out of the file, so it contains only the
# PassengerId and Survived columns.
S.to_csv("output.csv", index=False)

In [281]:
# Preview the first rows of the prepared test frame instead of rendering the whole frame.
test.head(10)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,892,3,34.50000,0,0,7.8292,0,1,0,1,0
1,893,3,47.00000,1,0,7.0000,1,0,0,0,1
2,894,2,62.00000,0,0,9.6875,0,1,0,1,0
3,895,3,27.00000,0,0,8.6625,0,1,0,0,1
4,896,3,22.00000,1,1,12.2875,1,0,0,0,1
5,897,3,14.00000,0,0,9.2250,0,1,0,0,1
6,898,3,30.00000,0,0,7.6292,1,0,0,1,0
7,899,2,26.00000,1,1,29.0000,0,1,0,0,1
8,900,3,18.00000,0,0,7.2292,1,0,1,0,0
9,901,3,21.00000,2,0,24.1500,0,1,0,0,1
