# Titanic Kaggle dataset

### First goal - Get a generalized prediction as fast as possible

In [166]:
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import roc_auc_score

import pandas as pd

X = pd.read_csv("./train.csv")
y = X.pop('Survived')
test = pd.read_csv('test.csv')

In [167]:
X.describe()

#As we can see here age column has a lot of values missing which can be further seen through the following command
X[X['Age'].isnull()]

# We can do a lot of things to fix this missing data problem.
# 1. Predict the age using some machine learning algorithm (Best case)
# 2. or we can fill the values with a mean/avg value (Simple)
# 3. Omit these observations from our dataset but we already have very less of them and doing this will leave us with very 
#    small dataset

# Here in this project for the sake of simplicity we will fill the null values with the mean age value

X['Age'].fillna(X.Age.mean(),inplace=True)
test['Age'].fillna(test.Age.mean(),inplace=True)


In [168]:
# Getting just the numeric values and leaving categorical values
numeric_variables = list(X.dtypes[X.dtypes!="object"].index)

X[numeric_variables].head()
test['Fare'].fillna(test.Fare.mean(),inplace=True)

In [169]:
model = RandomForestRegressor(n_estimators=100,oob_score=True,random_state=42)

model.fit(X[numeric_variables],y)

# For submission - 
pred = model.predict(test[numeric_variables])
S = pd.DataFrame()
S = test['PassengerId']
for i in range(0,len(pred)):
    if (pred[i]>=0.5):
        pred[i]=1
    else:
        pred[i]=0
pred = pd.DataFrame(pred)
pred.columns = ["Survived"]
S = pd.concat([S,pd.DataFrame(pred)],axis=1)

S.to_csv("output.csv")

In [170]:
model.oob_score_

0.1361695005913669

In [171]:
y_oob = model.oob_prediction_

print (roc_auc_score(y,y_oob))

0.73995515504


In [172]:
def describe_categorical(X):
    
    from IPython.display import display,HTML
    display(HTML(X[X.columns[X.dtypes=="object"]].describe().to_html()))

In [173]:
describe_categorical(X)

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Horgan, Mr. John",male,CA. 2343,G6,S
freq,1,577,7,4,644


In [174]:
X.drop(['Name','Ticket','PassengerId'],axis=1,inplace=True)
test.drop(['Name','Ticket','PassengerId'],axis=1,inplace=True)

In [175]:
def clean_cabin(X):
    try:
        return X[0]
    except TypeError:
        return "None"
    
X["Cabin"] = X.Cabin.apply(clean_cabin)
test["Cabin"] = test.Cabin.apply(clean_cabin)

In [176]:
categorical_variables = ['Sex','Cabin','Embarked']

for variable in categorical_variables:
    
    X[variable].fillna("Missing",inplace=True)
    dummies = pd.get_dummies(X[variable],prefix=variable)
    X = pd.concat([X,dummies],axis=1)
    X.drop([variable],axis=1,inplace=True)
    ## For submission testing
    test[variable].fillna("Missing",inplace=True)
    dummies = pd.get_dummies(test[variable],prefix=variable)
    test = pd.concat([test,dummies],axis=1)
    test.drop([variable],axis=1,inplace=True)
    


In [177]:
model = RandomForestRegressor(100,oob_score=True,n_jobs=-1,random_state=42)

model.fit(X,y)

print (roc_auc_score(y,model.oob_prediction_))


0.863521128261


### Important Features

In [178]:
model.feature_importances_

array([  9.11384671e-02,   2.38891052e-01,   4.43567267e-02,
         2.15831071e-02,   2.15047796e-01,   1.43423437e-01,
         1.58822440e-01,   2.95342368e-03,   3.79055011e-03,
         6.47116172e-03,   4.30998991e-03,   8.59480266e-03,
         1.02403226e-03,   8.12054428e-04,   2.67741854e-02,
         6.64265010e-05,   1.06189189e-02,   0.00000000e+00,
         6.00379221e-03,   1.53176370e-02])

In [179]:
# Creating submission file

In [150]:
test

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_None,Embarked_C,Embarked_Q,Embarked_S
0,3,34.50000,0,0,7.8292,0,1,0,0,0,0,0,0,0,1,0,1,0
1,3,47.00000,1,0,7.0000,1,0,0,0,0,0,0,0,0,1,0,0,1
2,2,62.00000,0,0,9.6875,0,1,0,0,0,0,0,0,0,1,0,1,0
3,3,27.00000,0,0,8.6625,0,1,0,0,0,0,0,0,0,1,0,0,1
4,3,22.00000,1,1,12.2875,1,0,0,0,0,0,0,0,0,1,0,0,1
5,3,14.00000,0,0,9.2250,0,1,0,0,0,0,0,0,0,1,0,0,1
6,3,30.00000,0,0,7.6292,1,0,0,0,0,0,0,0,0,1,0,1,0
7,2,26.00000,1,1,29.0000,0,1,0,0,0,0,0,0,0,1,0,0,1
8,3,18.00000,0,0,7.2292,1,0,0,0,0,0,0,0,0,1,1,0,0
9,3,21.00000,2,0,24.1500,0,1,0,0,0,0,0,0,0,1,0,0,1
