# Titanic Kaggle dataset

### First goal - Get a generalized prediction as fast as possible

In [459]:
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import roc_auc_score

import pandas as pd

X = pd.read_csv("./train.csv")
test = pd.read_csv('test.csv')

In [460]:
X.describe()

#As we can see here age column has a lot of values missing which can be further seen through the following command
X[X['Age'].isnull()]

# We can do a lot of things to fix this missing data problem.
# 1. Predict the age using some machine learning algorithm (Best case)
# 2. or we can fill the values with a mean/avg value (Simple)
# 3. Omit these observations from our dataset but we already have very less of them and doing this will leave us with very 
#    small dataset

# Here in this project for the sake of simplicity we will fill the null values with the mean age value

# Instead of filling with avg of all people on the ship we can take avg age on the basis of title in the name

X['Initial']=0
test['Initial']=0
for i in X:
    X['Initial']=X.Name.str.extract('([A-Za-z]+)\.')
    test['Initial']=test.Name.str.extract('([A-Za-z]+)\.')
    
#Replace with common initials
X['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don'],['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr'],inplace=True)
test['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don','Dona'],['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr','Mrs'],inplace=True)
X.groupby('Initial')['Age'].mean()

## Assigning the NaN Values with the Ceil values of the mean ages
X.loc[(X.Age.isnull())&(X.Initial=='Mr'),'Age']=33
X.loc[(X.Age.isnull())&(X.Initial=='Mrs'),'Age']=36
X.loc[(X.Age.isnull())&(X.Initial=='Master'),'Age']=5
X.loc[(X.Age.isnull())&(X.Initial=='Miss'),'Age']=22
X.loc[(X.Age.isnull())&(X.Initial=='Other'),'Age']=46

test.groupby('Initial')['Age'].mean()
test.loc[(test.Age.isnull())&(test.Initial=='Mr'),'Age']=33
test.loc[(test.Age.isnull())&(test.Initial=='Mrs'),'Age']=39
test.loc[(test.Age.isnull())&(test.Initial=='Master'),'Age']=8
test.loc[(test.Age.isnull())&(test.Initial=='Miss'),'Age']=22
test.loc[(test.Age.isnull())&(test.Initial=='Other'),'Age']=43

# X['Age_band']=0
# X.loc[X['Age']<=16,'Age_band']=0
# X.loc[(X['Age']>16)&(X['Age']<=32),'Age_band']=1
# X.loc[(X['Age']>32)&(X['Age']<=48),'Age_band']=2
# X.loc[(X['Age']>48)&(X['Age']<=64),'Age_band']=3
# X.loc[X['Age']>64,'Age_band']=4

# test['Age_band']=0
# test.loc[test['Age']<=16,'Age_band']=0
# test.loc[(test['Age']>16)&(test['Age']<=32),'Age_band']=1
# test.loc[(test['Age']>32)&(test['Age']<=48),'Age_band']=2
# test.loc[(test['Age']>48)&(test['Age']<=64),'Age_band']=3
# test.loc[test['Age']>64,'Age_band']=4




In [461]:
X.drop(['Name','Ticket','Cabin','Initial','PassengerId'],axis=1,inplace=True)
test.drop(['Name','Ticket','Cabin','Initial'],axis=1,inplace=True)
test['Fare'].fillna(test.Fare.mean(),inplace=True)

In [462]:
X.dropna(subset=['Embarked'],inplace=True)
y = X.pop('Survived')

In [463]:
categorical_variables = ['Sex','Embarked']

for variable in categorical_variables:
    
    X[variable].fillna("Missing",inplace=True)
    dummies = pd.get_dummies(X[variable],prefix=variable)
    X = pd.concat([X,dummies],axis=1)
    X.drop([variable],axis=1,inplace=True)
    ## For submission testing
    test[variable].fillna("Missing",inplace=True)
    dummies = pd.get_dummies(test[variable],prefix=variable)
    test = pd.concat([test,dummies],axis=1)
    test.drop([variable],axis=1,inplace=True)
    
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.2500,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,26.0,0,0,7.9250,1,0,0,0,1
3,1,35.0,1,0,53.1000,1,0,0,0,1
4,3,35.0,0,0,8.0500,0,1,0,0,1
5,3,33.0,0,0,8.4583,0,1,0,1,0
6,1,54.0,0,0,51.8625,0,1,0,0,1
7,3,2.0,3,1,21.0750,0,1,0,0,1
8,3,27.0,0,2,11.1333,1,0,0,0,1
9,2,14.0,1,0,30.0708,1,0,1,0,0


In [464]:
model = RandomForestRegressor(100,oob_score=True,n_jobs=-1,random_state=42)

model.fit(X,y)

print (roc_auc_score(y,model.oob_prediction_))


0.859870888246


### Important Features

In [465]:
model.feature_importances_

array([ 0.10652127,  0.24453414,  0.05095801,  0.02189126,  0.23866593,
        0.13789445,  0.16742075,  0.00962644,  0.00668806,  0.01579968])

In [466]:
# Creating submission file
S = pd.DataFrame()
S = test['PassengerId']
test.drop(['PassengerId'],axis=1,inplace=True)
pred = model.predict(test)

for i in range(0,len(pred)):
    if (pred[i]>=0.5):
        pred[i]=1
    else:
        pred[i]=0
pred = pd.DataFrame(pred)
pred.columns = ["Survived"]
S = pd.concat([S,pd.DataFrame(pred)],axis=1)

S.to_csv("output.csv")

PermissionError: [Errno 13] Permission denied: 'output.csv'