In [1]:
import pandas as pd
import numpy as np


In [2]:
##train = pd.read_csv('../input/train.csv')
##test = pd.read_csv('../input/test.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
test['Survived'] = np.nan

In [3]:
data=train.append(test,ignore_index=True,sort=False)

In [4]:
##Add missing fare to passangerID 1044 and create new Fare_bin feature
data['Fare'].fillna(8, inplace = True) 

In [5]:
data['Ticket_count'] = data.Ticket.apply(lambda x: data[data['Ticket']==x].shape[0] )
data['Fare_tickect']= data.apply(lambda x: x.Fare/x.Ticket_count,axis=1 )
data['Fare_bin'] = pd.qcut(data['Fare_tickect'], 4,labels=('Fare_bin1','Fare_bin2','Fare_bin3','Fare_bin4'))

In [6]:
## New Title feature created 
data['Title']=data['Name'].str.split(', ').str[1].str.split('.').str[0]
data['Title'] = data['Title'].replace(['Ms','Mlle'], 'Miss')
data['Title'] = data['Title'].replace(['Mme'], 'Mrs')
data['Title'] = data['Title'].replace(['Dona','Dr','Rev','the Countess','Capt','Lady','Sir','Jonkheer','Don','Major','Col'], 'Rare')

In [7]:
##Add missing Age bases on Pclass and create new Age_bin feature
def Cal_age(cols):
    Age = cols[0]
    Pclass = cols[1]
    
    if pd.isnull(Age):

        if Pclass == 1:
            return 37

        elif Pclass == 2:
            return 29

        else:
            return 24

    else:
        return Age

data['Age'] = data[['Age','Title']].apply(Cal_age,axis=1)
data['Age_bin'] = pd.cut(data['Age'].astype(int), 5, labels=('Age_bin1','Age_bin2','Age_bin3','Age_bin4','Age_bin5'))

In [8]:
## New Family size feature
def Cal_Family_bin(cols):
    FamilyZize = cols[0] +cols[1]
    if FamilyZize == 0:
        return 'Alone'
    elif 1 <= FamilyZize <= 3:
        return 'Family'
    elif FamilyZize >= 4:
        return 'Big_family'
    
data['Family_type'] = data[['SibSp','Parch']].apply(Cal_Family_bin,axis=1)

In [9]:
# Missing Embarked info.
data.loc[data['Embarked'].isnull(), 'Embarked'] = 'S'

In [10]:
## Two New Feature creates.
##Family_wit_FC_dead = Passanger with a dead female/child in the family.
##Family_wit_M_alive = Passanger with at least one male/no_child survided in the family.


#Familyname feature created from Name and Fare
data['Family_name']=data['Name'].str.split(', ').str[0] + '-' + data['Fare'].astype(str)

# Families with a female or child no survive.
list1=data[((data['Sex']=='female') | (data['Age']<14)) & (data['Survived']==0) ]['Family_name'].tolist()

# Families with male no child survive.
list2=data[(data['Sex']=='male') & (data['Age']>18) & (data['Survived']==1)]['Family_name'].tolist()

In [11]:
def FC_dead(row):
    if row['Family_name'] in list1:
        return 1
    else:
        return 0

In [12]:
def M_Alive(row):
    if row['Family_name'] in list2:
        return 1
    else:
        return 0

In [13]:
data['Family_wit_FC_dead']=data.apply(FC_dead, axis=1)
data['Family_wit_M_alive']=data.apply(M_Alive, axis=1)

In [14]:
data['Family_Name_size'] = data.Family_name.apply(lambda x: data[data['Family_name']==x].shape[0] )

In [15]:
Fare_bin = pd.get_dummies(data['Fare_bin'])
Pclass_bin = pd.get_dummies(data['Pclass'],prefix ='Class')
Title_bin = pd.get_dummies(data['Title'])
Sex_bin = pd.get_dummies(data['Sex'],drop_first=True,prefix ='Sex')
Age_bin = pd.get_dummies(data['Age_bin'])
Family_type = pd.get_dummies(data['Family_type'])
Embarked_bin = pd.get_dummies(data['Embarked'],prefix ='Embarked')
Family_wit_FC_dead=data['Family_wit_FC_dead'].astype(np.uint8)
Family_wit_M_alive=data['Family_wit_M_alive'].astype(np.uint8)

In [16]:
data_cleaned = pd.concat([data['Survived'],Fare_bin,Pclass_bin,Title_bin,Sex_bin,Age_bin,Family_type,Embarked_bin,Family_wit_FC_dead,Family_wit_M_alive],axis=1)

In [17]:
train_cleaned = data_cleaned[data['Survived'].notnull()]
test_cleaned = data_cleaned[data['Survived'].isnull()]

In [18]:
test_cleaned.drop('Survived',axis=1,inplace=True)
PassId =test['PassengerId']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


### 4. Building the model

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix,mean_absolute_error

In [28]:
X = train_cleaned.drop('Survived',axis=1)
y = train_cleaned['Survived']

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## Random Forests

In [30]:
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [31]:
rfc_predictions = rfc.predict(X_test)

In [32]:
print(confusion_matrix(y_test,rfc_predictions))

[[171   2]
 [  2  93]]


In [33]:
print(classification_report(y_test,rfc_predictions))

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99       173
         1.0       0.98      0.98      0.98        95

   micro avg       0.99      0.99      0.99       268
   macro avg       0.98      0.98      0.98       268
weighted avg       0.99      0.99      0.99       268



In [34]:
acc_rfc = str(round(accuracy_score(rfc_predictions, y_test) * 100, 2))
MAE_rfc=str(mean_absolute_error(rfc_predictions, y_test))

print("Mean Absolute Error : " + MAE_rfc)
print("accuracy_score : " + acc_rfc)

Mean Absolute Error : 0.014925373134328358
accuracy_score : 98.51


In [37]:
rfc.fit(X, y)
rfc_predictions_test = rfc.predict(test_cleaned).astype(np.uint8)

In [38]:
output_rfc = pd.DataFrame({ 'PassengerId' : PassId, 'Survived': rfc_predictions_test })
output_rfc.to_csv('submission-rfc.csv', index=False)