Data understanding:
    https://www.kaggle.com/c/titanic/data

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import sklearn.linear_model
import sklearn.tree
import sklearn.ensemble
import sklearn.metrics
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from sklearn.model_selection import KFold, cross_val_score
import warnings
from time import time


import warnings
from time import time


warnings.simplefilter('ignore')
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

In [2]:
FILE_LOCATION = "/Users/tarunruchandani/Desktop/Data Science/titanic-kaggle/titanic/"
TRAIN_FILE_NAME = "train.csv"
TEST_FILE_NAME = "test.csv"

TARGET = 'Survived'
IDcol = 'PassengerId'

In [3]:
df_training = pd.read_csv(FILE_LOCATION+TRAIN_FILE_NAME)
df_test = pd.read_csv(FILE_LOCATION+TEST_FILE_NAME)

In [4]:
df_training.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
df_training.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

### Exploratory Data Analysis

In [6]:
print(df_training.shape)
print(df_test.shape)

(891, 12)
(418, 11)


In [7]:
df_training.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [8]:
df_training.describe(include='object')

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Behr, Mr. Karl Howell",male,CA. 2343,C23 C25 C27,S
freq,1,577,7,4,644


Data pre-processing

In [9]:
df_training = df_training.fillna(df_training.mean())
df_test = df_test.fillna(df_test.mean())

In [10]:
cat_columns = df_training.select_dtypes(['object']).columns.tolist()

for var in cat_columns:
    print(var)

Name
Sex
Ticket
Cabin
Embarked


In [11]:
# Label encoding
# --------------
from sklearn.preprocessing import LabelEncoder
cat_columns = df_training.select_dtypes(['object']).columns.tolist()
number = LabelEncoder()

df_training.loc[df_training['Sex'] == "Male"] = 1
df_training.loc[df_training['Sex'] == "male"] = 1

for var in cat_columns:
    df_training[var] = number.fit_transform(df_training[var].astype(str))

df_training[TARGET] = number.fit_transform(df_training[TARGET].astype(str))

df_training.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name             int64
Sex              int64
Age            float64
SibSp            int64
Parch            int64
Ticket           int64
Fare           float64
Cabin            int64
Embarked         int64
dtype: object

In [12]:
df_training.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,1,1,0,0,1.0,1,1,0,1.0,0,0
1,2,1,1,74,1,38.0,1,0,217,71.2833,41,1
2,3,1,3,124,1,26.0,0,0,240,7.925,76,3
3,4,1,1,101,1,35.0,1,0,14,53.1,24,3
4,1,1,1,0,0,1.0,1,1,0,1.0,0,0


Modeling

In [13]:
#Choose all predictors except target & IDcols
predictors = [x for x in df_training.columns if x not in [IDcol, TARGET,'Name','Sex','Ticket','Cabin']]

# Converting predictors and outcome to numpy array
x_train = df_training[predictors].values
y_train = df_training[TARGET].values


In [14]:
# --------------------------------------------------------------
# Create object of Logistic Regression and Random Forests models
# --------------------------------------------------------------

model_LR = sklearn.linear_model.LogisticRegression()
model_RF = sklearn.ensemble.RandomForestClassifier()


In [15]:
# Coss-validation
# Simple K-Fold cross validation. 10 folds.
cv = KFold(n_splits=10)

results = []

for traincv, testcv in cv.split(df_training[predictors]):
    model_LR.fit(x_train[traincv, :], y_train[traincv])
    x_test = df_training.ix[testcv, predictors]
    predicted = model_LR.predict(x_test)
    results.append(sum(abs(predicted - df_training.ix[testcv, TARGET].values))/len(testcv))


# Fit model to whole training dataset
model_LR.fit(x_train, y_train)
model_RF.fit(x_train, y_train)

# Predict training set:
# dtrain_predictions_modelLR = model_LR.predict(df_training[predictors])
# dtrain_predictions_modelRF = model_RF.predict(df_training[predictors])

# dtrain_predprob_modelLR = model_LR.predict_proba(df_training[predictors])[:, 1]
# dtrain_predprob_modelRF = model_RF.predict_proba(df_training[predictors])[:, 1]


# Predict test set:
# dtest_predprob_model_LR = model_LR.predict_proba(df_test[predictors])[:, 1]
# dtest_predprob_model_RF = model_RF.predict_proba(df_test[predictors])[:, 1]


# print coefficients from logistic regression
# coefficients = pd.concat([pd.DataFrame(predictors),pd.DataFrame(np.transpose(model_LR.coef_))], axis = 1)
# print("Coefficients of model")
# print(coefficients)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Results