## 1. Reading/Importing the Data

In [1]:
# data processing and analysis
import pandas as pd 
print("pandas version: {}". format(pd.__version__))

# scientific computing
import numpy as np 
print("NumPy version: {}". format(np.__version__))

# scientific and publication-ready visualization
import matplotlib 
print("matplotlib version: {}". format(matplotlib.__version__))

# scientific and publication-ready visualization 
import seaborn as sns
print("seaborn version: {}". format(sns.__version__))

# machine learning algorithms
import sklearn 
print("scikit-learn version: {}". format(sklearn.__version__))

# machine learning algorithms
import statsmodels
print("statsmodels version: {}". format(statsmodels.__version__))

# scientific computing and advance mathematics
import scipy as sp 
print("SciPy version: {}". format(sp.__version__))

from sklearn import tree
from sklearn.metrics import accuracy_score, confusion_matrix

pandas version: 0.23.4
NumPy version: 1.15.2
matplotlib version: 3.0.0
seaborn version: 0.9.0
scikit-learn version: 0.20.0
statsmodels version: 0.10.1
SciPy version: 1.1.0


In [2]:
#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.tools.plotting import scatter_matrix

### 1.1. Reading Data from any data source (local/online repository)

In [61]:
# import train data from file
data = pd.read_csv('data/train.csv')

# a dataset should be broken into 3 splits: train, test, and (final) validation
# we will split the train set into train and test data in future sections
data_val  = pd.read_csv('data/test.csv')

# to play with our data, create copy
data1 = data.copy(deep = True)

# however passing by reference is convenient, because we can clean both datasets at once
data_cleaner = [data1, data_val]

## 3. Data cleaning and preparation

### 3.5. Checking for Missing Values and Fix/Drop them

In [62]:
for dataset in data_cleaner:    
    # age: median
    dataset['Age'].fillna(dataset['Age'].median(), inplace = True)

    # embarked: mode
    dataset['Embarked'].fillna(dataset['Embarked'].mode()[0], inplace = True)

    # fare: median
    dataset['Fare'].fillna(dataset['Fare'].median(), inplace = True)
    
    # drop Cabin as it has 687 as null out of 891 (approx 77% of data)
    dataset.drop('Cabin', axis=1, inplace=True)

### 3.1. Convert binary variable (e.g., Sex: male/female) to 0/1

In [64]:
# List of variables to map

varlist =  ['Sex']

# Defining the map function
def binary_map(x):
    return x.map({'male': 1, "female": 0})

# Applying the function to the housing list
for dataset in data_cleaner:
    dataset[varlist] = dataset[varlist].apply(binary_map)

### 3.2. For categorical variables with multiple levels, create dummy features (one-hot encoded)

In [65]:
# Creating a dummy variable for some of the categorical variables and dropping the first one.
dummy1 = pd.get_dummies(data1['Embarked'], prefix='Embarked', drop_first=True)
    
# Adding the results to the master dataframe
data1 = pd.concat([data1, dummy1], axis=1)

In [66]:
# Creating a dummy variable for some of the categorical variables and dropping the first one.
dummy1 = pd.get_dummies(data_val['Embarked'], prefix='Embarked', drop_first=True)
    
# Adding the results to the master dataframe
data_val = pd.concat([data_val, dummy1], axis=1)

In [67]:
# Creating a dummy variable for some of the categorical variables and dropping the first one.
dummy1 = pd.get_dummies(data1['Pclass'], prefix='Pclass', drop_first=True)
    
# Adding the results to the master dataframe
data1 = pd.concat([data1, dummy1], axis=1)

In [68]:
# Creating a dummy variable for some of the categorical variables and dropping the first one.
dummy1 = pd.get_dummies(data_val['Pclass'], prefix='Pclass', drop_first=True)
    
# Adding the results to the master dataframe
data_val = pd.concat([data_val, dummy1], axis=1)

In [69]:
# Creating a dummy variable for some of the categorical variables and dropping the first one.
dummy1 = pd.get_dummies(data1['Sex'], prefix='Male', drop_first=True)
    
# Adding the results to the master dataframe
data1 = pd.concat([data1, dummy1], axis=1)

In [70]:
# Creating a dummy variable for some of the categorical variables and dropping the first one.
dummy1 = pd.get_dummies(data_val['Sex'], prefix='Male', drop_first=True)
    
# Adding the results to the master dataframe
data_val = pd.concat([data_val, dummy1], axis=1)

### 3.4. Create derived variables

In [71]:
data1['FamilySize'] = data1['SibSp'] + data1['Parch'] + 1

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Embarked_Q,Embarked_S,Pclass_2,Pclass_3,Male_1,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,S,0,1,0,1,1,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C,0,0,0,0,0,2


In [72]:
data_val['FamilySize'] = data_val['SibSp'] + data_val['Parch'] + 1

### 3.3. Drop repeated/unnecessary Variables

In [73]:
PassengerId = data_val.PassengerId

In [74]:
# Renaming the column 
data1= data1.rename(columns={ 'Male_1' : 'Male'})
data_val= data_val.rename(columns={ 'Male_1' : 'Male'})

In [75]:
drop_column = ['PassengerId','Pclass', 'Name', 'Sex', 'Ticket', 'Fare', 'Embarked']
data1.drop(drop_column, axis=1, inplace = True)

In [76]:
data1.head(2)

Unnamed: 0,Survived,Age,SibSp,Parch,Embarked_Q,Embarked_S,Pclass_2,Pclass_3,Male,FamilySize
0,0,22.0,1,0,0,1,0,1,1,2
1,1,38.0,1,0,0,0,0,0,0,2


In [77]:
data_val.drop(drop_column, axis=1, inplace = True)

## 4. Model Building

### 4.3. Build model

In [103]:
from sklearn.model_selection import train_test_split

# Separating target column from other features

target = 'Survived'

y = data1[target]
x = data1.drop(columns = target)

# Train and Test dataset split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state = 42)


from sklearn.ensemble import RandomForestClassifier

y = data1["Survived"]

#features = ["Age", "Sex_male", "Embarked_S", "Embarked_Q"]
features = ["Age", "Male","Pclass_2", "Pclass_3","FamilySize"]
X = pd.get_dummies(data1[features])
X_test = pd.get_dummies(x_test[features])

In [104]:
from sklearn.metrics import accuracy_score
model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)
score = accuracy_score(y_test, predictions)
print("Score: ",score)

Score:  0.832089552238806


In [105]:
X_val = pd.get_dummies(data_val[features])
predictions_test = model.predict(X_val)
output = pd.DataFrame({'PassengerId': PassengerId, 'Survived': predictions_test})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [None]:
Kaggle Score: 0.77990