In [27]:
# imports
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import mutual_info_regression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [38]:
# Notebook configuration - every tunable value lives in this cell

# Seed passed to the model so that runs are repeatable
SEED_VALUE = 72

# Split proportions (presumably train/validation; not used in the cells shown here)
TEST_SIZE = 0.2
TRAIN_SIZE = 0.8

# Random forest hyper-parameters
ESTIMATORS = 200
TREE_DEPTH = 20

# Input files
testDataPath = "../input/test.csv"
trainDataPath = "../input/train.csv"

# Output file
outputPath = "../output/submission.csv"

In [29]:
# Functions

def PreProcessData(data):
  """Return a model-ready copy of `data` holding only the feature columns.

  `Sex` is ordinal-encoded with a fixed category order (female -> 0.0,
  male -> 1.0) rather than with an encoder that is re-fitted on every call.
  A re-fitted encoder derives its codes from the categories present in the
  frame it is given, so a frame containing only one category would be
  encoded differently from the training data. With the fixed order the
  training and test frames always share the same codes; for frames that
  contain both categories the result equals the alphabetical ordering a
  freshly fitted `OrdinalEncoder` produces.

  Parameters
  ----------
  data : pandas.DataFrame
    Frame with at least the `Sex` and `Pclass` columns. It is not modified.

  Returns
  -------
  pandas.DataFrame
    New frame with the columns `Sex` (float codes, missing values stay NaN)
    and `Pclass` (unchanged), in that order.

  Raises
  ------
  KeyError
    If a required column is missing from `data`.
  ValueError
    If an encoded column holds a value outside its known categories.
  """

  # The position of a category in its list is its code: 0.0, 1.0, ...
  ordinalCategories = {"Sex": ["female", "male"]}
  requiredColumns = ["Sex", "Pclass"]

  ppData = data.copy()

  # Encode required columns
  for column, categories in ordinalCategories.items():
    values = ppData[column]

    # Fail loudly instead of silently turning an unexpected label into NaN
    unexpected = values[values.notna() & ~values.isin(categories)]
    if not unexpected.empty:
      raise ValueError(
        f"Unexpected values in column '{column}': {unexpected.unique().tolist()}"
      )

    codes = {category: float(code) for code, category in enumerate(categories)}
    ppData[column] = values.map(codes).astype("float64")

  # Drop unwanted columns
  return ppData[requiredColumns]

In [30]:
# Read the training set and separate the label from the features

trainData = pd.read_csv(trainDataPath, index_col="PassengerId")

# Keep the label as its own Series, then take it out of the feature frame
trainTarget = trainData["Survived"]
trainData = trainData.drop(columns=["Survived"])


### Pre-process training data

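We will remove every column except `Sex` and `Pclass`. In particular:

- `PassengerId` - not required as a feature (it is only used as the index)
- `Age` (we will initially drop this and look at adding it back in later with some missing data estimates)
- `Ticket`
- `Cabin`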

We will convert:

- sex - ordinal encoding


In [31]:
# Build the training features: Sex is ordinal-encoded, Pclass is kept as-is

ppData = PreProcessData(trainData)

# Preview the first five rows
ppData.head(n=5)

Unnamed: 0_level_0,Sex,Pclass
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.0,3
2,0.0,1
3,0.0,3
4,0.0,1
5,1.0,3


### Creating model

Our first attempt at creating a model uses `Sex` and `Pclass` only; `Fare` is not included yet because `PreProcessData` keeps just those two columns.

In [32]:
# Fit a random forest on the pre-processed training features

model = RandomForestClassifier(
  max_depth=TREE_DEPTH,
  n_estimators=ESTIMATORS,
  random_state=SEED_VALUE,
)

# fit() is the last expression, so the cell displays the fitted estimator
model.fit(ppData, trainTarget)

RandomForestClassifier(max_depth=20, n_estimators=200, random_state=72)

### Make predictions and save csv

In [39]:
# Predict on the test set and write the submission file

testData = pd.read_csv(testDataPath, index_col="PassengerId")

# Prepare the test features with the same function used for training
predictionData = PreProcessData(testData)
predictions = model.predict(predictionData)

# Attach the predictions to their passengers, then write only that column;
# the PassengerId index is written alongside it
testData["Survived"] = predictions
submission = testData["Survived"]
submission.to_csv(outputPath)