In [1]:
# imports
import pandas as pd
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import scipy.stats as sst
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# Notebook variables
# Seed passed as random_state to the seeded scikit-learn models in this notebook.
SEED_VALUE = 72
# Train / validation proportions. Not referenced by any cell visible here.
TRAIN_SIZE = 0.8
TEST_SIZE = 0.2

# Hyper-parameters of the final GradientBoostingClassifier (max_depth / n_estimators).
DEPTH = 5
ESTIMATORS = 25

# Input CSVs; both are read with PassengerId as the index.
trainDataPath = "../input/train.csv"
testDataPath = "../input/test.csv"

# Where the prediction column is written.
outputPath = "../output/submission-01.csv"


In [3]:
# Functions

# Function for comparing different approaches
def score_gradientboosting_model(X_train, X_valid, y_train, y_valid, n_estimators=200, max_depth=20):
    """Fit a gradient-boosting classifier and score it on the validation split.

    Returns:
        tuple: ``(mean_absolute_error, accuracy_score)`` of the predictions
        against ``y_valid``.
    """
    classifier = GradientBoostingClassifier(
        n_estimators=n_estimators,
        random_state=SEED_VALUE,
        max_depth=max_depth,
    )
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_valid)

    error = mean_absolute_error(y_valid, predicted)
    hit_rate = accuracy_score(y_valid, predicted)
    return error, hit_rate

def score_gradientboosting(X_train, X_valid, y_train, y_valid, params):
    """Score gradient boosting over every (n_estimators, depth) combination.

    Args:
        params: mapping with iterables under ``"n_estimators"`` and ``"depth"``.

    Returns:
        list[dict]: one record per combination, estimators varying slowest.
    """
    grid = (
        (estimators, depth)
        for estimators in params["n_estimators"]
        for depth in params["depth"]
    )

    results = []
    for estimators, depth in grid:
        mae, accuracy = score_gradientboosting_model(
            X_train, X_valid, y_train, y_valid,
            n_estimators=estimators, max_depth=depth,
        )
        record = {
            "type": "GradientBoostingClassifier",
            "mae": mae,
            "accuracy": accuracy,
            "estimators": estimators,
            "depth": depth,
        }
        results.append(record)

    return results


def score_randomforest_model(X_train, X_valid, y_train, y_valid, n_estimators=200, max_depth=20):
    """Fit a random-forest classifier and score it on the validation split.

    Returns:
        tuple: ``(mean_absolute_error, accuracy_score)`` of the predictions
        against ``y_valid``.
    """
    classifier = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=SEED_VALUE,
        max_depth=max_depth,
    )
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_valid)

    error = mean_absolute_error(y_valid, predicted)
    hit_rate = accuracy_score(y_valid, predicted)
    return error, hit_rate

def score_randomforest(X_train, X_valid, y_train, y_valid, params):
    """Score random forests over every (n_estimators, depth) combination.

    Args:
        params: mapping with iterables under ``"n_estimators"`` and ``"depth"``.

    Returns:
        list[dict]: one record per combination, estimators varying slowest.
    """
    grid = (
        (estimators, depth)
        for estimators in params["n_estimators"]
        for depth in params["depth"]
    )

    results = []
    for estimators, depth in grid:
        mae, accuracy = score_randomforest_model(
            X_train, X_valid, y_train, y_valid,
            n_estimators=estimators, max_depth=depth,
        )
        record = {
            "type": "RandomForestClassifier",
            "mae": mae,
            "accuracy": accuracy,
            "estimators": estimators,
            "depth": depth,
        }
        results.append(record)

    return results

def score_adaboost_model(X_train, X_valid, y_train, y_valid):
    """Fit an AdaBoost classifier and score it on the validation split.

    The classifier is seeded with SEED_VALUE, like the other ensemble scorers
    in this notebook, so repeated runs produce the same scores.

    Returns:
        tuple: ``(mean_absolute_error, accuracy_score)`` of the predictions
        against ``y_valid``.
    """
    model = AdaBoostClassifier(random_state=SEED_VALUE)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds), accuracy_score(y_valid, preds)

def score_adaboost(X_train, X_valid, y_train, y_valid, params):
    """Score a single default AdaBoost model.

    ``params`` is accepted so the signature matches the other ``score_*``
    functions; it is not read.

    Returns:
        list[dict]: a one-element list holding the model type, MAE and accuracy.
    """
    mae, accuracy = score_adaboost_model(X_train, X_valid, y_train, y_valid)

    return [
        {
            "type": "AdaBoostClassifier",
            "mae": mae,
            "accuracy": accuracy,
        }
    ]

def score_decisiontree_model(X_train, X_valid, y_train, y_valid, max_depth=20):
    """Fit a single decision tree and score it on the validation split.

    Returns:
        tuple: ``(mean_absolute_error, accuracy_score)`` of the predictions
        against ``y_valid``.
    """
    tree = DecisionTreeClassifier(max_depth=max_depth, random_state=SEED_VALUE)
    tree.fit(X_train, y_train)
    predicted = tree.predict(X_valid)

    error = mean_absolute_error(y_valid, predicted)
    hit_rate = accuracy_score(y_valid, predicted)
    return error, hit_rate

def score_decisiontree(X_train, X_valid, y_train, y_valid, params):
    """Score decision trees for every depth listed in ``params["depth"]``.

    Returns:
        list[dict]: one record per depth, in the order the depths are given.
    """
    def evaluate(depth):
        # Train and score one tree, then package the result as a record.
        mae, accuracy = score_decisiontree_model(
            X_train, X_valid, y_train, y_valid, max_depth=depth
        )
        return {
            "type": "DecisionTreeClassifier",
            "mae": mae,
            "accuracy": accuracy,
            "depth": depth,
        }

    return [evaluate(depth) for depth in params["depth"]]

def score_extratrees_model(X_train, X_valid, y_train, y_valid, n_estimators=200, max_depth=20):
    """Fit an extra-trees classifier and score it on the validation split.

    Returns:
        tuple: ``(mean_absolute_error, accuracy_score)`` of the predictions
        against ``y_valid``.
    """
    classifier = ExtraTreesClassifier(
        n_estimators=n_estimators,
        random_state=SEED_VALUE,
        max_depth=max_depth,
    )
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_valid)

    error = mean_absolute_error(y_valid, predicted)
    hit_rate = accuracy_score(y_valid, predicted)
    return error, hit_rate

def score_extratrees(X_train, X_valid, y_train, y_valid, params):
    """Score extra-trees models over every (n_estimators, depth) combination.

    Args:
        params: mapping with iterables under ``"n_estimators"`` and ``"depth"``.

    Returns:
        list[dict]: one record per combination, estimators varying slowest.
    """
    grid = (
        (estimators, depth)
        for estimators in params["n_estimators"]
        for depth in params["depth"]
    )

    results = []
    for estimators, depth in grid:
        mae, accuracy = score_extratrees_model(
            X_train, X_valid, y_train, y_valid,
            n_estimators=estimators, max_depth=depth,
        )
        record = {
            "type": "ExtraTreesClassifier",
            "mae": mae,
            "accuracy": accuracy,
            "estimators": estimators,
            "depth": depth,
        }
        results.append(record)

    return results

def score_lsvc_model(X_train, X_valid, y_train, y_valid):
    """Fit a linear SVM classifier and score it on the validation split.

    The model is seeded with SEED_VALUE, like the other seeded models in this
    notebook, so repeated runs produce the same scores.

    Returns:
        tuple: ``(mean_absolute_error, accuracy_score)`` of the predictions
        against ``y_valid``.
    """
    # max_iter is raised to 10000, as in the original cell.
    model = LinearSVC(max_iter=10000, random_state=SEED_VALUE)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds), accuracy_score(y_valid, preds)

def score_lsvc(X_train, X_valid, y_train, y_valid, params):
    """Score a single LinearSVC model.

    ``params`` is accepted so the signature matches the other ``score_*``
    functions; it is not read.

    Returns:
        list[dict]: a one-element list holding the model type, MAE and accuracy.
    """
    mae, accuracy = score_lsvc_model(X_train, X_valid, y_train, y_valid)

    return [
        {
            "type": "LinearSVC",
            "mae": mae,
            "accuracy": accuracy,
        }
    ]

def create_homeplanet_bins(data):
    """Return a copy of ``data`` with HomePlanet filled and one-hot encoded.

    Missing HomePlanet values become "Unknown". The HomePlanet column itself is
    kept, and one ``f_HomePlanet_<value>`` indicator column is added per
    distinct value.
    """
    frame = data.copy()

    planet_missing = frame["HomePlanet"].isna()
    frame.loc[planet_missing, "HomePlanet"] = "Unknown"

    # Encode a duplicate so the original HomePlanet column survives get_dummies.
    frame["f_HomePlanet"] = frame["HomePlanet"]

    return pd.get_dummies(data=frame, columns=["f_HomePlanet"], prefix=["f_HomePlanet"])

def create_idgroup_bins(data):
    """Return a copy of ``data`` with travel-group features derived from the index.

    The index labels are expected to be strings of the form ``<group>_<n>``.
    The part before the first underscore is taken as the group id.

    Added columns:
        f_GroupId: group part of the index label.
        f_GroupCount: number of rows in ``data`` sharing that group id.
        f_GroupAlone: 1 when the group has a single row, else 0.
        f_LargeGroup: 1 when the group has more than 3 rows, else 0.
    """
    binData = data.copy()

    # Vectorized split of the index labels instead of a Python-level
    # row-by-row apply. to_numpy() makes the assignment positional, so
    # duplicate index labels cannot trigger an alignment error.
    group_ids = binData.index.to_series().str.split("_").str[0]
    binData["f_GroupId"] = group_ids.to_numpy()
    binData["f_GroupCount"] = binData.groupby(["f_GroupId"])["f_GroupId"].transform("count")

    binData["f_GroupAlone"] = 1
    binData.loc[binData["f_GroupCount"] > 1, "f_GroupAlone"] = 0

    binData["f_LargeGroup"] = 0
    binData.loc[binData["f_GroupCount"] > 3, "f_LargeGroup"] = 1

    return binData

def create_cabin_features(data):
    """Return a copy of ``data`` with deck / cabin-number / side features.

    Cabin values are expected to look like ``deck/number/side``. Missing
    cabins are replaced by "U/U/U", so every part reads "U" for them.

    Added columns:
        f_CabinNumber: middle part of Cabin, kept as a string.
        f_Deck, f_Side: first and last parts of Cabin.
        f_Deck_<value>, f_Side_<value>: one-hot indicators for those parts.

    A Cabin value with fewer than three parts yields missing values for the
    absent parts instead of raising.
    """
    binData = data.copy()

    binData["Cabin"] = binData["Cabin"].fillna("U/U/U")

    # Split once with the vectorized string accessor rather than running
    # three separate row-wise applies that each re-split the same text.
    cabin_parts = binData["Cabin"].str.split("/")
    binData["f_DeckTemp"] = cabin_parts.str[0]
    binData["f_CabinNumber"] = cabin_parts.str[1]
    binData["f_SideTemp"] = cabin_parts.str[2]

    # get_dummies consumes the *Temp column, so keep a plain copy first.
    binData["f_Deck"] = binData["f_DeckTemp"]
    binData = pd.get_dummies(data = binData, columns = ["f_DeckTemp"], prefix = ["f_Deck"])

    binData["f_Side"] = binData["f_SideTemp"]
    binData = pd.get_dummies(data = binData, columns = ["f_SideTemp"], prefix = ["f_Side"])

    return binData

def create_destination_bins(data):
    """Return a copy of ``data`` with Destination filled and a route feature.

    Missing Destination values become "Unknown". ``f_HomeDest`` joins
    HomePlanet and Destination with an underscore.
    """
    frame = data.copy()

    destination_missing = frame["Destination"].isna()
    frame.loc[destination_missing, "Destination"] = "Unknown"

    route = frame["HomePlanet"] + "_" + frame["Destination"]
    return frame.assign(f_HomeDest=route)

def create_age_bins(data):
    """Return a copy of ``data`` with an age band and a "young" flag.

    ``f_Age`` is 0 for ages from 0 up to and including 15, then rises by one
    for each boundary exceeded (15, 30, 45, 60, 75). Rows without an Age are
    left without an ``f_Age`` value. ``f_Young`` is 1 for ages strictly below
    15, else 0.
    """
    frame = data.copy()
    ages = frame["Age"]

    frame.loc[ages >= 0, "f_Age"] = 0
    # Later assignments overwrite earlier ones, so the last boundary an age
    # exceeds decides its band.
    for band, boundary in enumerate((15, 30, 45, 60, 75), start=1):
        frame.loc[ages > boundary, "f_Age"] = band

    frame["f_Young"] = 0
    frame.loc[ages < 15, "f_Young"] = 1

    return frame

def create_vip_bins(data):
    """Return a copy of ``data`` with VIP encoded as ``f_vip``.

    ``f_vip`` is 1 where VIP equals True, 2 where VIP is missing, and 0
    otherwise.
    """
    frame = data.copy()
    vip = frame["VIP"]

    frame["f_vip"] = 0
    # The two masks cannot both hold for the same row, so their order is free.
    frame.loc[(vip == True), "f_vip"] = 1
    frame.loc[vip.isna(), "f_vip"] = 2

    return frame

def create_spend_features(data):
    """Return a copy of ``data`` with binned spending features.

    Added columns, in this order:
        f_outsidespend: row sum of FoodCourt, ShoppingMall, Spa and VRDeck.
        f_totalspend: band of the row sum over all five spend columns:
            0 when the sum is 0, 1 when above 0, 2 when above 750,
            3 when no band applied.
        f_RoomService, f_FoodCourt, f_ShoppingMall, f_Spa, f_VRDeck: band of
            each amount: 0 when 0, 1 when above 0, 2 when above 100,
            3 when no band applied (for example a missing amount).
        f_ZeroSpend: 1 when f_totalspend is 0, else 0.
    """
    frame = data.copy()

    all_spend = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
    outside_spend = ["FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

    frame["f_outsidespend"] = frame[outside_spend].sum(axis=1)

    # The row total does not depend on the columns added below, so it is
    # computed once and reused for every band.
    spend_total = frame[all_spend].sum(axis=1)
    frame.loc[spend_total == 0, "f_totalspend"] = 0
    frame.loc[spend_total > 0, "f_totalspend"] = 1
    frame.loc[spend_total > 750, "f_totalspend"] = 2
    frame["f_totalspend"] = frame["f_totalspend"].fillna(3)

    # The same three-band rule applies to each individual spend column.
    for source in all_spend:
        amount = frame[source]
        target = "f_" + source
        frame.loc[amount == 0, target] = 0
        frame.loc[amount > 0, target] = 1
        frame.loc[amount > 100, target] = 2
        frame[target] = frame[target].fillna(3)

    frame["f_ZeroSpend"] = 0
    frame.loc[frame["f_totalspend"] == 0, "f_ZeroSpend"] = 1

    return frame

def create_cryo_bins(data):
    """Return a copy of ``data`` with CryoSleep encoded as ``f_Cryo``.

    Requires an ``f_totalspend`` column in ``data``. ``f_Cryo`` is 1 where
    CryoSleep equals True and 0 where it is known and not True. Rows with a
    missing CryoSleep are resolved from spending: 1 when f_totalspend is 0,
    0 when it is above 0.
    """
    frame = data.copy()

    frame["f_Cryo"] = 0
    frame.loc[frame["CryoSleep"].isna(), "f_Cryo"] = 2
    frame.loc[(frame["CryoSleep"] == True), "f_Cryo"] = 1

    # Assumption carried by the features: an unknown passenger who spent
    # nothing was in cryo, and one who spent something was not.
    unresolved = frame["f_Cryo"] == 2
    spend_band = frame["f_totalspend"]
    frame.loc[unresolved & (spend_band == 0), "f_Cryo"] = 1
    frame.loc[unresolved & (spend_band > 0), "f_Cryo"] = 0

    return frame

def create_features(data):
    """Run every feature-engineering step on a copy of ``data``.

    The steps run in a fixed order. The cryo step reads the ``f_totalspend``
    column produced by the spend step, so spend must run first.
    """
    steps = (
        create_homeplanet_bins,
        create_idgroup_bins,
        create_cabin_features,
        create_destination_bins,
        create_age_bins,
        create_vip_bins,
        create_spend_features,
        create_cryo_bins,
    )

    featureData = data.copy()
    for step in steps:
        featureData = step(featureData)

    return featureData

def score_models(X_train, X_valid, y_train, y_valid):
    """Score every candidate model family on one train / validation split.

    Each family is evaluated over its own small hyper-parameter grid by the
    matching ``score_*`` function.

    Returns:
        list[dict]: the concatenated score records, in the order gradient
        boosting, random forest, decision tree, extra trees, LinearSVC,
        AdaBoost.
    """
    randomForestParams = {
        "n_estimators": [5, 10, 25, 50, 100],
        "depth": [1, 5, 10, 20]
    }

    gradientBoostingParams = {
        "n_estimators": [5, 10, 25, 50, 100],
        "depth": [1, 5, 10, 20]
    }

    decisionTreeParams = {
        "depth": [1, 5, 10, 20, 50, 100]
    }

    # The grid previously ended with a second 100, which trained and reported
    # six identical configurations twice.
    # NOTE(review): the duplicate may have been meant as 1000; confirm before
    # adding a larger value.
    extraTreesParams = {
        "n_estimators": [5, 10, 25, 50, 100, 200, 500],
        "depth": [1, 5, 10, 20, 50, 100]
    }

    # LinearSVC and AdaBoost are scored once with fixed settings; their
    # scorers ignore the params argument.
    linearSVCParams = {}

    scores = []
    scores = scores + (score_gradientboosting(X_train, X_valid, y_train, y_valid, gradientBoostingParams))
    scores = scores + (score_randomforest(X_train, X_valid, y_train, y_valid, randomForestParams))
    scores = scores + (score_decisiontree(X_train, X_valid, y_train, y_valid, decisionTreeParams))
    scores = scores + (score_extratrees(X_train, X_valid, y_train, y_valid, extraTreesParams))
    scores = scores + (score_lsvc(X_train, X_valid, y_train, y_valid, linearSVCParams))
    scores = scores + (score_adaboost(X_train, X_valid, y_train, y_valid, {}))

    return scores


In [4]:
# Load initial data

# PassengerId becomes the index; the feature functions later split its labels on "_".
trainData = pd.read_csv(trainDataPath, index_col="PassengerId")
# pop() removes the target column from trainData in place and returns it.
trainTarget = trainData.pop("Transported")


### Pre-process training data

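`create_features` keeps the original columns and adds engineered `f_*` columns:

- PassengerId - used as the index; the part before `_` identifies the travel group and gives group-size flags
- HomePlanet - missing values filled with "Unknown", then one-hot encoded
- Cabin - split into deck, number and side; deck and side are one-hot encoded
- Destination - missing values filled with "Unknown" and combined with HomePlanet
- Age - binned into 15-year bands, plus a "young" flag
- VIP and CryoSleep - encoded as numbers, with missing values handled separately
- RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - binned by amount, plus total-spend and zero-spend features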


In [5]:
# Preprocessing

# create_features works on a copy, so trainData itself is left unchanged.
ppData = create_features(trainData)

# Preview the engineered frame.
ppData.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,...,f_vip,f_outsidespend,f_totalspend,f_RoomService,f_FoodCourt,f_ShoppingMall,f_Spa,f_VRDeck,f_ZeroSpend,f_Cryo
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,...,0,627.0,1.0,2.0,1.0,1.0,2.0,1.0,0,0
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,...,1,10340.0,2.0,1.0,2.0,0.0,2.0,1.0,0,0
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,...,0,5176.0,2.0,0.0,2.0,2.0,2.0,2.0,0,0
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,...,0,788.0,2.0,2.0,1.0,2.0,2.0,1.0,0,0


### Creating model

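Our first attempt at creating a model uses a gradient boosting classifier trained on the engineered features `f_Cryo`, `f_Deck_B`, `f_Deck_E`, `f_Deck_C`, `f_Side_S`, `f_ZeroSpend`, `f_Young` and `f_totalspend`.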

In [6]:
# Train model

# Engineered columns used as model inputs. The f_Deck_* / f_Side_* entries are
# one-hot columns produced by create_cabin_features.
features = ["f_Cryo", "f_Deck_B", "f_Deck_E", "f_Deck_C", "f_Side_S", "f_ZeroSpend", "f_Young", "f_totalspend"]
# Hyper-parameters and seed come from the notebook variables cell.
model = GradientBoostingClassifier(n_estimators=ESTIMATORS, random_state=SEED_VALUE, max_depth=DEPTH)
# Fitted on every training row; no validation split is held out in this cell.
model.fit(ppData[features], trainTarget)

GradientBoostingClassifier(max_depth=5, n_estimators=25, random_state=72)

### Make predictions and save csv

In [7]:
# Make predictions and save csv

# The test file goes through the same create_features pipeline as the training data.
testData = pd.read_csv(testDataPath, index_col="PassengerId")
predictionData = create_features(testData)

# NOTE(review): `features` names one-hot columns such as "f_Deck_B"; they exist in
# predictionData only if the matching category occurs in the test file. Confirm.
predictions = model.predict(predictionData[features])

# Attach the predictions to testData and write only that column to outputPath.
testData["Transported"] = predictions
testData["Transported"].to_csv(outputPath)
# NOTE(review): the ValueError recorded under this cell does not obviously come from
# the statements above; it may be left over from an earlier version of the cell.
# Re-run from a fresh kernel to confirm.

ValueError: Index Transported invalid