In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pylab as plt
from sklearn.metrics import balanced_accuracy_score
from sklearn.tree import plot_tree
import sklearn.tree
from sklearn.ensemble import RandomForestClassifier
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

plt.style.use('ggplot')
#pd.set_option('max_columns', 200)

In [6]:
df = pd.read_csv('train.csv')
transported = df['Transported'].astype(int)
df.insert(1, 'transport_num', transported)
df.reindex()
df = df.dropna().copy()
df.columns

Index(['PassengerId', 'transport_num', 'HomePlanet', 'CryoSleep', 'Cabin',
       'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'Name', 'Transported'],
      dtype='object')

In [7]:
df = df[['PassengerId', 'HomePlanet', 'CryoSleep', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Transported']].copy()

# one-hot encoding

In [8]:
# One-hot encode categorical features
df = pd.get_dummies(df, columns=["HomePlanet", "VIP"], drop_first=False)
df

Unnamed: 0,PassengerId,CryoSleep,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,VIP_False,VIP_True
0,0001_01,False,39.0,0.0,0.0,0.0,0.0,0.0,False,0,1,0,1,0
1,0002_01,False,24.0,109.0,9.0,25.0,549.0,44.0,True,1,0,0,1,0
2,0003_01,False,58.0,43.0,3576.0,0.0,6715.0,49.0,False,0,1,0,0,1
3,0003_02,False,33.0,0.0,1283.0,371.0,3329.0,193.0,False,0,1,0,1,0
4,0004_01,False,16.0,303.0,70.0,151.0,565.0,2.0,True,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,False,41.0,0.0,6819.0,0.0,1643.0,74.0,False,0,1,0,0,1
8689,9278_01,True,18.0,0.0,0.0,0.0,0.0,0.0,False,1,0,0,1,0
8690,9279_01,False,26.0,0.0,0.0,1872.0,1.0,0.0,True,1,0,0,1,0
8691,9280_01,False,32.0,0.0,1049.0,0.0,353.0,3235.0,False,0,1,0,1,0


# Split the dataset into train and test

In [9]:
train, test = train_test_split(df, test_size=0.1, random_state=2, shuffle=True)

print("Data shape:")
print("train", train.shape)
print("test", test.shape)

Data shape:
train (5945, 14)
test (661, 14)


# Model assessment

In [10]:
# Make a utility method that we can re-use
# To easily fit and test out model
features = [c for c in df.columns if c != "Transported"]


def fit_and_test_model(model):
    '''
    Trains a model and tests it against both train and test sets
    '''  
    global features

    # Train the model
    model.fit(train[features], train.Transported)

    # Assess its performance
    # -- Train
    predictions = model.predict(train[features])
    train_accuracy = balanced_accuracy_score(train.Transported, predictions)

    # -- Test
    predictions = model.predict(test[features])
    test_accuracy = balanced_accuracy_score(test.Transported, predictions)

    return train_accuracy, test_accuracy

# Fitting a decision tree

In [11]:
# fit a simple tree using only three levels
model = sklearn.tree.DecisionTreeClassifier(random_state=1, max_depth=10) 
dt_train_accuracy, dt_test_accuracy = fit_and_test_model(model)

print("Decision Tree Performance:")
print("Train accuracy", dt_train_accuracy)
print("Test accuracy", dt_test_accuracy)

Decision Tree Performance:
Train accuracy 0.8455005885282267
Test accuracy 0.7851243835817339


# Random Forest

In [12]:
# Create a random forest model with two trees
random_forest = RandomForestClassifier( n_estimators=2,
                                        random_state=2,
                                        verbose=False)

# Train and test the model
train_accuracy, test_accuracy = fit_and_test_model(random_forest)
print("Random Forest Performance:")
print("Train accuracy", train_accuracy)
print("Test accuracy", test_accuracy)

Random Forest Performance:
Train accuracy 0.8997546855901127
Test accuracy 0.7494317036059324


# Altering the number of trees

In [23]:
# n_estimators states how many trees to put in the model
# We will make one model for every entry in this list
# and see how well each model performs 
n_estimators = [2, 5, 10, 20, 50]

# Train our models and report their performance
train_accuracies = []
test_accuracies = []

for n_estimator in n_estimators:
    print("Preparing a model with", n_estimator, "trees...")

    # Prepare the model 
    rf = RandomForestClassifier(n_estimators=n_estimator, 
                                random_state=2, 
                                verbose=False)
    
    # Train and test the result
    train_accuracy, test_accuracy = fit_and_test_model(rf)

    # Save the results
    test_accuracies.append(test_accuracy)
    train_accuracies.append(train_accuracy)

Preparing a model with 2 trees...
Preparing a model with 5 trees...
Preparing a model with 10 trees...
Preparing a model with 20 trees...
Preparing a model with 50 trees...


# Altering the minimum number of samples for split parameter


In [24]:
# Shrink the training set temporarily to explore this
# setting with a more normal sample size
full_trainset = train
#train = full_trainset[:1000] # limit to 1000 samples

min_samples_split = [2, 10, 20, 50, 100, 500]

# Train our models and report their performance
train_accuracies = []
test_accuracies = []

for min_samples in min_samples_split:
    print("Preparing a model with min_samples_split = ", min_samples)

    # Prepare the model 
    rf = RandomForestClassifier(n_estimators=20,
                                min_samples_split=min_samples,
                                random_state=2, 
                                verbose=False)
    
    # Train and test the result
    train_accuracy, test_accuracy = fit_and_test_model(rf)

    # Save the results
    test_accuracies.append(test_accuracy)
    train_accuracies.append(train_accuracy)


# Rol back the trainset to the full set
train = full_trainset

Preparing a model with min_samples_split =  2
Preparing a model with min_samples_split =  10
Preparing a model with min_samples_split =  20
Preparing a model with min_samples_split =  50
Preparing a model with min_samples_split =  100
Preparing a model with min_samples_split =  500


# Altering the model depth

In [25]:
# Shrink the training set temporarily to explore this
# setting with a more normal sample size
full_trainset = train
#train = full_trainset[:500] # limit to 500 samples

max_depths = [2, 4, 6, 8, 10, 15, 20, 50, 100]

# Train our models and report their performance
train_accuracies = []
test_accuracies = []

for max_depth in max_depths:
    print("Preparing a model with max_depth = ", max_depth)

    # Prepare the model 
    rf = RandomForestClassifier(n_estimators=20,
                                max_depth=max_depth,
                                random_state=2, 
                                verbose=False)
    
    # Train and test the result
    train_accuracy, test_accuracy = fit_and_test_model(rf)

    # Save the results
    test_accuracies.append(test_accuracy)
    train_accuracies.append(train_accuracy)

# Rol back the trainset to the full set
train = full_trainset

Preparing a model with max_depth =  2
Preparing a model with max_depth =  4
Preparing a model with max_depth =  6
Preparing a model with max_depth =  8
Preparing a model with max_depth =  10
Preparing a model with max_depth =  15
Preparing a model with max_depth =  20
Preparing a model with max_depth =  50
Preparing a model with max_depth =  100


# An optimised model

In [26]:
# Prepare the model 
rf = RandomForestClassifier(n_estimators=200,
                            max_depth=128,
                            #max_features=25,
                            min_samples_split=2,
                            random_state=2, 
                            verbose=False)

# Train and test the result
print("Training model. This may take 1 - 2 minutes")
train_accuracy, test_accuracy = fit_and_test_model(rf)

# Print out results, compared to the decision tree
data = {"Model": ["Decision tree","Final random forest"],
        "Train sensitivity": [dt_train_accuracy, train_accuracy],
        "Test sensitivity": [dt_test_accuracy, test_accuracy]
        }

pd.DataFrame(data, columns = ["Model", "Train sensitivity", "Test sensitivity"])

Training model. This may take 1 - 2 minutes


Unnamed: 0,Model,Train sensitivity,Test sensitivity
0,Decision tree,0.845501,0.785124
1,Final random forest,1.0,0.795885


In [28]:
# saving predictions with their respective Ids from the test set
predictions = pd.DataFrame(data = { 'PassengerId' : test['PassengerId'],
                                    'Transported' : test['Transported']               } )

predictions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 661 entries, 5968 to 7152
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   PassengerId  661 non-null    object
 1   Transported  661 non-null    bool  
dtypes: bool(1), object(1)
memory usage: 11.0+ KB


In [20]:
# sending predictions to .csv file
predictions.to_csv(path_or_buf = './submission.csv',
                   index = False)