In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/content/drive/MyDrive/ML_final'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/content/drive/MyDrive/ML_final/train.csv
/content/drive/MyDrive/ML_final/test.csv
/content/drive/MyDrive/ML_final/sample_submission.csv


In [2]:
# modeling library
import sklearn.linear_model                          # linear modeling in scikit-learn

# other model building tools
from sklearn.model_selection import train_test_split # train-test split
from sklearn.metrics import roc_auc_score            # auc score

In [3]:
path             = "/content/drive/MyDrive/ML_final/"
training_dataset = "train.csv"


# reading in the .csv file with pandas
titanic_train    = pd.read_csv(filepath_or_buffer = path + training_dataset)


# checking basic info about the dataset
titanic_train.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [4]:
path             = "/content/drive/MyDrive/ML_final/"
testing_dataset  = 'test.csv'

# importing the testing dataset
titanic_test = pd.read_csv(filepath_or_buffer = path + testing_dataset)

# checking basic info about the dataset
titanic_test.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB


In [5]:
titanic_train['set'] = 'Training'
titanic_test ['set'] = 'Testing'

# concatenating both datasets together for mv and feature engineering
titanic_df = titanic_train.append(other = titanic_test)

# resetting index to avoid problems later in the code
titanic_df.reset_index(drop = False,
                       inplace = True)

  titanic_df = titanic_train.append(other = titanic_test)


In [53]:
# instantiating a correlation matrix
titanic_corr = titanic_train.corr(method = 'pearson').round(decimals = 4)

# transforming correlations to absolute values
titanic_corr.loc[ : , 'Transported' ].apply(func = abs).sort_values(ascending = False)

  titanic_corr = titanic_train.corr(method = 'pearson').round(decimals = 4)


Transported     1.0000
RoomService     0.2446
Spa             0.2211
VRDeck          0.2071
Age             0.0750
FoodCourt       0.0466
ShoppingMall    0.0101
Name: Transported, dtype: float64

In [275]:
# imputing in missing values for RoomService
#titanic_df[ 'VRDeck'].fillna(value = 1, inplace = True)
titanic_df['CryoSleep'].replace({'False': 0, 'True': 1}).astype('float64')
titanic_df['CryoSleep'].fillna(value = 0, inplace = True)
#titanic_df['Age'].fillna(0, inplace=True)
#titanic_df['HomePlanet'].replace({'Europa': 1, 'Earth': 0, 'Mars': 0}, inplace=True)
#titanic_df['HomePlanet'].fillna(1, inplace=True)
#titanic_df['Destination'].replace({'TRAPPIST-1e': 1, 'PSO J318.5-22': 0, '55 Cancri e': 0}, inplace=True)
#titanic_df['Destination'].fillna(1, inplace=True)
titanic_df['VIP'].replace({'False': 0, 'True': 1}).astype('float64')
titanic_df['VIP'].fillna(0, inplace=True)
titanic_df[ 'RoomService' ].fillna(value = 0, inplace = True)
titanic_df[ 'Spa' ].fillna(value = 0, inplace = True)
titanic_df[ 'VRDeck'].fillna(value = 0, inplace = True)
titanic_df[ 'Age'].fillna(0, inplace=True)
titanic_df[ 'FoodCourt'].fillna(0, inplace=True)
titanic_df['ShoppingMall'].fillna(0, inplace=True)
weights = np.array([0.2446, 0.2211, 0.2071, 0.0750, 0.0466, 0.0101])
explanatory_vars = ['RoomService', 'Spa', 'VRDeck', 'Age', 'FoodCourt', 'ShoppingMall']
# setting explanatory variable(s) with most correlated x-variable
x_train = titanic_df[ explanatory_vars][ titanic_df['set'] == 'Training' ]

# setting response variable
y_train = titanic_df[ 'Transported' ][ titanic_df['set']   == 'Training' ]

In [278]:
# developing training and validation sets
x_train_1, x_train_2, y_train_1, y_train_2 = train_test_split(
            x_train,
            y_train.astype(dtype = 'float64'),
            random_state = 50,
            test_size    = 0.2,
            stratify     = y_train)
#0.4101

In [279]:
# picking a model name
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

weights = np.tile(weights, int(np.ceil(len(x_train_1) / len(weights))))[:len(x_train_1)]
# INSTANTIATING a model object - CHANGE THIS AS NEEDED
model = sklearn.linear_model.LogisticRegression(max_iter=1000, C=0.1, solver='saga', class_weight='balanced', multi_class='ovr', tol=1e-15)


# FITTING to the training data
model_fit = model.fit(x_train_1, y_train_1, sample_weight=weights)


# PREDICTING on the response variable
model_train_pred = model_fit.predict(x_train_1)
model_valid_pred = model_fit.predict(x_train_2)


# SCORING the results (accuracy)
model_train_score = model.score(x_train_1, y_train_1).round(6) # training accuracy
model_valid_score = model.score(x_train_2, y_train_2).round(6) # validation accuracy

# SCORING the results (auc)
model_train_auc = roc_auc_score(y_true  = y_train_1,
                                y_score = model_train_pred).round(decimals = 4)

model_valid_auc = roc_auc_score(y_true  = y_train_2,
                                y_score = model_valid_pred).round(decimals = 4)

# displaying results
print('Training Accuracy:  ', model_train_score)
print('Validation Accuracy:', model_valid_score)
print('Training AUC:       ', model_train_auc)
print('Validation AUC:     ', model_valid_auc)

Training Accuracy:   0.787029
Validation Accuracy: 0.80046
Training AUC:        0.7864
Validation AUC:      0.7999




In [280]:
# setting x_test
x_test  = titanic_df[ explanatory_vars ][ titanic_df['set'] == 'Testing' ]

# PREDICTING on new data
model_pred = model.predict(x_test)

# checking results
print(model_pred)

[1. 0. 1. ... 1. 1. 1.]


In [281]:
# saving predictions with their respective Ids from the test set
predictions = pd.DataFrame(data = { 'PassengerId' : titanic_test['PassengerId'],
                                    'Transported' : model_pred.astype(bool)               } )

# checking the results
predictions.head(n = 5)

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [282]:
predictions.to_csv(path_or_buf = '311512048,311512062_Sample.csv',
                   index = False)