In [6]:
"""
This model aims to provide accurate results for prediction on survivors with the titanic dataset on kaggle
Currently holds an accuracy of 78% with test data

"""

#        Importing pandas to explore dataset
import pandas as pd

#        Loading dataset
data = pd.read_csv("train.csv")

# Loading test data for later use
test_data = pd.read_csv("test.csv")
X_test = test_data[[ "Pclass", "Name", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"]]

#        Showing dataset head
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
#        Showing data set columns
data.columns
#        Showing empty values within columns
#        Found 177 ages, 687 Cabin and 2 Embarked values missing
data.isna().sum()

#getting dummies for Sex,Embarked and make them readable for the classifiers
data['Sex'] = pd.get_dummies(data['Sex'])
data['Embarked'] = pd.get_dummies(data['Embarked'])

#same for test_data
X_test['Sex'] = pd.get_dummies(test_data['Sex'])
X_test['Embarked'] = pd.get_dummies(test_data['Embarked'])

#setting cols that we can use vs ones we can't use
num_cols = ["Pclass", "Age", "SibSp", "Parch", "Fare", "Sex","Embarked"]
cat_cols = ["Ticket", "Cabin"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['Sex'] = pd.get_dummies(test_data['Sex'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['Embarked'] = pd.get_dummies(test_data['Embarked'])


In [8]:
#Assigning data to X,y
X = data[[ "Pclass", "Name", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"]]
y = data["Survived"]

#transforming data
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy='most_frequent')
X  = imp.fit_transform(X[num_cols])

#X_test too
X_test = imp.fit_transform(X_test[num_cols])


#Split data
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size= 0.8, test_size=0.2, 
                                                      random_state=0)

In [9]:
#Import mean squared error, the metric we're using to calculate error on the classifiers
from sklearn.metrics import mean_squared_error

In [76]:
#importing classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

#this function fits each one of the classifiers we're using, in order to get its MAE, the lower, the better. 
def get_results():
    lr_def  = LogisticRegression(max_iter=1000)
    knn_def = KNeighborsClassifier()
    clf_def = GaussianNB()
    xgb_def = xgb.XGBClassifier(n_estimators=45,max_depth=7,random_state=0,tree_method="exact")
    svc_def = SVC(probability=True)
    rfc_def = RandomForestClassifier(max_depth=11, random_state=0)
    
    classifiers = [lr_def,knn_def, clf_def, xgb_def, svc_def, rfc_def]
    for i in classifiers:
        i.fit(X_train,y_train)
        i.preds = i.predict(X_valid)
        i.MSE = mean_squared_error(y_valid, i.preds)
        print(str(i) + " MSE"+ ": " + str(i.MSE))
        
get_results()
    

LogisticRegression(max_iter=1000) MSE: 0.18994413407821228
KNeighborsClassifier() MSE: 0.2681564245810056
GaussianNB() MSE: 0.2122905027932961
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=7, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=45, n_jobs=12,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None) MSE: 0.12290502793296089
SVC(probability=True) MSE: 0.27932960893854747




RandomForestClassifier(max_depth=11, random_state=0) MSE: 0.13966480446927373


In [16]:
#From this output we can conclude that the best model to use is XGBClassifier
#So we're gonna fit it with the full data. 

xgb_final = xgb.XGBClassifier(max_depth=7,random_state=0)
xgb_final.fit(X,y)
xgb_final_preds = xgb_final.predict(X_test)


#To use only when decided on an algorithm to use, then submit preds to kaggle. 
final_data = {'PassengerId': test_data["PassengerId"], 'Survived': xgb_final_preds}
final_data = pd.DataFrame(data=final_data)
final_data.to_csv('name_submission.csv', index=False)


"""
That should do it for now, this model grants you a place within top 15% on kaggle titanic competition. 
To improve  model accuracy, we have to work on the data that was left out. Name, Ticket and Cabin were left out. 
Next task is to make this data useful. 
More to come. 

"""



