In [20]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [21]:
# Load the three competition CSV files.
train = pd.read_csv('../input/titanic/train.csv')
test = pd.read_csv('../input/titanic/test.csv')
gender_submission = pd.read_csv('../input/titanic/gender_submission.csv')

# Attach the labels to `test` by PassengerId instead of by row position, so a
# difference in row order between the two files cannot silently mislabel rows.
# NOTE(review): gender_submission.csv appears to be Kaggle's example submission
# rather than ground-truth labels - confirm before reading the scores computed
# against this column as real test accuracy.
survived_by_id = gender_submission.set_index('PassengerId')['Survived']
test['Survived'] = test['PassengerId'].map(survived_by_id)
assert test['Survived'].notna().all(), "some test passengers have no label in gender_submission"

In [22]:
#pick the libraries I want to use

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier 


from sklearn import metrics


In [23]:
#preprocessing
def fillna(df):
    """Fill the missing values of the Age, Embarked, Cabin and Fare columns.

    The frame is modified in place and also returned.

    Replacement values:
        Age      -> median of the non-missing ages in ``df``
        Embarked -> 'N'
        Cabin    -> 'no class'
        Fare     -> 0.0
    """
    replacements = {
        'Age': df['Age'].median(),
        'Embarked': 'N',
        'Cabin': 'no class',
        'Fare': 0.0,
    }
    for column, value in replacements.items():
        df[column] = df[column].fillna(value)
    return df
    
#label encoder: integer-encode the categorical columns (Sex, Embarked, Cabin_Class)

def labels_encoder(df, columns=('Sex','Embarked','Cabin_Class'), encoders=None): #in place encoding
    """Replace each categorical column by integer codes (0..n-1, labels sorted).

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``columns``; missing values
        must already be filled.
    columns : sequence of str
        Columns to encode.
    encoders : dict or None
        Optional ``{column: ordered list of labels}`` mapping shared between
        calls. A column missing from the dict is learned from ``df`` and stored
        in it; a column already present is reused. Passing the same dict for the
        training and the test frame gives a label the same code in both. With
        ``None`` every call learns its own codes from ``df`` alone, so two frames
        with different label sets can disagree on what a code means.

    Raises
    ------
    ValueError
        If a column holds a missing value or a label absent from the reused
        label list.
    """
    for col in columns:
        if encoders is not None and col in encoders:
            classes = list(encoders[col])
        else:
            # sorted unique labels -> code is the label's position in this list
            classes = sorted(df[col].dropna().unique())
            if encoders is not None:
                encoders[col] = classes
        codes = pd.Categorical(df[col], categories=classes).codes
        unknown = codes < 0  # Categorical marks values outside `categories` with -1
        if unknown.any():
            unseen = sorted(set(df[col][unknown].astype(str)))
            raise ValueError(f"column {col!r} contains missing or unseen labels: {unseen}")
        df[col] = codes.astype('int64')
    return df

#create new feature for classification - cluster
#cluster will be based on location on board
def add_clustering(df, n_clusters=2, random_state=0):
    """Add a 'Cluster' column holding a KMeans cluster id for every row.

    KMeans is fitted on the Pclass, SibSp, Parch and Cabin_Class columns of
    ``df``. The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric Pclass, SibSp, Parch and Cabin_Class columns.
    n_clusters : int
        Number of clusters (default 2).
    random_state : int or None
        Seed for the KMeans initialisation; fixed by default so re-running the
        notebook reproduces the same clusters.

    Notes
    -----
    KMeans numbers its clusters arbitrarily, so the ids are renumbered here by
    ascending centroid (first feature first). This makes the meaning of an id
    stable across runs. The model is still fitted on whatever frame is passed
    in, so ids from two different frames are only comparable to the extent
    that the two frames cluster alike.
    """
    features_list = ['Pclass','SibSp','Parch','Cabin_Class']
    # n_init is pinned so the result does not depend on the library default
    k_est = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
    k_est.fit(df[features_list])
    # lexsort uses the LAST key as primary, hence the reversed feature rows
    order = np.lexsort(k_est.cluster_centers_.T[::-1])
    canonical_id = np.empty(len(order), dtype=int)
    canonical_id[order] = np.arange(len(order))
    df['Cluster'] = canonical_id[k_est.labels_]
    return df
    
#get final df for classification model
def preprocessing(df):
    """Turn a raw passenger frame into the numeric frame used by the models.

    Steps: derive 'Cabin_Class' from 'Cabin', fill missing values (``fillna``),
    integer-encode the categorical columns (``labels_encoder``), keep only the
    modelling columns and add the 'Cluster' feature (``add_clustering``).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with at least PassengerId, Survived, Pclass, Age, SibSp,
        Parch, Fare, Sex, Cabin and Embarked columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched.

    Notes
    -----
    This function's name rebinds the module-level ``preprocessing`` name that
    the imports cell gives to ``sklearn.preprocessing``.
    """
    # work on a copy so the caller's raw frame is not modified as a side effect
    df = df.copy()
    #Create new feature - cabin class (first character of the cabin string)
    # Missing cabins are filled first so they get the explicit 'no class' label
    # instead of the 'n' that str(nan)[0] would produce.
    cabin = df['Cabin'].fillna('no class')
    first_letter = cabin.astype(str).str[0]
    df['Cabin_Class'] = first_letter.where(cabin != 'no class', 'no class')
    #activate inner functions:
    df = fillna(df)
    df = labels_encoder(df)
    #get final df
    features = ['PassengerId','Survived','Pclass','Age','SibSp','Parch','Fare','Sex','Cabin_Class','Embarked']
    # .copy() so adding the 'Cluster' column below does not write into a slice
    df = df[features].copy()
    df = add_clustering(df)
    return df

In [24]:
# Replace the raw training frame with its preprocessed version and preview it.
# NOTE(review): `train` is rebound here, so re-running this cell passes an
# already-processed frame (which has no 'Cabin' column) back into preprocessing.
train = preprocessing(train)
train.head(3)


In [25]:
test.head(3)
# R.C. - `test` should still be unprocessed here (unlike `train`, which has already been preprocessed)

In [26]:
#check ideal neighbors number for model
# Fixed seed so the hold-out split, and therefore the curves below, are
# the same on every run.
RANDOM_STATE = 42

k_values = []
accuracy_values = []
f1_values = []
#first, test train split (20% of the preprocessed training frame is held out)
train_df, test_df = train_test_split(train, test_size=0.2, random_state=RANDOM_STATE)

#then determine max number of neighbors
# max_neighbors is 1% of the total number of rows in the set, but at least 1
# so that the sweep below never ends up empty on a small frame
max_neighbors = max(1, int(train.shape[0]*0.01))
features = ['Pclass','Age','SibSp','Parch','Fare','Sex','Cabin_Class','Embarked','Cluster']

# the hold-out labels do not change between iterations
actual_vals_list = test_df["Survived"].values

for k in range(1, max_neighbors + 1, 2):  # odd k only (step of 2)
    #model creation
    knn_classifier = KNeighborsClassifier(n_neighbors=k)
    knn_classifier.fit(train_df[features], train_df["Survived"].values)
    #test
    predict_vals_list = knn_classifier.predict(test_df[features])

    accuracy = metrics.accuracy_score(actual_vals_list, predict_vals_list)
    f1 = metrics.f1_score(actual_vals_list, predict_vals_list, average='macro') #unweighted mean over the classes

    #creating series for plots:
    k_values.append(k)
    accuracy_values.append(accuracy)
    f1_values.append(f1)

#and plotting the scores against k:
fig, ax = plt.subplots(figsize=(10,5))
ax.plot(k_values, accuracy_values, label='accuracy')
ax.plot(k_values, f1_values, label='F-measure')

ax.set_xticks(k_values)
ax.set_xlabel("k - value")
ax.set_ylabel("accurecy test parameter value")
ax.set_title("prediction models accuracy values")

ax.legend();  # trailing ';' keeps the Legend repr out of the cell output


In [27]:
#create KNN model
def KNN_MODEL_creation(df, n_neighbors=3):
    """Fit a k-nearest-neighbours classifier that predicts 'Survived'.

    Parameters
    ----------
    df : pandas.DataFrame
        Preprocessed frame containing the feature columns listed below and
        the 'Survived' target.
    n_neighbors : int
        Number of neighbours used for voting (default 3).

    Returns
    -------
    KNeighborsClassifier
        The fitted classifier.
    """
    features = ['Pclass','Age','SibSp','Parch','Fare','Sex','Cabin_Class','Embarked','Cluster']
    knn_classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn_classifier.fit(df[features], df["Survived"].values)
    return knn_classifier
knn_classifier = KNN_MODEL_creation(train)

In [28]:
#preprocess a raw df and attach the trained model's predictions to it
def KNN_predict(df,model):
    """Preprocess ``df`` and add the model's predictions as column 'KNN-pred'.

    ``df`` must be a raw (not yet preprocessed) frame: it is passed through
    ``preprocessing`` first. The preprocessed frame, extended with the
    'KNN-pred' column, is returned.
    """
    prepared = preprocessing(df)
    model_inputs = prepared[
        ['Pclass','Age','SibSp','Parch','Fare','Sex','Cabin_Class','Embarked','Cluster']
    ]
    prepared['KNN-pred'] = model.predict(model_inputs)
    return prepared

In [29]:
test.head()
# R.C. - is `test` still unprocessed at this point? (KNN_predict has been defined but not yet called on it)

In [30]:
# Preprocess `test` and add the 'KNN-pred' column; `test` is rebound to the
# processed frame, so this cell expects the raw frame and should run only once.
test = KNN_predict(test,knn_classifier)
test.head()

In [31]:
def evaluate_model(df,col):#runs on test file with target column + prediction column. insert pred column to function
    """Print accuracy and macro F1 for one prediction column and plot its confusion matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the true labels in 'Survived' and the predictions in ``col``.
    col : str
        Name of the prediction column to evaluate.

    Returns
    -------
    None
        The scores are printed and the confusion matrix is drawn as a heatmap.
    """
    # read from the frame that was passed in, not from the global `test`
    actual_vals = df["Survived"].values
    predict_vals = df[col].values
    accuracy = metrics.accuracy_score(actual_vals,predict_vals)
    f1 = metrics.f1_score(actual_vals,predict_vals, average='macro') #unweighted mean over the classes
    print(f'accuracy score: {accuracy}')
    print(f'f_1 score: {f1}')
    # One sorted label list drives both the matrix and its axis names, so rows
    # and columns cannot be mislabelled and a label that only occurs in the
    # predictions still gets a row/column.
    labels = sorted(set(actual_vals) | set(predict_vals))
    cm = metrics.confusion_matrix(actual_vals,predict_vals, labels=labels)
    df_cm = pd.DataFrame(cm, columns=labels, index=labels)
    df_cm.index.name = 'Actual'
    df_cm.columns.name = 'Predicted'
    plt.figure(figsize = (10,7))
    sns.set(font_scale=1.4)#for label size
    # fmt="d" prints the counts as integers instead of scientific notation
    sns.heatmap(df_cm, cmap="Blues", annot=True, fmt="d", annot_kws={"size": 8})# font size
    

In [32]:
# Score the KNN predictions against the 'Survived' column of `test`
# (that column was copied from gender_submission in the data-loading cell).
evaluate_model(test,'KNN-pred')

In [35]:
#create decision tree model
def decision_tree_create(df, random_state=0):
    """Fit a decision tree that predicts 'Survived'.

    Parameters
    ----------
    df : pandas.DataFrame
        Preprocessed frame containing the feature columns listed below and
        the 'Survived' target.
    random_state : int or None
        Seed passed to the classifier; fixed by default so re-running the
        notebook rebuilds the same tree.

    Returns
    -------
    DecisionTreeClassifier
        The fitted classifier.
    """
    tree_clf = DecisionTreeClassifier(random_state=random_state)
    features_list = ['Pclass','Age','SibSp','Parch','Fare','Sex','Cabin_Class','Embarked','Cluster']
    tree_clf.fit(df[features_list], df["Survived"].values)
    return tree_clf
    

In [36]:
# Fit the decision tree on the preprocessed training frame.
tree_clf = decision_tree_create(train)

In [53]:
def print_tree(tree_clf):
    """Return a graphviz ``Source`` object describing the fitted tree.

    The tree is exported with the model's nine feature names, the class names
    'died' / 'survived', and filled, rounded nodes.
    """
    from graphviz import Source
    from sklearn import tree
    column_names = ['Pclass','Age','SibSp','Parch','Fare','Sex','Cabin_Class','Embarked','Cluster']
    # out_file=None makes export_graphviz return the DOT text instead of writing a file
    dot_text = tree.export_graphviz(
        tree_clf,
        out_file=None,
        feature_names=column_names,
        class_names=['died','survived'],
        filled=True,
        rounded=True,
    )
    return Source(dot_text)
    

In [54]:
# Display the fitted tree; the returned graphviz Source is the cell's output.
print_tree(tree_clf)

In [43]:
#attach the trained tree's predictions to an already preprocessed df
def decision_tree_predict(df,model):
    """Add the model's predictions to ``df`` as column 'tree-pred'.

    ``df`` must already be preprocessed (no preprocessing is done here; in this
    notebook the test frame was preprocessed during the KNN step). The frame
    is modified in place and also returned.
    """
    feature_columns = ['Pclass','Age','SibSp','Parch','Fare','Sex','Cabin_Class','Embarked','Cluster']
    df['tree-pred'] = model.predict(df[feature_columns])
    return df

In [44]:
# Add the 'tree-pred' column; relies on `test` having been preprocessed by the
# earlier KNN_predict call, because decision_tree_predict does not preprocess.
test = decision_tree_predict(test,tree_clf)
test.head()

In [45]:
# Score the decision-tree predictions against the 'Survived' column of `test`.
evaluate_model(test,'tree-pred')