In [10]:
# linear algebra
import numpy as np 

# data processing
import pandas as pd 

# data visualization
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style

# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB

In [11]:
test_df = pd.read_csv("test.csv")
train_df = pd.read_csv("train.csv")

In [12]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [13]:
train_df.head(8)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S


In [14]:
total = train_df.isnull().sum().sort_values(ascending=False)
percent_1 = train_df.isnull().sum()/train_df.isnull().count()*100
percent_2 = (round(percent_1, 1)).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data.head(5)

Unnamed: 0,Total,%
Cabin,687,77.1
Age,177,19.9
Embarked,2,0.2
Fare,0,0.0
Ticket,0,0.0


In [15]:
#drop passengerId

train_df = train_df.drop(['PassengerId'], axis=1)

In [16]:
import re
deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}
data = [train_df, test_df]

for dataset in data:
    dataset['Cabin'] = dataset['Cabin'].fillna("U0")
    dataset['Deck'] = dataset['Cabin'].map(lambda x: re.compile("([a-zA-Z]+)").search(x).group())
    dataset['Deck'] = dataset['Deck'].map(deck)
    dataset['Deck'] = dataset['Deck'].fillna(0)
    dataset['Deck'] = dataset['Deck'].astype(int)
# we can now drop the cabin feature
train_df = train_df.drop(['Cabin'], axis=1)
test_df = test_df.drop(['Cabin'], axis=1)

In [17]:
data = [train_df, test_df]

for dataset in data:
    mean = train_df["Age"].mean()
    std = test_df["Age"].std()
    is_null = dataset["Age"].isnull().sum()
    # compute random numbers between the mean, std and is_null
    rand_age = np.random.randint(mean - std, mean + std, size = is_null)
    # fill NaN values in Age column with random values generated
    age_slice = dataset["Age"].copy()
    age_slice[np.isnan(age_slice)] = rand_age
    dataset["Age"] = age_slice
    dataset["Age"] = train_df["Age"].astype(int)
train_df["Age"].isnull().sum()

0

In [18]:
common_value = 'S'
data = [train_df, test_df]

for dataset in data:
    dataset['Embarked'] = dataset['Embarked'].fillna(common_value)

In [19]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         891 non-null int32
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Embarked    891 non-null object
Deck        891 non-null int32
dtypes: float64(1), int32(2), int64(4), object(4)
memory usage: 69.7+ KB


In [20]:
data = [train_df, test_df]

for dataset in data:
    dataset['Fare'] = dataset['Fare'].fillna(0)
    dataset['Fare'] = dataset['Fare'].astype(int)

In [21]:
data = [train_df, test_df]
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

for dataset in data:
    # extract titles
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
    # replace titles with a more common title or as Rare
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr',\
                                            'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    # convert titles into numbers
    dataset['Title'] = dataset['Title'].map(titles)
    # filling NaN with 0, to get safe
    dataset['Title'] = dataset['Title'].fillna(0)
train_df = train_df.drop(['Name'], axis=1)
test_df = test_df.drop(['Name'], axis=1)

In [22]:
genders = {"male": 0, "female": 1}
data = [train_df, test_df]

for dataset in data:
    dataset['Sex'] = dataset['Sex'].map(genders)

In [23]:
train_df['Ticket'].describe()

count        891
unique       681
top       347082
freq           7
Name: Ticket, dtype: object

In [24]:
train_df = train_df.drop(['Ticket'], axis=1)
test_df = test_df.drop(['Ticket'], axis=1)

In [25]:
ports = {"S": 0, "C": 1, "Q": 2}
data = [train_df, test_df]

for dataset in data:
    dataset['Embarked'] = dataset['Embarked'].map(ports)

In [26]:
data = [train_df, test_df]
for dataset in data:
    dataset['Age'] = dataset['Age'].astype(int)
    dataset.loc[ dataset['Age'] <= 11, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 11) & (dataset['Age'] <= 18), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 18) & (dataset['Age'] <= 22), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 22) & (dataset['Age'] <= 27), 'Age'] = 3
    dataset.loc[(dataset['Age'] > 27) & (dataset['Age'] <= 33), 'Age'] = 4
    dataset.loc[(dataset['Age'] > 33) & (dataset['Age'] <= 40), 'Age'] = 5
    dataset.loc[(dataset['Age'] > 40) & (dataset['Age'] <= 66), 'Age'] = 6
    dataset.loc[ dataset['Age'] > 66, 'Age'] = 6

# let's see how it's distributed train_df['Age'].value_counts()

In [27]:
train_df.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title
0,0,3,0,2,1,0,7,0,8,1
1,1,1,1,5,1,0,71,1,3,3
2,1,3,1,3,0,0,7,0,8,2
3,1,1,1,5,1,0,53,0,3,3
4,0,3,0,5,0,0,8,0,8,1
5,0,3,0,5,0,0,8,2,8,1
6,0,1,0,6,0,0,51,0,5,1
7,0,3,0,0,3,1,21,0,8,4
8,1,3,1,3,0,2,11,0,8,3
9,1,2,1,1,1,0,30,1,8,3


In [28]:
data = [train_df, test_df]

for dataset in data:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[(dataset['Fare'] > 31) & (dataset['Fare'] <= 99), 'Fare']   = 3
    dataset.loc[(dataset['Fare'] > 99) & (dataset['Fare'] <= 250), 'Fare']   = 4
    dataset.loc[ dataset['Fare'] > 250, 'Fare'] = 5
    dataset['Fare'] = dataset['Fare'].astype(int)

In [29]:
data = [train_df, test_df]
for dataset in data:
    dataset['Age_Class']= dataset['Age']* dataset['Pclass']

In [31]:
X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test  = test_df.drop("PassengerId", axis=1).copy()

In [43]:
train_X, test_X, train_y, test_y = train_test_split(
    X_train, Y_train, test_size=0.20,random_state=0)

In [42]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
#Naive Bayes

In [44]:
gaussian2 = GaussianNB() 
gaussian2.fit(train_X, train_y)
predictions = gaussian2.predict(test_X)

accuracy2 = accuracy_score(test_y, predictions)
print(accuracy2)

0.7932960893854749


In [None]:
#Decision Tree

In [45]:
dt2 = DecisionTreeClassifier() 
dt2.fit(train_X, train_y)
predictions2 = dt2.predict(test_X)

accuracy3 = accuracy_score(test_y, predictions2)
print(accuracy3)

0.7988826815642458


In [None]:
#try for holdout

In [46]:
gaussian = GaussianNB() 
gaussian.fit(X_train, Y_train)  
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)

In [47]:
decision_tree = DecisionTreeClassifier() 
decision_tree.fit(X_train, Y_train)  
Y_pred = decision_tree.predict(X_test)  
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)

In [48]:
results = pd.DataFrame({
    'Model': [ 'Naive Bayes', 
              'Decision Tree'],
    'Score': [ acc_gaussian,acc_decision_tree]})
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df.head(9)

Unnamed: 0_level_0,Model
Score,Unnamed: 1_level_1
92.82,Decision Tree
79.24,Naive Bayes


In [None]:
#cross validation - decision tree (Gini index)

In [49]:
from sklearn.model_selection import cross_val_score
dtc = DecisionTreeClassifier()
scores = cross_val_score(dtc, X_train, Y_train, cv=10, scoring = "accuracy")
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())

Scores: [0.78888889 0.81111111 0.74157303 0.79775281 0.82022472 0.7752809
 0.76404494 0.80898876 0.82022472 0.82954545]
Mean: 0.7957635342185905
Standard Deviation: 0.02676155400693474


In [50]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

predictions_dtc = cross_val_predict(dtc, X_train, Y_train, cv=3)
confusion_matrix(Y_train, predictions_dtc)

array([[485,  64],
       [115, 227]], dtype=int64)

In [51]:
from sklearn.metrics import precision_score, recall_score

print("Precision:", precision_score(Y_train, predictions_dtc))
print("Recall:",recall_score(Y_train, predictions_dtc))

Precision: 0.7800687285223368
Recall: 0.6637426900584795


In [52]:
from sklearn.metrics import f1_score
f1_score(Y_train, predictions_dtc)

0.717219589257504

In [84]:
#cross validation - decision tree (entropy)

In [85]:
dtc_entropy = DecisionTreeClassifier()
scores_entropy = cross_val_score(dtc_entropy, X_train, Y_train, cv=10, scoring = "accuracy")
print("Scores:", scores_entropy)
print("Mean:", scores_entropy.mean())
print("Standard Deviation:", scores_entropy.std())

predictions_dtc_entropy = cross_val_predict(dtc_entropy, X_train, Y_train, cv=3)
confusion_matrix(Y_train, predictions_dtc_entropy)
print("Precision:", precision_score(Y_train, predictions_dtc_entropy))
print("Recall:",recall_score(Y_train, predictions_dtc_entropy))
print("F1 Score: ",f1_score(Y_train, predictions_dtc_entropy))

Scores: [0.76666667 0.82222222 0.71910112 0.79775281 0.82022472 0.78651685
 0.76404494 0.80898876 0.80898876 0.81818182]
Mean: 0.7912688684598798
Standard Deviation: 0.03125390309839749
Precision: 0.7889273356401384
Recall: 0.6666666666666666
F1 Score:  0.722662440570523


In [None]:
#cross validation Naive bayes

In [53]:
from sklearn.model_selection import cross_val_score
nbc = GaussianNB()
scores1 = cross_val_score(nbc, X_train, Y_train, cv=10, scoring = "accuracy")
print("Scores:", scores1)
print("Mean:", scores1.mean())
print("Standard Deviation:", scores1.std())

Scores: [0.74444444 0.76666667 0.76404494 0.78651685 0.82022472 0.76404494
 0.79775281 0.79775281 0.83146067 0.79545455]
Mean: 0.7868363409374645
Standard Deviation: 0.025810101115194786


In [54]:

predictions_nbc = cross_val_predict(nbc, X_train, Y_train, cv=3)
confusion_matrix(Y_train, predictions_nbc)

array([[425, 124],
       [ 68, 274]], dtype=int64)

In [55]:
print("Precision:", precision_score(Y_train, predictions_nbc))
print("Recall:",recall_score(Y_train, predictions_nbc))

Precision: 0.6884422110552764
Recall: 0.8011695906432749


In [56]:
f1_score(Y_train, predictions_nbc)

0.7405405405405406

In [None]:
#get output files

In [57]:
gaussian = GaussianNB() 
gaussian.fit(X_train, Y_train)  
Y_pred_n = gaussian.predict(X_test)

In [59]:
holdout_ids = test_df["PassengerId"]
submission_df = {"PassengerId": holdout_ids,
                 "Survived": Y_pred_n}
submission = pd.DataFrame(submission_df)
submission.to_csv("submission_naive_bayes.csv",index=False)

In [82]:
decision = DecisionTreeClassifier(criterion = "entropy")
decision.fit(X_train, Y_train)  
Y_pred_d = decision.predict(X_test)

In [83]:
holdout_ids = test_df["PassengerId"]
submission_df = {"PassengerId": holdout_ids,
                 "Survived": Y_pred_d}
submission = pd.DataFrame(submission_df)
submission.to_csv("submission_decision_tree_entropy.csv",index=False)

In [None]:

#ID3 decision tree classifier

In [62]:
dt3 = DecisionTreeClassifier(criterion = "entropy") 
dt3.fit(train_X, train_y)
predictions3 = dt3.predict(test_X)

accuracy4 = accuracy_score(test_y, predictions3)
print(accuracy4)


0.8044692737430168


In [None]:
#dummy classifier cross validation

In [69]:
from sklearn.dummy import DummyClassifier

In [71]:
dummy = DummyClassifier(strategy = "most_frequent", random_state = 0) 
dummy.fit(train_X, train_y) 
predictionsdummy = dummy.predict(test_X)

accuracy5 = accuracy_score(test_y, predictionsdummy)
print(accuracy5)

0.6145251396648045


In [76]:

dummy1 = DummyClassifier(strategy = "most_frequent", random_state = 0) 
scores5 = cross_val_score(nbc, X_train, Y_train, cv=10, scoring = "accuracy")
print("Scores:", scores5)
print("Mean:", scores5.mean())
print("Standard Deviation:", scores5.std())

Scores: [0.74444444 0.76666667 0.76404494 0.78651685 0.82022472 0.76404494
 0.79775281 0.79775281 0.83146067 0.79545455]
Mean: 0.7868363409374645
Standard Deviation: 0.025810101115194786


In [77]:
predictions_dummy = cross_val_predict(dummy1, X_train, Y_train, cv=3)
confusion_matrix(Y_train, predictions_dummy)


array([[549,   0],
       [342,   0]], dtype=int64)

In [78]:
print("Precision:", precision_score(Y_train, predictions_dummy))
print("Recall:",recall_score(Y_train, predictions_dummy))

Precision: 0.0
Recall: 0.0


  'precision', 'predicted', average, warn_for)


In [79]:
f1_score(Y_train, predictions_dummy)

  'precision', 'predicted', average, warn_for)


0.0

In [80]:
dummyclassifier = DummyClassifier(strategy = "most_frequent", random_state = 0)
dummyclassifier.fit(X_train, Y_train)  
Y_pred_dummy = dummyclassifier.predict(X_test)

In [81]:
holdout_ids = test_df["PassengerId"]
submission_df = {"PassengerId": holdout_ids,
                 "Survived": Y_pred_dummy}
submission = pd.DataFrame(submission_df)
submission.to_csv("submission_rule.csv",index=False)