In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

In [58]:
df = pd.read_csv("train.csv") # labels
dft = pd.read_csv('test.csv') # data file w/o survived feature / labels

On April 15, 1912, the largest passenger liner ever made collided with an iceberg during her
maiden voyage. When the Titanic sank it killed 1502 out of 2224 passengers and crew. 

The titanic.csv file contains data for some of the real Titanic passengers. 
Each row represents one person. The columns describe different attributes about the person.


In [59]:
#  Features of Data Set: Class of the room, Name, Sex, Age, number of siblings/spouses, 
# number of parents/children, ticket No, passenger fare, cabin No and port of embarkation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [60]:
dft.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [61]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [62]:
df.Embarked.value_counts() # Southampton, Cherbourg, and Queenstown.

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [65]:
# some data cleaning and Feature Engineering is required
# Convert categorical variable to numeric


df["Gender"]=np.where(df["Sex"]=="male",0,1)

df["Embarked_cleaned"]=np.where(df["Embarked"]=="S",0,
                                  np.where(df["Embarked"]=="C",1,
                                           np.where(df["Embarked"]=="Q",2,3)
                                          )
                                 )

In [66]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Gender,Embarked_cleaned
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,1,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,0,0
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,1,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,1,0
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,0,1


In [67]:
# drop irrelevant columns -  Name, Cabin, Sex, Embarked, Ticket
df= df.drop(columns=[ 'Name', 'Cabin','Sex','Embarked','Ticket'])

In [69]:
df.Embarked_cleaned.value_counts()

0    644
1    168
2     77
3      2
Name: Embarked_cleaned, dtype: int64

In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PassengerId       891 non-null    int64  
 1   Survived          891 non-null    int64  
 2   Pclass            891 non-null    int64  
 3   Age               714 non-null    float64
 4   SibSp             891 non-null    int64  
 5   Parch             891 non-null    int64  
 6   Fare              891 non-null    float64
 7   Gender            891 non-null    int64  
 8   Embarked_cleaned  891 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 62.8 KB


In [71]:
# Cleaning dataset of NaN. how='any' - if any value is null (all)
df=df.dropna(axis=0, how='any')

In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PassengerId       714 non-null    int64  
 1   Survived          714 non-null    int64  
 2   Pclass            714 non-null    int64  
 3   Age               714 non-null    float64
 4   SibSp             714 non-null    int64  
 5   Parch             714 non-null    int64  
 6   Fare              714 non-null    float64
 7   Gender            714 non-null    int64  
 8   Embarked_cleaned  714 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 55.8 KB


In [73]:
y=df.Survived

In [74]:
X=df.drop(columns=['Survived'])

In [75]:
# Split dataset in training and test datasets
X_train, X_test,y_train,y_test = train_test_split(X,y, test_size=0.5,random_state=7)

In [76]:
x=int(time.time())

In [78]:
time.time()

1722578077.44282

In [79]:
time()

TypeError: 'module' object is not callable

In [80]:
X_test.shape

(357, 8)

In [81]:
# Instantiate the classifier
gnb = GaussianNB()
gnb.fit(X_train,y_train)


In [82]:
y_pred = gnb.predict(X_test)

In [83]:
y_pred


array([0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,

In [84]:
y_pred.sum()

129

In [85]:
# Print results
print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
      .format(
          X_test.shape[0],
          (y_test != y_pred).sum(),
          100*(1-(y_test != y_pred).sum()/X_test.shape[0])
))

Number of mislabeled points out of a total 357 points : 83, performance 76.75%


In [86]:
(357-83)/357

0.7675070028011205

In [87]:
gnb.score(X_test,y_test)

0.7675070028011205

In [88]:
gnb.score(X_train,y_train)

0.8067226890756303

In [89]:
gnb.score(X,y)

0.7871148459383753

In [90]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
       'Gender', 'Embarked_cleaned'],
      dtype='object')

In [91]:
# use only 1 Feature
#X=df.drop(columns=['Survived','Pclass','Gender','Age','SibSp','Parch','Embark_cleaned'])
X=df.drop(columns=['PassengerId','Survived','Gender','Age','SibSp','Parch','Fare','Embarked_cleaned'])

In [92]:
X.shape

(714, 1)

In [93]:
X

Unnamed: 0,Pclass
0,3
1,1
2,3
3,1
4,3
...,...
885,3
886,2
887,1
889,1


In [94]:
X_train, X_test,y_train,y_test = train_test_split(X,y, test_size=0.5, random_state=int(time.time()))

In [95]:
X_test.shape

(357, 1)

In [96]:
X_train.shape

(357, 1)

In [97]:
gnb1 = GaussianNB()
gnb1.fit(X_train,y_train)
y_pred = gnb1.predict(X_test)
print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
      .format(
          X_test.shape[0],
          (y_test != y_pred).sum(),
          100*(1-(y_test != y_pred).sum()/X_test.shape[0])
))

Number of mislabeled points out of a total 357 points : 125, performance 64.99%


In [98]:
gnb1.score(X_test,y_test)

0.6498599439775911

In [99]:
gnb1.score(X_train,y_train)

0.7002801120448179

In [100]:
gnb1.score(X,y)

0.6750700280112045

In [101]:
X.head()


Unnamed: 0,Pclass
0,3
1,1
2,3
3,1
4,3


In [102]:
X.iloc[[0]]

Unnamed: 0,Pclass
0,3



Many Scikit-learn models, such as Tree-based methods, ensemble methods, kNN, and Naive Bayes have a predict_proba method; which can be used to infer the class probabilities (i.e. the probability that a particular data point falls into the underlying classes).

In [103]:
gnb1.predict_proba(X.iloc[[0]])

array([[0.77293403, 0.22706597]])

In [104]:
gnb1.predict_proba(X.loc[[3]])

array([[0.16401048, 0.83598952]])

# Pro and cons of Naive Bayes Classifiers
Pros:

Computationally fast

Simple to implement

Works well with small datasets

Works well with high dimensions

Perform well even if the Naive Assumption is not perfectly met. In many cases, the approximation is enough to build a good classifier.

Cons:

Require to remove correlated features because they are voted twice in the model and it can lead to over inflating importance.

If a categorical variable has a category in test data set which was not observed in training data set, then the model will assign a zero probability. It will not be able to make a prediction. 

In [105]:
# some data cleaning and Feature Engineering is required in test.csv also
# Convert categorical variable to numeric


dft["Gender"]=np.where(dft["Sex"]=="male",0,1)

dft["Embarked_cleaned"]=np.where(dft["Embarked"]=="S",0,
                                  np.where(dft["Embarked"]=="C",1,
                                           np.where(dft["Embarked"]=="Q",2,3)
                                          )
                                 )

In [106]:
dft

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Gender,Embarked_cleaned
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0,2
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,1,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0,2
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,0,0
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,1,1
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,0,0
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,0,0


In [107]:
# drop irrelevant columns -  Name, Cabin, Sex, Embarked, Ticket from test dataframe
dft= dft.drop(columns=[ 'Name', 'Cabin','Sex','Embarked','Ticket'])

In [108]:
dft.Embarked_cleaned.value_counts()

0    270
1    102
2     46
Name: Embarked_cleaned, dtype: int64

In [109]:
dft.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PassengerId       418 non-null    int64  
 1   Pclass            418 non-null    int64  
 2   Age               332 non-null    float64
 3   SibSp             418 non-null    int64  
 4   Parch             418 non-null    int64  
 5   Fare              417 non-null    float64
 6   Gender            418 non-null    int64  
 7   Embarked_cleaned  418 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 26.2 KB


In [110]:
# Cleaning dataset of NaN in test data
dft=dft.dropna(axis=0, how='any')

In [111]:
dft.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 331 entries, 0 to 415
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PassengerId       331 non-null    int64  
 1   Pclass            331 non-null    int64  
 2   Age               331 non-null    float64
 3   SibSp             331 non-null    int64  
 4   Parch             331 non-null    int64  
 5   Fare              331 non-null    float64
 6   Gender            331 non-null    int64  
 7   Embarked_cleaned  331 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 23.3 KB


In [112]:
predictions = gnb.predict(dft)

In [113]:
predictions

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,

In [114]:
def writeCSV(predictions):
    outputDF = pd.DataFrame(np.column_stack([dft['PassengerId'], predictions]), columns=['PassengerId', 'Survived'])
    outputDF.to_csv('predictions.csv', index=False)

In [115]:
writeCSV(predictions)

In [116]:
predDF = pd.read_csv('predictions.csv')

In [117]:
predDF

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
326,1301,1
327,1303,1
328,1304,1
329,1306,1
