In [5]:
import numpy as np
import pandas as pd

In [6]:
csv_path = "C://Users/saurmisr/Downloads/titanic.csv"
data = pd.read_csv(csv_path)

In [7]:
data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [8]:
#Checking for missing data for all relevant features and the label
print(data['Pclass'].isna().sum())
print(data['Siblings/Spouses Aboard'].isna().sum())
print(data['Parents/Children Aboard'].isna().sum())
print(data['Age'].isna().sum())
print(data['Fare'].isna().sum())
print(data['Survived'].isna().sum())

0
0
0
0
0
0


### FEATURE SET 1 ASSUMPTIONS:
#### 1. We shall only consider the Passenger Class, Sex, Siblings/Spouses and Parents/Children columns as relevant indicators
#### 2. We shall convert all these to Boolean features. 

In [9]:
yTrain = data['Survived'].to_numpy()
PclassCol = data['Pclass'].apply(lambda X: 0 if X==1 else 1).to_numpy()
siblings_or_spousesCol = data['Siblings/Spouses Aboard'].apply(lambda X: 1 if X>0 else 0).to_numpy()
parents_or_childrenCol = data['Parents/Children Aboard'].apply(lambda X: 1 if X>0 else 0).to_numpy()
sexCol = data['Sex'].apply(lambda X: 1 if X=='male' else 0).to_numpy()
XTrainFeatureSet1 = np.column_stack((PclassCol,siblings_or_spousesCol,parents_or_childrenCol,sexCol))
(m,n) = XTrainFeatureSet1.shape
print(f'(m,n): {(m,n)}')

(m,n): (887, 4)


### FEATURE SET 2 ASSUMPTIONS:
#### 1. We shall only consider the Passenger Class, Sex,Age,Siblings/Spouses and Parents/Children columns as relevant indicators
#### 2. We shall convert all these to Boolean features. 

In [10]:
meanAge = data['Age'].mean()
medianAge = data['Age'].median()
print(f'Mean Age: {meanAge}')
print(f'Median Age: {medianAge}')
AgeCol = data['Age'].apply(lambda X: 1 if X<40 else 0)
XTrainFeatureSet2 = np.column_stack((PclassCol,siblings_or_spousesCol,parents_or_childrenCol,sexCol,AgeCol))
(m,n) = XTrainFeatureSet2.shape
print(f'(m,n): {(m,n)}')

Mean Age: 29.471443066516347
Median Age: 28.0
(m,n): (887, 5)


### FEATURE SET 3 ASSUMPTIONS:
#### 1. We shall only consider the Passenger Class, Sex,Age,Siblings/Spouses, Parents/Children and Fare columns as relevant indicators
#### 2. We shall convert all these to Boolean features. 

In [11]:
meanFare =data['Fare'].mean() 
medianFare = data["Fare"].median()
print(f'Mean fare: {meanFare}')
print(f'Median Fare: {medianFare}')
FareCol = data['Fare'].apply(lambda X: 1 if X<medianFare else 0)
XTrainFeatureSet3 = np.column_stack((PclassCol,siblings_or_spousesCol,parents_or_childrenCol,sexCol,AgeCol,FareCol))
(m,n) = XTrainFeatureSet3.shape
print(f'(m,n): {(m,n)}')

Mean fare: 32.30542018038328
Median Fare: 14.4542
(m,n): (887, 6)


In [12]:
print(f'Checking Passenger class column: {np.sum((PclassCol)==0)+np.sum((PclassCol)==1)}')
print(f'Checking siblings/spouses column: {np.sum((siblings_or_spousesCol)==0)+np.sum((siblings_or_spousesCol)==1)}')
print(f'Checking parents/children column: {np.sum((parents_or_childrenCol)==0)+np.sum((parents_or_childrenCol)==1)}')
print(f'Checking Age column: {np.sum((AgeCol)==0)+np.sum(AgeCol==1)}')
print(f'Checking Fare column: {np.sum((FareCol)==0)+np.sum((FareCol)==1)}')
print(f'Checking Sex column: {np.sum((sexCol)==0)+np.sum((sexCol)==1)}')


Checking Passenger class column: 887
Checking siblings/spouses column: 887
Checking parents/children column: 887
Checking Age column: 887
Checking Fare column: 887
Checking Sex column: 887


In [13]:
def pY(Y, C):
    '''Returns the MLE-estimated probability that Y is in class C 
    '''
    N=Y.size
    return np.sum(Y==C)/N

In [14]:
def tabulate_pX_given_Y(X,Y,beta_values):
    '''
    Returns the MAP-estimated conditional probabilities D = [P(Xi=1|Y=y)]
    D is a 2xn numpy array where D(y,i) = P(Xi=1|Y=y)    0<=y<=1 , 0<=i<n  
    '''     
    classes = np.unique(Y)
    nClasses = classes.size
    n = X.shape[1]
    Theta = np.zeros((nClasses,n))
    for y in range(nClasses):       
        row_ids_where_Y_equals_y = np.argwhere(Y==classes[y]) # Find the row numbers where Y=y 
        Ny = row_ids_where_Y_equals_y.shape[0] 
        for i in range(n):                        
            count_xi = 0
            for j in range(Ny):
                idx = row_ids_where_Y_equals_y[j]   #Convert row number to row index             
                if X[idx,i] == 1:            
                    count_xi +=1             
#             Theta[y,i] = (count_xi+beta_values[0]-1)/(Ny+sum(beta_values)-2) # MAP estimate
            Theta[y,i] = count_xi/Ny # ML estimate
    return Theta

In [15]:
D_feature_set1  =tabulate_pX_given_Y(XTrainFeatureSet1,yTrain,None)
D_feature_set2  =tabulate_pX_given_Y(XTrainFeatureSet2,yTrain,None)
D_feature_set3  =tabulate_pX_given_Y(XTrainFeatureSet3,yTrain,None)
print(D_feature_set1)
print(D_feature_set2)
print(D_feature_set3)


[[0.85321101 0.27706422 0.19082569 0.85137615]
 [0.60233918 0.38596491 0.31871345 0.31871345]]
[[0.85321101 0.27706422 0.19082569 0.85137615 0.77247706]
 [0.60233918 0.38596491 0.31871345 0.31871345 0.78947368]]
[[0.85321101 0.27706422 0.19082569 0.85137615 0.77247706 0.59816514]
 [0.60233918 0.38596491 0.31871345 0.31871345 0.78947368 0.3245614 ]]


In [16]:
def findObservedDataScore(x,D,log_p0,log_p1):
    '''
    Returns score  = [P(X1=x1|Y=0)...P(Xn=xn|Y=0).P(Y=0), P(X1=x1|Y=1)...P(Xn=xn|Y=1).P(Y=1)] as a list with 2 elements
    '''
    #MAP estimate
    class0_score = x.dot(np.log10(D[0,:])) + (np.ones(x.shape)-x).dot(1-np.log10(D[0,:])) + log_p0
    class1_score = x.dot(np.log10(D[1,:])) + (np.ones(x.shape)-x).dot(1-np.log10(D[1,:])) + log_p1
    
    #MLE estimate
#     class0_score = x.dot(np.log10(D[0,:])) + (np.ones(x.shape)-x).dot(1-np.log10(D[0,:])) 
#     class1_score = x.dot(np.log10(D[1,:])) + (np.ones(x.shape)-x).dot(1-np.log10(D[1,:])) 
    
    return (class0_score,class1_score)                

In [17]:
def predict(X,D,logp0,logp1):
    M = X.shape[0]
    predictions = np.zeros((M,1))
    for m in range(M):
        if m%10==0:
            pass
#             print(f'm: {m}')
        scores = findObservedDataScore(X[m,:],D,logp0,logp1)
        if scores[0]>scores[1]:
            predictions[m] = 0
        else:
            predictions[m] = 1
    return predictions
    

In [18]:
def getErrorRate(yPred,yActual):        
    return np.sum(np.abs(yPred[:,0]-yActual))/yPred.size

In [19]:
(logp0,logp1) = np.log10(pY(yTrain,0)),np.log10(pY(yTrain,1))
y_pred_train_feature_set1 = predict(XTrainFeatureSet1,D_feature_set1,logp0,logp1)
y_pred_train_feature_set2 = predict(XTrainFeatureSet2,D_feature_set2,logp0,logp1)
y_pred_train_feature_set3 = predict(XTrainFeatureSet3,D_feature_set3,logp0,logp1)

In [20]:
error_rate_FestureSet1 = getErrorRate(y_pred_train_feature_set1,yTrain)
error_rate_FestureSet2 = getErrorRate(y_pred_train_feature_set2,yTrain)
error_rate_FestureSet3 = getErrorRate(y_pred_train_feature_set3,yTrain)
print('ERROR RATES:')
print(f'Feature Set 1: {error_rate_FestureSet1}')
print(f'Feature Set 2: {error_rate_FestureSet2}')
print(f'Feature Set 3: {error_rate_FestureSet3}')

ERROR RATES:
Feature Set 1: 0.2705749718151071
Feature Set 2: 0.26719278466741825
Feature Set 3: 0.26268320180383314


In [21]:
print('SUCCESS RATES:')
print(f'Feature Set 1: {1-error_rate_FestureSet1}')
print(f'Feature Set 2: {1-error_rate_FestureSet2}')
print(f'Feature Set 3: {1-error_rate_FestureSet3}')

SUCCESS RATES:
Feature Set 1: 0.7294250281848929
Feature Set 2: 0.7328072153325818
Feature Set 3: 0.7373167981961668


### Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
X = data.drop(labels=['Survived','Name'],axis=1)
X['Sex']=X['Sex'].apply(lambda X:0 if X=='male' else 0)
y = data['Survived']
clf.fit(X,y)
y_pred_GNB = clf.predict(X)
error_rate_GNB = np.sum(np.abs(y_pred_GNB-y))/y.size
print(f'Error Rate: {error_rate_GNB}')
print(f'Success Rate: {1-error_rate_GNB}')

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X,y)
y_pred_LR = clf.predict(X)
error_rate_LR = np.sum(np.abs(y_pred_LR-y))/y.size
print(f'Error Rate: {error_rate_LR}')
print(f'Success Rate: {1-error_rate_LR}')

### Estimate some Conditional Probabilities of the form P(Survival|Sex,Pclass)

In [None]:
def calculateSurvivalChanceGivenSexandPclass(data,Sex,Pclass):
    '''INPUT:
    data: pandas dataframe
    Sex : must be in ['male','female']
    Pclass: must be in [1,2,3]
    OUTPUT: P(Survival|Sex,Pclass) estimated using MLE
    '''
    assert Sex in ['male','female'], "Sex should be male/female"
    assert Pclass in [1,2,3], "Passenger Class should be 1/2/3"
    
    #Pure Numpy
    PclassCol = data.Pclass.to_numpy()
    SexCol = data.Sex.to_numpy()
    SurvivedCol = data.Survived.to_numpy()
    n_sex_pClass = np.sum((PclassCol==Pclass) & (SexCol==Sex))
    n_survived_sex_pclass = np.sum((PclassCol==Pclass) & (SexCol==Sex) & (SurvivedCol==1))
    
    #Pure Pandas
#     n_sex_pClass = data.loc[(data['Pclass']==Pclass) & (data['Sex']==Sex)].shape[0]
#     n_survived_sex_pclass = data.loc[(data['Pclass']==Pclass) & (data['Sex']==Sex) & (data['Survived']==1)].shape[0]
    
    return n_survived_sex_pclass/n_sex_pClass

In [None]:
calculateSurvivalChanceGivenSexandPclass(data,'male',1)