In [276]:
import numpy as np
import pandas as pd

In [277]:
csv_path = "C://Users/saurmisr/Downloads/titanic.csv"
data = pd.read_csv(csv_path)

In [278]:
data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [279]:
#Checking for missing data for all relevant features and the label
print(data['Pclass'].isna().sum())
print(data['Siblings/Spouses Aboard'].isna().sum())
print(data['Parents/Children Aboard'].isna().sum())
print(data['Age'].isna().sum())
print(data['Fare'].isna().sum())
print(data['Survived'].isna().sum())

0
0
0
0
0
0


### FEATURE SET 1 ASSUMPTIONS:
#### 1. We shall only consider the Passenger Class, Sex, Siblings/Spouses and Parents/Children columns as relevant indicators

In [280]:
yTrain = data['Survived'].to_numpy()
PclassCol = data['Pclass'].to_numpy()
siblings_or_spousesCol = data['Siblings/Spouses Aboard'].to_numpy()
parents_or_childrenCol = data['Parents/Children Aboard'].to_numpy()
sexCol = data['Sex'].apply(lambda X: 1 if X=='male' else 0).to_numpy()
XTrainFeatureSet1 = np.column_stack((PclassCol,siblings_or_spousesCol,parents_or_childrenCol,sexCol))
(m,n) = XTrainFeatureSet1.shape
print(f'(m,n): {(m,n)}')

(m,n): (887, 4)


### FEATURE SET 2 ASSUMPTIONS:
#### 1. We shall only consider the Passenger Class, Sex,Age,Siblings/Spouses and Parents/Children columns as relevant indicators
#### 2. We shall convert Age to a Boolean feature. 

In [281]:
meanAge = data['Age'].mean()
medianAge = data['Age'].median()
print(f'Mean Age: {meanAge}')
print(f'Median Age: {medianAge}')
AgeCol = data['Age'].apply(lambda X: 1 if X<40 else 0)
XTrainFeatureSet2 = np.column_stack((PclassCol,siblings_or_spousesCol,parents_or_childrenCol,sexCol,AgeCol))
(m,n) = XTrainFeatureSet2.shape
print(f'(m,n): {(m,n)}')

Mean Age: 29.471443066516347
Median Age: 28.0
(m,n): (887, 5)


### FEATURE SET 3 ASSUMPTIONS:
#### 1. We shall only consider the Passenger Class, Sex,Age,Siblings/Spouses, Parents/Children and Fare columns as relevant indicators
#### 2. We shall convert Age and Fare to Boolean features. 

In [282]:
meanFare =data['Fare'].mean() 
medianFare = data["Fare"].median()
print(f'Mean fare: {meanFare}')
print(f'Median Fare: {medianFare}')
FareCol = data['Fare'].apply(lambda X: 1 if X<medianFare else 0)
XTrainFeatureSet3 = np.column_stack((PclassCol,siblings_or_spousesCol,parents_or_childrenCol,sexCol,AgeCol,FareCol))
(m,n) = XTrainFeatureSet3.shape
print(f'(m,n): {(m,n)}')

Mean fare: 32.30542018038328
Median Fare: 14.4542
(m,n): (887, 6)


In [283]:
def pY(Y, C):
    '''Returns the MLE-estimated probability that Y is in class C 
    '''
    N=Y.size
    return np.sum(Y==C)/N

In [292]:
def findMaxFeatureSetSize(X):
    J=0
    n = X.shape[1]
    for featureId in range(n):
        N_distinct_feature_values = np.size(np.unique(X[:,featureId]))
        if J< N_distinct_feature_values:
            J = N_distinct_feature_values
    return J
    

In [285]:
def getFeatureSpace(X):
    n=X.shape[1]
    J = findMaxFeatureSetSize(X)
    S = np.zeros((n,J))
    for featureId in range(n):
        unique_feature_values  = np.unique(X[:,featureId])
        N_values = unique_feature_values.size
        S[featureId,:N_values] = unique_feature_values
    
    return S

In [286]:
def tabulate_pX_given_Y(X,Y,beta_values):
    '''
    Returns the ML-estimated conditional probabilities Theta(i,j,k) = [P(Xi=xj|Y=k)], Theta(i,j,k) = P(Xi=xj|Y=k)
    Theta is an nxJxK numpy array where Thetak(i,j,k) = P(Xi=xj|Y=k)  0<=i<n , 0<=j<J , 0<=k<K
    '''     
    classes = np.unique(Y)
    K = classes.size
    n = X.shape[1]
    J= findMaxFeatureSetSize(X)
    featureSpace = getFeatureSpace(X)
    Theta  = np.zeros((n,J,K))
    for k in range(K):  
        row_ids_where_Y_equals_k = np.argwhere(Y==classes[k]) # Find the row numbers where Y=y 
        Ny = row_ids_where_Y_equals_k.shape[0] 
        for i in range(n):
            for m in range(Ny):
                rowId = row_ids_where_Y_equals_k[m][0]
                j = np.where(featureSpace[i,:]==X[rowId,i])[0][0]
                Theta[i,j,k]+=1
        Theta[:,:,k]/=Ny
        
    return Theta

In [293]:
Theta1 = tabulate_pX_given_Y(XTrainFeatureSet1,yTrain,None)
Theta2 = tabulate_pX_given_Y(XTrainFeatureSet2,yTrain,None)
Theta3 = tabulate_pX_given_Y(XTrainFeatureSet3,yTrain,None)

In [298]:
def findObservedDataScore(x,Theta,logProb,featureSpace):
    '''
    Returns scores  = [P(X1=x1|Y=1)...P(Xn=xn|Y=1).P(Y=1),...,P(X1=x1|Y=K)...P(Xn=xn|Y=K).P(Y=K)], a 1xK numpy array
    '''
    n,J,K = Theta.shape
    scores = np.zeros(K)
    for k in range(K):
        score=0
        for i in range(n):
            j = np.where(featureSpace[i,:]==x[i])[0][0]
            score+=np.log(Theta[i,j,k])
        score+=logProb[k]
        scores[k]=score
    
    return scores              

In [299]:
S1 = getFeatureSpace(XTrainFeatureSet1)
S2 = getFeatureSpace(XTrainFeatureSet2)
S3 = getFeatureSpace(XTrainFeatureSet3)
logProb = np.log10(pY(yTrain,0)),np.log10(pY(yTrain,1))

In [300]:
def predict(X,Theta,logProb,featureSpace):
    M = X.shape[0]
    predictions = np.zeros((M,1))
    for m in range(M):
        scores = findObservedDataScore(X[m,:],Theta,logProb,featureSpace)
        if scores[0]>scores[1]:
            predictions[m] = 0
        else:
            predictions[m] = 1
    return predictions
    

In [301]:
y_pred_train_1 = predict(XTrainFeatureSet1,Theta1,logProb,S1)
y_pred_train_2 = predict(XTrainFeatureSet2,Theta2,logProb,S2)
y_pred_train_3 = predict(XTrainFeatureSet3,Theta3,logProb,S3)

  # This is added back by InteractiveShellApp.init_path()


In [None]:
def getErrorRate(yPred,yActual):        
    return np.sum(np.abs(yPred[:,0]-yActual))/yPred.size

In [302]:
print('ERROR RATES:')
print(f'FEATURE SET 1 : {getErrorRate(y_pred_train_1,yTrain)} ')
print(f'FEATURE SET 2 : {getErrorRate(y_pred_train_2,yTrain)} ')
print(f'FEATURE SET 3 : {getErrorRate(y_pred_train_3,yTrain)} ')

ERROR RATES:
FEATURE SET 1 : 0.20969560315670802 
FEATURE SET 2 : 0.21195039458850057 
FEATURE SET 3 : 0.24126268320180383 


In [303]:
print('SUCCESS RATES:')
print(f'FEATURE SET 1 : {1-getErrorRate(y_pred_train_1,yTrain)} ')
print(f'FEATURE SET 2 : {1-getErrorRate(y_pred_train_2,yTrain)} ')
print(f'FEATURE SET 3 : {1-getErrorRate(y_pred_train_3,yTrain)} ')

SUCCESS RATES:
FEATURE SET 1 : 0.790304396843292 
FEATURE SET 2 : 0.7880496054114994 
FEATURE SET 3 : 0.7587373167981961 
