In [None]:
import os

import numpy as np
import pandas as pd

In [None]:
# Location of the Titanic CSV. The TITANIC_CSV environment variable, when set, overrides
# the machine-specific default below so the notebook can run on other machines.
csv_path = os.environ.get("TITANIC_CSV", "C://Users/saurmisr/Downloads/titanic.csv")
data = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the loaded DataFrame.
data.head()

In [None]:
#Checking for missing data for all relevant features and the label
def checkMissingData(X,cols):
    '''Reports how many missing entries each requested column contains.

    INPUT : X - a pandas DataFrame (any object for which X[col].isna().sum() works),
            cols - list/tuple/array of column names of X to inspect
    OUTPUT: Prints the number of missing entries in each column of cols, followed by
            the total over all of cols. Returns None.
    '''
    total_missing_entries = 0
    for col in cols:
        # Count on the frame that was passed in, not on a notebook-level variable.
        missing_in_col = X[col].isna().sum()
        total_missing_entries += missing_in_col
        # Report this column's own count rather than the running total.
        print(f'column {col}: Found {missing_in_col} missing entries.')

    print(f'Found a total of {total_missing_entries} missing entries in the data, corresponding to columns {cols}.')

In [None]:
def pY(Y, k):
    '''Maximum-likelihood estimate of the class prior P(Y=k).

    Y is an array of labels; the result is the fraction of its entries equal to k.
    '''
    is_class_k = (Y == k)
    return is_class_k.sum() / Y.size

In [None]:
def findMaxFeatureSetSize(X,discreteFeatureIds):
    '''
    Returns J, the largest number of distinct values taken by any of the discrete features.
    INPUT : X, an mxn numpy array of data,
            discreteFeatureIds - a list of the column numbers of X corresponding to discrete features
    OUTPUT: J, an int; 0 when discreteFeatureIds is empty
    '''
    # Number of distinct values observed in each discrete column.
    distinct_counts = [np.unique(X[:, featureId]).size for featureId in discreteFeatureIds]
    return max(distinct_counts, default=0)
    

In [None]:
def getFeatureSpace(X,discreteFeatureIds):
    '''
    Returns the feature space for discrete features
    INPUT : X, an mxn numpy array of data,
            discreteFeatureIds - a list of the column numbers of X corresponding to discrete features
    OUTPUT: S is an nDxJ numpy array, where nD = len(discreteFeatureIds) and J is the largest
            number of distinct values among those features. S[i,:] holds the sorted distinct
            values of the ith discrete feature; rows with fewer than J values are
            right-padded with zeros.
    '''
    nD = len(discreteFeatureIds)
    # Compute each feature's distinct values once; J is the largest such count.
    unique_values_per_feature = [np.unique(X[:, featureId]) for featureId in discreteFeatureIds]
    J = max((values.size for values in unique_values_per_feature), default=0)
    # One row per discrete feature: size by nD, not by a variable from outside the function.
    S = np.zeros((nD, J))
    for i, values in enumerate(unique_values_per_feature):
        S[i, :values.size] = values

    return S

In [None]:
def tabulate_pX_given_Y(X,Y,beta_values,discreteFeatureIds):
    '''
    Tabulates the ML-estimated conditional probabilities Theta(i,j,k) = P(Xi=xj|Y=k)
    for the discrete-valued features, as relative frequencies within each class.
     INPUT : X, an mxn numpy array of data,
             Y, a numpy array of labels (one per row of X),
             beta_values : accepted for MAP estimation but not used by this implementation
             discreteFeatureIds - a list of the column numbers of X corresponding to discrete features
    OUTPUT: Theta is an nDxJxK numpy array where Theta(i,j,k) = P(Xi=xj|Y=k)  0<=i<nD , 0<=j<J , 0<=k<K.
            k indexes np.unique(Y); j indexes row i of getFeatureSpace(X,discreteFeatureIds).
    '''
    labels = np.unique(Y)
    nD = len(discreteFeatureIds)
    J = findMaxFeatureSetSize(X,discreteFeatureIds)
    featureSpace = getFeatureSpace(X,discreteFeatureIds)
    Theta = np.zeros((nD, J, labels.size))
    for k, label in enumerate(labels):
        # Row numbers of the observations belonging to this class.
        rows = np.argwhere(Y == label)[:, 0]
        for i, column in enumerate(discreteFeatureIds):
            for value in X[rows, column]:
                # Position of the observed value within feature i's value list.
                j = np.flatnonzero(featureSpace[i, :] == value)[0]
                Theta[i, j, k] += 1
        # Turn the per-class counts into relative frequencies.
        Theta[:, :, k] /= rows.size

    return Theta

In [None]:
def getSampleMeans(X,Y, continuousFeatureIds):
    '''
    Computes the class-conditional sample mean of every continuous feature.
    INPUT : X, an mxn numpy array of data,
            Y, a numpy array of labels (one per row of X),
            continuousFeatureIds - a list of the column numbers of X corresponding to continuous features
    OUTPUT: means is an nCxK numpy array where means[i,k] is the mean of the ith continuous
            feature over the rows whose label is the kth entry of np.unique(Y)
    '''
    labels = np.unique(Y)
    means = np.zeros((len(continuousFeatureIds), labels.size))
    for i, column in enumerate(continuousFeatureIds):
        for k, label in enumerate(labels):
            # Keep only the rows of class `label`, then average the chosen column.
            rows_of_class = X[(Y == label)]
            means[i, k] = rows_of_class[:, column].mean()
    return means

In [None]:
def getSampleVariances(X,Y, continuousFeatureIds):
    '''
    Computes the class-conditional sample variance of every continuous feature.
    INPUT : X, an mxn numpy array of data,
            Y, a numpy array of labels (one per row of X),
            continuousFeatureIds - a list of the column numbers of X corresponding to continuous features
    OUTPUT:
    var is an nCxK numpy array where var[i,k] is the sample variance (computed with ddof=1)
    of the ith continuous feature over the rows whose label is the kth entry of np.unique(Y)
    '''
    labels = np.unique(Y)
    var = np.zeros((len(continuousFeatureIds), labels.size))
    for i, column in enumerate(continuousFeatureIds):
        for k, label in enumerate(labels):
            # Keep only the rows of class `label`, then take the variance of the chosen column.
            rows_of_class = X[(Y == label)]
            var[i, k] = rows_of_class[:, column].var(ddof=1)
    return var

In [None]:
def getScoreForContinuousFeature(x,mean,variance):
    '''
    Finds the class-conditional score for a continuous feature
    INPUT : x - the value of the continuous feature
            mean - the class-conditional sample mean for the continuous feature
            variance - the class-conditional sample variance for the continuous feature
    OUTPUT: the natural log of the Normal(mean,variance) density evaluated at x, i.e.
            -0.5*log(2*pi*variance) - (x-mean)^2/(2*variance)
    NOTE  : variance is not validated; a zero variance divides by zero.
    '''
    # The normalising constant of the Gaussian density is 1/sqrt(2*pi*variance).
    return -0.5*np.log(2*np.pi*variance) - ((x-mean)**2)/(2*variance)
    

In [None]:
def findObservedDataScore(x,Theta,logProb,featureSpace,discreteFeatureIds,continuousFeatureIds,means,variances):
    '''
    Finds the per-class log-score for one observation (X1,...,Xn)
    INPUT : x , a length-n numpy array holding one observation
            Theta, the nDxJxK numpy array of conditional probabilities for discrete features P(Xi=xj|Y=k)
            logProb, a length-K sequence of the log of the respective class probabilities
            featureSpace , the nDxJ numpy array of the feature space of the discrete features
            discreteFeatureIds, a list of the column numbers of x corresponding to discrete features
            continuousFeatureIds - a list of the column numbers of x corresponding to continuous features
            means - an nCxK numpy array of the class-conditional means for each continuous feature
            variances - an nCxK numpy array of the class-conditional variances for each continuous feature

    OUTPUT: scores, a length-K numpy array where
            scores[k] = sum over discrete features of log(Theta[i,j,k])
                        + sum over continuous features of getScoreForContinuousFeature(...)
                        + logProb[k]
    RAISES: IndexError when a discrete value of x does not occur in its row of featureSpace
    '''
    K = Theta.shape[2]

    # Row i of featureSpace/Theta belongs to the i-th *listed* discrete feature, whose value
    # lives in column discreteFeatureIds[i] of x. The lookup does not depend on the class,
    # so it is done once up front.
    value_positions = [
        np.where(featureSpace[i,:]==x[column])[0][0]
        for i, column in enumerate(discreteFeatureIds)
    ]

    scores = np.zeros(K)
    for k in range(K):
        score = 0
        # Contribution of the discrete-valued features
        for i, j in enumerate(value_positions):
            score += np.log(Theta[i,j,k])

        # Contribution of the continuous-valued features; row i of means/variances belongs
        # to the i-th listed continuous feature.
        for i, column in enumerate(continuousFeatureIds):
            score += getScoreForContinuousFeature(x[column],means[i,k],variances[i,k])

        score += logProb[k]
        scores[k] = score

    return scores

In [None]:
def predict(X,Theta,logProb,featureSpace,discreteFeatureIds,continuousFeatureIds,means,variances):
    '''
    Predicts a class for every observation by taking the highest-scoring class.
    INPUT : X , a mxn numpy array of observations
            Theta, the nDxJxK numpy array of conditional probabilities for discrete features P(Xi=xj|Y=k)
            logProb, a length-K sequence of the log of the respective class probabilities
            featureSpace , the nDxJ numpy array of the feature space of the discrete features
            discreteFeatureIds, a list of the column numbers of x corresponding to discrete features
            continuousFeatureIds - a list of the column numbers of x corresponding to continuous features
            means - an nCxK numpy array of the class-conditional means for each continuous feature
            variances - an nCxK numpy array of the class-conditional variances for each continuous feature

    OUTPUT:
    predictions - an mx1 float numpy array; entry m is the index (0..K-1) of the class with
                  the largest score for row m, as returned by findObservedDataScore
    '''
    M = X.shape[0]
    # Index of the best-scoring class for each row, in row order.
    winning_classes = [
        np.argmax(
            findObservedDataScore(
                X[m,:], Theta, logProb, featureSpace,
                discreteFeatureIds, continuousFeatureIds, means, variances,
            )
        )
        for m in range(M)
    ]
    return np.array(winning_classes, dtype=float).reshape(M, 1)
    

In [None]:
def getErrorRate(yPred,yActual):
    '''
    Returns the fraction of observations whose predicted class differs from the actual class.
    INPUT : yPred - an mx1 numpy array of predicted classes (the shape returned by predict)
            yActual - a length-m array of the actual classes
    OUTPUT: the misclassification rate, a float in [0,1]
    RAISES: ValueError when yPred and yActual do not hold the same number of labels
    '''
    predicted = np.asarray(yPred)[:,0]
    actual = np.asarray(yActual).ravel()
    if predicted.shape != actual.shape:
        raise ValueError(
            f'yPred has {predicted.size} predictions but yActual has {actual.size} labels.'
        )
    # Count mismatches instead of summing |pred-actual|: the two agree for 0/1 labels, but
    # only the mismatch count is an error rate when there are more than two classes.
    return np.mean(predicted != actual)

### GLOBAL PARAMETERS:

In [None]:
# Class priors log P(Y=k) for k = 0, 1, as a 2-tuple indexed by class.
# np.log (natural log) is used instead of np.log10 so the prior is in the same base as the
# np.log likelihood terms it is added to in findObservedDataScore.
# The labels are read directly from `data` (loaded above), so this cell does not depend on
# a variable that is only created by a later cell.
logProb = tuple(np.log(pY(data['Survived'].to_numpy(), k)) for k in (0, 1))

### FEATURE SET 1 ASSUMPTIONS:
#### 1. We shall only consider the Passenger Class, Sex, Siblings/Spouses and Parents/Children columns as relevant indicators

In [None]:
# Labels and the individual feature columns, pulled out of `data` as numpy arrays.
yTrain = data['Survived'].to_numpy()
PclassCol = data['Pclass'].to_numpy()
siblings_or_spousesCol = data['Siblings/Spouses Aboard'].to_numpy()
parents_or_childrenCol = data['Parents/Children Aboard'].to_numpy()
# Sex is encoded as 1 for 'male' and 0 for anything else.
sexCol = data['Sex'].apply(lambda X: 1 if X=='male' else 0).to_numpy()
# Column order of the design matrix: 0=Pclass, 1=siblings/spouses, 2=parents/children, 3=sex.
XTrainFeatureSet1 = np.column_stack((PclassCol,siblings_or_spousesCol,parents_or_childrenCol,sexCol))
# NOTE: m and n are notebook-level names; every feature-set cell reassigns them.
(m,n) = XTrainFeatureSet1.shape
print(f'Feature Set 1 size (m,n): {(m,n)}')
# Columns of `data` (label included) that are checked for missing entries.
columnNames_1 = ['Survived','Pclass','Sex','Siblings/Spouses Aboard','Parents/Children Aboard']
checkMissingData(data,columnNames_1)

# All four columns of XTrainFeatureSet1 are treated as discrete; none are continuous.
discrete_Features_1 = [0,1,2,3]
continuous_features_1 = []
beta_vals = None
# Conditional probability table and value lookup for the discrete features.
Theta1 = tabulate_pX_given_Y(XTrainFeatureSet1,yTrain,beta_vals,discrete_Features_1)
S1 = getFeatureSpace(XTrainFeatureSet1,discrete_Features_1)
# No continuous features in this set, so there are no means/variances to estimate.
(means_1,variances_1) = (None,None)

### FEATURE SET 2 ASSUMPTIONS:
#### 1. We shall only consider the Passenger Class, Sex, Age, Siblings/Spouses and Parents/Children columns as relevant indicators

In [None]:
#NOTE: This assumes that Feature Set 1 has been created
# (PclassCol, siblings_or_spousesCol, parents_or_childrenCol, sexCol and yTrain are reused from it.)
# AgeCol is kept as a pandas Series; np.column_stack below turns it into an array column.
AgeCol = data['Age']
# Column order: 0=Pclass, 1=siblings/spouses, 2=parents/children, 3=sex, 4=age.
XTrainFeatureSet2 = np.column_stack((PclassCol,siblings_or_spousesCol,parents_or_childrenCol,sexCol,AgeCol))
# NOTE: m and n are notebook-level names; every feature-set cell reassigns them.
(m,n) = XTrainFeatureSet2.shape
print(f'Feature Set 2 size (m,n): {(m,n)}')
# Columns of `data` (label included) that are checked for missing entries.
columnNames_2 = ['Survived','Pclass','Sex','Age','Siblings/Spouses Aboard','Parents/Children Aboard']
checkMissingData(data,columnNames_2)
# Columns 0-3 are treated as discrete; column 4 (age) is treated as continuous.
discrete_Features_2 = [0,1,2,3]
continuous_features_2 = [4]
beta_vals = None
# Discrete conditional probability table, then per-class mean/variance of the continuous column.
Theta2 = tabulate_pX_given_Y(XTrainFeatureSet2,yTrain,beta_vals,discrete_Features_2)
means_2 = getSampleMeans(XTrainFeatureSet2,yTrain,continuous_features_2)
variances_2 = getSampleVariances(XTrainFeatureSet2,yTrain,continuous_features_2)
S2 = getFeatureSpace(XTrainFeatureSet2,discrete_Features_2)

### FEATURE SET 3 ASSUMPTIONS:
#### 1. We shall only consider the Passenger Class, Sex, Age, Siblings/Spouses, Parents/Children and Fare columns as relevant indicators

In [None]:
#NOTE: This assumes that Feature Sets 1 and 2 have been created
# (the column variables, AgeCol and yTrain are reused from those cells.)
# FareCol is kept as a pandas Series; np.column_stack below turns it into an array column.
FareCol = data['Fare']
# Column order: 0=Pclass, 1=siblings/spouses, 2=parents/children, 3=sex, 4=age, 5=fare.
XTrainFeatureSet3 = np.column_stack((PclassCol,siblings_or_spousesCol,parents_or_childrenCol,sexCol,AgeCol,FareCol))
# NOTE: m and n are notebook-level names; every feature-set cell reassigns them.
(m,n) = XTrainFeatureSet3.shape
print(f'Feature Set 3 size (m,n): {(m,n)}')
# Columns of `data` (label included) that are checked for missing entries.
columnNames_3 = ['Survived','Pclass','Sex','Age','Siblings/Spouses Aboard','Parents/Children Aboard','Fare']
checkMissingData(data,columnNames_3)
# Columns 0-3 are treated as discrete; columns 4 (age) and 5 (fare) are treated as continuous.
discrete_Features_3 = [0,1,2,3]
continuous_features_3 = [4,5]
beta_vals = None
# Discrete conditional probability table, then per-class mean/variance of the continuous columns.
Theta3 = tabulate_pX_given_Y(XTrainFeatureSet3,yTrain,beta_vals,discrete_Features_3)
means_3 = getSampleMeans(XTrainFeatureSet3,yTrain,continuous_features_3)
variances_3 = getSampleVariances(XTrainFeatureSet3,yTrain,continuous_features_3)
S3 = getFeatureSpace(XTrainFeatureSet3,discrete_Features_3)

### PREDICTIONS

In [None]:
# Predict on the training rows themselves, once per feature set. Each call pairs a design
# matrix with the parameters that were estimated from that same matrix.
y_pred_train_1 = predict(
    XTrainFeatureSet1, Theta1, logProb, S1,
    discrete_Features_1, continuous_features_1, means_1, variances_1,
)
y_pred_train_2 = predict(
    XTrainFeatureSet2, Theta2, logProb, S2,
    discrete_Features_2, continuous_features_2, means_2, variances_2,
)
y_pred_train_3 = predict(
    XTrainFeatureSet3, Theta3, logProb, S3,
    discrete_Features_3, continuous_features_3, means_3, variances_3,
)

In [None]:
# Training-set error rate of each feature set, computed once and reported twice:
# first as the error rate, then as its complement (the success rate).
train_error_rates = [
    getErrorRate(y_pred, yTrain)
    for y_pred in (y_pred_train_1, y_pred_train_2, y_pred_train_3)
]
print('ERROR RATES:')
print(f'FEATURE SET 1 : {train_error_rates[0]} ')
print(f'FEATURE SET 2 : {train_error_rates[1]} ')
print(f'FEATURE SET 3 : {train_error_rates[2]} ')
print('SUCCESS RATES:')
print(f'FEATURE SET 1 : {1-train_error_rates[0]} ')
print(f'FEATURE SET 2 : {1-train_error_rates[1]} ')
print(f'FEATURE SET 3 : {1-train_error_rates[2]} ')

### SK-LEARN

In [None]:
from sklearn.naive_bayes import GaussianNB
# Baseline: scikit-learn's Gaussian Naive Bayes, fitted and scored on the same rows.
clf = GaussianNB()
# Every column except the label and the passenger name is used as a feature.
X = data.drop(labels=['Survived','Name'],axis=1)
# Encode Sex as 1 for 'male' and 0 otherwise, the same encoding used for sexCol.
# Returning 0 from both branches would turn Sex into a constant, uninformative column.
X['Sex']=X['Sex'].apply(lambda sex: 1 if sex=='male' else 0)
y = data['Survived']
clf.fit(X,y)
y_pred_GNB = clf.predict(X)
# With 0/1 labels, the sum of absolute differences is the number of misclassified rows.
error_rate_GNB = np.sum(np.abs(y_pred_GNB-y))/y.size
print(f'Error Rate: {error_rate_GNB}')
print(f'Success Rate: {1-error_rate_GNB}')

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Second baseline: logistic regression on the X and y built in the GaussianNB cell,
# fitted and scored on the same rows. fit() returns the fitted estimator itself.
clf = LogisticRegression().fit(X,y)
y_pred_LR = clf.predict(X)
# With 0/1 labels, the sum of absolute differences is the number of misclassified rows.
error_rate_LR = np.abs(y_pred_LR-y).sum()/y.size
print(f'Error Rate: {error_rate_LR}')
print(f'Success Rate: {1-error_rate_LR}')