In [179]:
#Importing the libraries required
import math
import random

# NOTE: this binds the top-level matplotlib package (not matplotlib.pyplot) to the name plt.
import matplotlib as plt
import numpy as np
import pandas as pd
import sklearn
from scipy.optimize import minimize
from scipy.stats import multivariate_normal
from sklearn.linear_model import LinearRegression

<b>PREPARING THE DATA FOR THE MODEL</b>

In [180]:
#Build the training dataframe from the data provided in the table and add
#normally distributed noise to the Salary column.
header=["Gender_ID","Degree","Age","Salary"]

#Fixed seed so that the noise (and every number derived from it further down)
#is the same on every Restart & Run All.
NOISE_SEED=0
#The noise array with five elements (mean 0, standard deviation 1), to be added to the salary column.
noise=np.random.default_rng(NOISE_SEED).normal(0,1,5)

#One inner list per column, in the same order as `header`; the frame is
#transposed below so that each inner list becomes a column.
training_data=[
    [-1,-1,+1,-1,+1],
    [2,3,1,1,2],
    [36,47,26,68,33],
    [89.563,123.543,23.989,138.769,113.888]
]


df=pd.DataFrame(training_data).transpose()
df.columns=header


#Adding noise to the data (element-wise, one noise value per row).
df["Salary"]=df["Salary"]+noise

#Converting the data types of each column to integer, except the salary column.
#astype returns a new frame, so the result has to be assigned back to keep the conversion.
data_types={"Gender_ID":int,"Degree":int,"Age":int}
df=df.astype(data_types)
df

Unnamed: 0,Gender_ID,Degree,Age,Salary
0,-1,2,36,89.633172
1,-1,3,47,123.979728
2,1,1,26,21.949434
3,-1,1,68,138.377424
4,1,2,33,113.772651


In [181]:
#Three nested attribute matrices, each cast to integers:
#  X1 -> Age only, X2 -> Age and Degree, X3 -> Age, Degree and Gender_ID.
#Selecting with a list of column names already yields a 2-D (rows, columns) array.
X1=df[["Age"]].to_numpy().astype("int")
X2=df[["Age","Degree"]].to_numpy().astype("int")
X3=df[["Age","Degree","Gender_ID"]].to_numpy().astype("int")
#The target: the Salary values as a single-column 2-D array (copied, so it is independent of df).
Y=df["Salary"].to_numpy(copy=True).reshape(-1,1)

<b>STRAIGHT CURVE FITTING REGRESSION</b>

In [201]:
#Straight curve fitting regression on the Age-only data (X1), using the
#LinearRegression estimator from scikit-learn.
regr=LinearRegression().fit(X1,Y)
#The score evaluating the linear fit for X1 data.
score_SCF_X1=regr.score(X1,Y)
#prediction_SCF_X1 holds the model's prediction for each of the five training rows.
#predict returns one single-element row per sample, so each row is unwrapped to a scalar.
prediction_SCF_X1=[row[0] for row in regr.predict(X1)]
print("The prediction by the SCF model on data X1:")
print(prediction_SCF_X1)
print("\n")

#Prediction of the salary for age 60, as asked in the question.
y_test=regr.predict(np.array([[60]]))
print("The salary predicted by the straight curve when the age is 60 is:",y_test[0][0])


The prediction by the SCF model on data X1:
[84.66668391655118, 108.2723134326932, 63.20702072005845, 153.33760614532792, 78.22878495760337]


The salary predicted by the straight curve when the age is 60 is: 136.16987558813372


In [199]:
#Straight curve fitting regression on the Age + Degree data (X2).
#The predictions are stored as prediction_<Model_used>_<dataset>.
regr=LinearRegression().fit(X2,Y)
score_SCF_X2=regr.score(X2,Y)
#Unwrap the single-element rows returned by predict into a flat list.
prediction_SCF_X2=[row[0] for row in regr.predict(X2)]
print("The prediction by the SCF model on data X2:")
print(prediction_SCF_X2)

The prediction by the SCF model on data X2:
[89.12372023206069, 139.95875451124527, 40.555243081927685, 135.7506425020944, 82.32404884490593]


In [202]:
#Straight curve fitting regression on the Age + Degree + Gender_ID data (X3).
regr=LinearRegression().fit(X3,Y)
score_SCF_X3=regr.score(X3,Y)
#Unwrap the single-element rows returned by predict into a flat list.
prediction_SCF_X3=[row[0] for row in regr.predict(X3)]
print("The prediction by the SCF model on data X3:")
print(prediction_SCF_X3)

The prediction by the SCF model on data X3:
[74.44198221172991, 140.9070536359166, 40.612896487830184, 136.6412882208632, 95.10918861589408]


<b>MLE REGRESSION MODEL</b>

In [185]:
#This is the function to calculate the likelihood function, given the predicted outcomes, the actual values and the number of samples. It returns the log of the probability.
def calcLogLikelihood(guess,true,n):
    """Return the Gaussian log-likelihood of the residuals ``true - guess``.

    The standard deviation of the noise is not passed in; it is estimated from
    the residuals themselves with ``np.std``.

    Parameters
    ----------
    guess : array-like
        Predicted values.
    true : array-like
        Observed values, same length as ``guess``.
    n : float
        Number of samples (the exponent of the normalising factor is ``n/2``).

    Returns
    -------
    float
        log[ (1/(2*pi*sigma^2))^(n/2) * exp(-error.error/(2*sigma^2)) ].
        Residuals with zero spread (sigma == 0) are not handled specially.
    """
    #error is how much the predicted values differ from the true values.
    #Converting to arrays lets plain lists be passed as well.
    error=np.asarray(true,dtype=float)-np.asarray(guess,dtype=float)
    sigma=np.std(error)
    variance=sigma**2
    #The likelihood is evaluated directly in log space: forming the raw
    #likelihood first underflows to 0 for poor fits and its log becomes -inf.
    log_normaliser=-(n/2.0)*np.log(2.0*math.pi*variance)
    log_exponent=-1*(np.dot(error.T,error)/(2.0*variance))
    return log_normaliser+log_exponent

In [186]:
#This cell has three different models, for the respective datasets. They take in the variable array and return the negative loglikelihood value for that set of variables.
def MLE_model1(var):
    """Negative log-likelihood of the Age-only linear model.

    ``var`` holds ``[intercept, slope]``; the salary is modelled as
    ``slope*Age + intercept`` for every row of ``df``.
    """
    ages=np.array(df["Age"])
    salaries=np.array(df["Salary"])
    #Predicted salary for every row at once.
    fitted=var[1]*ages+var[0]
    sample_count=float(len(fitted))
    #Negated so that minimising this value maximises the likelihood.
    return -1*calcLogLikelihood(fitted,salaries,sample_count)

def MLE_model2(var):
    """Negative log-likelihood of the Age + Degree linear model.

    ``var`` holds ``[intercept, age_coefficient, degree_coefficient]``; the
    salary is modelled as ``var[2]*Degree + var[1]*Age + var[0]`` for every
    row of ``df``.
    """
    #Plain arrays make the arithmetic positional, independent of df's index labels.
    x1,x2,y=np.array(df["Age"]),np.array(df["Degree"]),np.array(df["Salary"])
    #Predicted salary for every row at once.
    yguess=var[2]*x2+var[1]*x1+var[0]
    f=calcLogLikelihood(yguess,y,float(len(yguess)))
    #Negated so that minimising this value maximises the likelihood.
    return (-1*f)

def MLE_model3(var):
    """Negative log-likelihood of the Age + Degree + Gender_ID linear model.

    ``var`` holds ``[intercept, age_coefficient, degree_coefficient,
    gender_coefficient]``; the salary is modelled as
    ``var[3]*Gender_ID + var[2]*Degree + var[1]*Age + var[0]`` for every row
    of ``df``.
    """
    #Plain arrays make the arithmetic positional, independent of df's index labels.
    x1,x2,x3=np.array(df["Age"]),np.array(df["Degree"]),np.array(df["Gender_ID"])
    y=np.array(df["Salary"])
    #Predicted salary for every row at once.
    yguess=var[3]*x3+var[2]*x2+var[1]*x1+var[0]
    f=calcLogLikelihood(yguess,y,float(len(yguess)))
    #Negated so that minimising this value maximises the likelihood.
    return (-1*f)

In [203]:
#Estimation of the parameters of the Age-only model using the likelihood function.
#We minimize the -log likelihood function instead of maximizing the log likelihood function.
#`minimize` is imported together with the other libraries in the first cell.

#Arbitrarily picked starting point for the optimiser: [intercept, slope].
var=np.array([15.0,2.0])
nvar=len(var)

#This is the minimizing function
res1=minimize(MLE_model1,var,method="BFGS",options={"disp":False})
#res1.x gives the fitted parameters in an array
theta_MLE_1=res1.x
#Fitted salary for each training row: intercept + slope*Age.
prediction_MLE_X1=list(theta_MLE_1[0]+theta_MLE_1[1]*X1[:,0])
print("The prediction by the MLE model on data X1:")
print(prediction_MLE_X1)

The prediction by the MLE model on data X1:
[84.66669632483799, 108.27231462204306, 63.20704332737882, 153.3375859167073, 78.22880042560023]


In [204]:
#Maximum-likelihood fit of the Age + Degree model (minimising the negative log likelihood).
#`minimize` is imported together with the other libraries in the first cell.

#Arbitrarily picked starting point: [intercept, age coefficient, degree coefficient].
var=np.array([15.0,2.0,7.0])
nvar=len(var)

res2=minimize(MLE_model2,var,method="BFGS",options={"disp":False})
#res2.x gives the fitted parameters in an array
theta_MLE_2=res2.x
#Fitted salary for each training row; columns of X2 are Age, Degree.
prediction_MLE_X2=list(theta_MLE_2[0]+theta_MLE_2[1]*X2[:,0] + theta_MLE_2[2]*X2[:,1])
print("The prediction by the MLE model on data X2:")
print(prediction_MLE_X2)

The prediction by the MLE model on data X2:
[89.12369757540722, 139.95881887154366, 40.55513605714182, 135.75064672772592, 82.32401824179405]


In [207]:
#Maximum-likelihood fit of the Age + Degree + Gender_ID model (minimising the negative log likelihood).
#`minimize` is imported together with the other libraries in the first cell.

#Arbitrarily picked starting point: [intercept, age, degree, gender coefficients].
var=np.array([15.0,2.0,7.0,15.0])
nvar=len(var)

res3=minimize(MLE_model3,var,method="BFGS",options={"disp":False})
#res3.x gives the fitted parameters in an array
theta_MLE_3=res3.x
#Fitted salary for each training row; columns of X3 are Age, Degree, Gender_ID.
prediction_MLE_X3=list(theta_MLE_3[0]+theta_MLE_3[1]*X3[:,0] + theta_MLE_3[2]*X3[:,1] + theta_MLE_3[3]*X3[:,2])
print("The prediction by the MLE model on data X3:")
print(prediction_MLE_X3)


The prediction by the MLE model on data X3:
[74.44196909096411, 140.90710258407978, 40.612854903760805, 136.64124773400155, 95.10920587258315]


<b>MAP REGRESSION MODEL</b>

In [227]:
#This cell has the MAP models: the log of the prior probability of the parameters is added to the log likelihood calculated in the previous section.
#The prior distribution of the parameters has mean zero and a covariance matrix equal to the identity matrix multiplied by a constant.
def MAP_model1(var,prior_variance=2):
    """Negative of (log-likelihood + log-prior) for the Age-only linear model.

    ``var`` holds ``[intercept, slope]``; the salary is modelled as
    ``slope*Age + intercept`` for every row of ``df``.

    ``prior_variance`` scales the identity covariance matrix of the zero-mean
    normal prior placed on ``var``; the default 2 is the value that used to be
    hard-coded.
    """
    n=len(var)
    x,y=np.array(df["Age"]),np.array(df["Salary"])
    #Predicted salary for every row at once.
    yguess=var[1]*x+var[0]
    #The covariance matrix of the prior
    cov_mat=np.identity(n)*prior_variance
    #logpdf gives the log of the prior density directly; taking np.log of pdf
    #underflows to -inf once the parameters are far from zero.
    log_prior=multivariate_normal.logpdf(var,[0]*n,cov_mat)
    #Adding the log of the probability for that set of parameters
    f=calcLogLikelihood(yguess,y,float(len(yguess)))+log_prior
    return (-1*f)

def MAP_model2(var,prior_variance=2):
    """Negative of (log-likelihood + log-prior) for the Age + Degree linear model.

    ``var`` holds ``[intercept, age_coefficient, degree_coefficient]``; the
    salary is modelled as ``var[2]*Degree + var[1]*Age + var[0]`` for every
    row of ``df``.

    ``prior_variance`` scales the identity covariance matrix of the zero-mean
    normal prior placed on ``var``; the default 2 is the value that used to be
    hard-coded.
    """
    n=len(var)
    #Plain arrays make the arithmetic positional, independent of df's index labels.
    x1,x2,y=np.array(df["Age"]),np.array(df["Degree"]),np.array(df["Salary"])
    #Predicted salary for every row at once.
    yguess=var[2]*x2+var[1]*x1+var[0]
    #The covariance matrix of the prior
    cov_mat=np.identity(n)*prior_variance
    #logpdf gives the log of the prior density directly; taking np.log of pdf
    #underflows to -inf once the parameters are far from zero.
    log_prior=multivariate_normal.logpdf(var,[0]*n,cov_mat)
    #Adding the log of the probability for that set of parameters
    f=calcLogLikelihood(yguess,y,float(len(yguess)))+log_prior
    return (-1*f)

def MAP_model3(var,prior_variance=2):
    """Negative of (log-likelihood + log-prior) for the Age + Degree + Gender_ID model.

    ``var`` holds ``[intercept, age_coefficient, degree_coefficient,
    gender_coefficient]``; the salary is modelled as
    ``var[3]*Gender_ID + var[2]*Degree + var[1]*Age + var[0]`` for every row
    of ``df``.

    ``prior_variance`` scales the identity covariance matrix of the zero-mean
    normal prior placed on ``var``; the default 2 is the value that used to be
    hard-coded.
    """
    n=len(var)
    #Plain arrays make the arithmetic positional, independent of df's index labels.
    x1,x2,x3=np.array(df["Age"]),np.array(df["Degree"]),np.array(df["Gender_ID"])
    y=np.array(df["Salary"])
    #Predicted salary for every row at once.
    yguess=var[3]*x3+var[2]*x2+var[1]*x1+var[0]
    #The covariance matrix of the prior
    cov_mat=np.identity(n)*prior_variance
    #logpdf gives the log of the prior density directly; taking np.log of pdf
    #underflows to -inf once the parameters are far from zero.
    log_prior=multivariate_normal.logpdf(var,[0]*n,cov_mat)
    #Adding the log of the probability for that set of parameters
    f=calcLogLikelihood(yguess,y,float(len(yguess)))+log_prior
    return (-1*f)

In [249]:
#MAP fit of the Age-only model: minimise the negative of (log likelihood + log prior)
#and predict the salary for the training rows.
#`minimize` is imported together with the other libraries in the first cell.

#Arbitrarily picked starting point for the optimiser: [intercept, slope].
var=np.array([15.0,2.0])
nvar=len(var)

res1=minimize(MAP_model1,var,method="BFGS",options={"disp":False})
#res1.x gives the fitted parameters in an array
theta_MAP_1=res1.x
#Fitted salary for each training row: intercept + slope*Age.
prediction_MAP_X1=list(theta_MAP_1[0]+theta_MAP_1[1]*X1[:,0])
print("The prediction by the MAP model on data X1:")
print(prediction_MAP_X1)

The prediction by the MAP model on data X1:
[80.13312234650918, 104.60033806061931, 57.89019897004543, 151.31047715119317, 73.46024533357006]


In [250]:
#MAP fit of the Age + Degree model: minimise the negative of (log likelihood + log prior).
#`minimize` is imported together with the other libraries in the first cell.

#Arbitrarily picked starting point: [intercept, age coefficient, degree coefficient].
var=np.array([15.0,2.0,7.0])
nvar=len(var)

res2=minimize(MAP_model2,var,method="BFGS",options={"disp":False})
#res2.x gives the fitted parameters in an array
theta_MAP_2=res2.x
#Fitted salary for each training row; columns of X2 are Age, Degree.
prediction_MAP_X2=list(theta_MAP_2[0]+theta_MAP_2[1]*X2[:,0] + theta_MAP_2[2]*X2[:,1])
print("The prediction by the MAP model on data X2:")
print(prediction_MAP_X2)

The prediction by the MAP model on data X2:
[80.37904688323187, 105.0413413002471, 57.930814788700545, 150.92143233302474, 73.73685991578014]


In [251]:
#MAP fit of the Age + Degree + Gender_ID model: minimise the negative of (log likelihood + log prior).
#`minimize` is imported together with the other libraries in the first cell.

#Arbitrarily picked starting point: [intercept, age, degree, gender coefficients].
var=np.array([15.0,2.0,7.0,15.0])
nvar=len(var)

res3=minimize(MAP_model3,var,method="BFGS",options={"disp":False})
#res3.x gives the fitted parameters in an array
theta_MAP_3=res3.x
#Fitted salary for each training row; columns of X3 are Age, Degree, Gender_ID.
prediction_MAP_X3=list(theta_MAP_3[0]+theta_MAP_3[1]*X3[:,0] + theta_MAP_3[2]*X3[:,1] + theta_MAP_3[3]*X3[:,2])
print("The prediction by the MAP model on data X3:")
print(prediction_MAP_X3)


The prediction by the MAP model on data X3:
[80.40197240906491, 105.06118740061162, 57.89056513544832, 150.9351937137546, 73.69466977385676]


<b> THE SUMMARY </b>

In [234]:
#Flatten the single-column target array Y into a plain list of salaries.
y=[target_row[0] for target_row in Y]
print("The correct values of y, as given in the data are:",y)

The correct values of y, as given in the data are: [89.63317232669303, 123.9797275078149, 21.94943434658975, 138.37742423400192, 113.7726507571345]


In [232]:
#Side-by-side predictions of the three models for the Age-only data.
print("When age is the only parameter:")
for caption,predictions in (
    ("\t Predictions by the Straight curve fit model are:",prediction_SCF_X1),
    ("\t Predictions by the MLE model are:",prediction_MLE_X1),
    ("\t Predictions by the MAP model are:",prediction_MAP_X1),
):
    print(caption, predictions)

When age is the only parameter:
	 Predictions by the Straight curve fit model are: [84.66668391655118, 108.2723134326932, 63.20702072005845, 153.33760614532792, 78.22878495760337]
	 Predictions by the MLE model are: [84.66669632483799, 108.27231462204306, 63.20704332737882, 153.3375859167073, 78.22880042560023]
	 Predictions by the MAP model are: [80.13312234650918, 104.60033806061931, 57.89019897004543, 151.31047715119317, 73.46024533357006]


In [235]:
#Side-by-side predictions of the three models for the Age + Degree data.
print("When age, degree are the parameters:")
for caption,predictions in (
    ("\t Predictions by the Straight curve fit model are:",prediction_SCF_X2),
    ("\t Predictions by the MLE model are:",prediction_MLE_X2),
    ("\t Predictions by the MAP model are:",prediction_MAP_X2),
):
    print(caption, predictions)

When age, degree are the parameters:
	 Predictions by the Straight curve fit model are: [89.12372023206069, 139.95875451124527, 40.555243081927685, 135.7506425020944, 82.32404884490593]
	 Predictions by the MLE model are: [89.12369757540722, 139.95881887154366, 40.55513605714182, 135.75064672772592, 82.32401824179405]
	 Predictions by the MAP model are: [80.37904688323187, 105.0413413002471, 57.930814788700545, 150.92143233302474, 73.73685991578014]


In [236]:
#Side-by-side predictions of the three models for the Age + Degree + Gender_ID data.
print("When age, degree and gender are the parameters:")
for caption,predictions in (
    ("\t Predictions by the Straight curve fit model are:",prediction_SCF_X3),
    ("\t Predictions by the MLE model are:",prediction_MLE_X3),
    ("\t Predictions by the MAP model are:",prediction_MAP_X3),
):
    print(caption, predictions)

When age, degree and gender are the parameters:
	 Predictions by the Straight curve fit model are: [74.44198221172991, 140.9070536359166, 40.612896487830184, 136.6412882208632, 95.10918861589408]
	 Predictions by the MLE model are: [74.44196909096411, 140.90710258407978, 40.612854903760805, 136.64124773400155, 95.10920587258315]
	 Predictions by the MAP model are: [80.40197240906491, 105.06118740061162, 57.89056513544832, 150.9351937137546, 73.69466977385676]


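The predictions of the MLE and the straight curve fitting models agree to about three decimal places. The MAP models give different answers, most visibly once Degree and Gender_ID are included. The predictions of the MAP models also depend on the value of lambda, the constant that multiplies the identity matrix to form the covariance matrix of the prior.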

In [246]:
#Mean squared error of the MAP and MLE predictions against the salaries in y, for data X1.
sq_error_MAP_X1=[(predicted-actual)**2 for predicted,actual in zip(prediction_MAP_X1,y)]
mean_sq_error_MAP_X1=sum(sq_error_MAP_X1)/len(y)
print("The Mean squared error for MAP Model using data X1 is: ", mean_sq_error_MAP_X1)
sq_error_MLE_X1=[(predicted-actual)**2 for predicted,actual in zip(prediction_MLE_X1,y)]
mean_sq_error_MLE_X1=sum(sq_error_MLE_X1)/len(y)
print("The Mean squared error for MLE Model using data X1 is: ", mean_sq_error_MLE_X1)

The Mean squared error for MAP Model using data X1 is:  709.9808270966814
The Mean squared error for MLE Model using data X1 is:  692.1501472431992


In [247]:
#Mean squared error of the MAP and MLE predictions against the salaries in y, for data X2.
sq_error_MAP_X2=[(predicted-actual)**2 for predicted,actual in zip(prediction_MAP_X2,y)]
mean_sq_error_MAP_X2=sum(sq_error_MAP_X2)/len(y)
print("The Mean squared error for MAP Model using data X2 is: ", mean_sq_error_MAP_X2)
sq_error_MLE_X2=[(predicted-actual)**2 for predicted,actual in zip(prediction_MLE_X2,y)]
mean_sq_error_MLE_X2=sum(sq_error_MLE_X2)/len(y)
print("The Mean squared error for MLE Model using data X2 is: ", mean_sq_error_MLE_X2)

The Mean squared error for MAP Model using data X2 is:  699.8355471742492
The Mean squared error for MLE Model using data X2 is:  319.53590172540703


In [248]:
#Mean squared error of the MAP and MLE predictions against the salaries in y, for data X3.
sq_error_MAP_X3=[(predicted-actual)**2 for predicted,actual in zip(prediction_MAP_X3,y)]
mean_sq_error_MAP_X3=sum(sq_error_MAP_X3)/len(y)
print("The Mean squared error for MAP Model using data X3 is: ", mean_sq_error_MAP_X3)
sq_error_MLE_X3=[(predicted-actual)**2 for predicted,actual in zip(prediction_MLE_X3,y)]
mean_sq_error_MLE_X3=sum(sq_error_MLE_X3)/len(y)
print("The Mean squared error for MLE Model using data X3 is: ", mean_sq_error_MLE_X3)

The Mean squared error for MAP Model using data X3 is:  699.7666456182988
The Mean squared error for MLE Model using data X3 is:  243.3940866826898


<b> CONCLUSION </b>

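The MLE and Straight Curve fitting models give almost the same result. 
The MAP model, on the other hand, gives a different prediction. 
The results from the MAP model also depend on the value of lambda, the constant that scales the covariance matrix of the prior. 
While the MAP model is expected to perform better, in this case the MLE gives a lower mean squared error on the training data and therefore a better fit to it. 
This could be due to the fact that the evaluation does not use a zero-one loss. In addition, the errors above are measured on the same five rows that were used for fitting, where the straight-curve (least-squares) solution, which the MLE matches, has the smallest possible squared error, so the MAP estimate cannot score lower on this measure. 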