In [586]:
import pandas as pd
import numpy as np
from bokeh.plotting import figure, output_file, show

In [587]:
# Load the training data and add ten extra hard-coded (x, y) rows to it.
train_df = pd.read_csv("train.csv")
df2 = pd.DataFrame([[45, 100],[90,10],[90,10],[80,15],[95,10],[90,15],[75,10],[95,20],[40,90],[60,20]],columns=['x','y'])
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way and, like append, keeps each frame's original index labels.
# Rows with a missing value in any column are then dropped.
train_df = pd.concat([train_df, df2]).dropna(axis=0, how='any')
Y_train = train_df.y
X = train_df.x
# Number of training samples left after the NaN rows were removed.
m = Y_train.shape[0]
print("Y shape is : " , m)
train_df.head()

Y shape is :  709


Unnamed: 0,x,y
0,24.0,21.549452
1,50.0,47.464463
2,15.0,17.218656
3,38.0,36.586398
4,87.0,87.288984


In [588]:
#Plotting data
# Scatter plot of the training points, written to plot.html.
output_file("plot.html")
p = figure()
p.sizing_mode = 'scale_width'
# scatter() draws circle markers by default; circle(size=...) is deprecated
# in recent Bokeh releases in favour of scatter(size=...).
p.scatter(train_df.x,train_df.y,size=10,color='red',alpha=0.5)
p.xaxis.axis_label = "x"
p.yaxis.axis_label = "y"
show(p)

In [589]:
def computeCost(X,y,theta):
    """Return half the mean squared residual of the linear model ``X @ theta``.

    Parameters
    ----------
    X : array-like
        Left operand of the matrix product with ``theta``.
    y : array-like
        Target values; ``y.shape[0]`` is used as the number of samples.
    theta : array-like
        Model parameters.

    Returns
    -------
    The value ``(1 / (2 * n)) * sum((X @ theta - y) ** 2)``.
    """
    n_samples = y.shape[0]
    residuals = np.matmul(X, theta) - y
    scale = 1 / (2 * n_samples)
    return scale * np.sum(residuals ** 2)

In [590]:
def gradientDescent(X,y,theta,alpha,num_iter):
    """Run ``num_iter`` gradient-descent updates on a linear model.

    Each iteration computes the residuals ``X @ theta - y``, moves ``theta``
    against the averaged gradient scaled by ``alpha``, and records the cost
    of the updated parameters via ``computeCost``.

    Parameters
    ----------
    X : array-like
        Design matrix used in ``np.dot(X, theta)``.
    y : array-like
        Target values; ``y.shape[0]`` is the averaging denominator.
    theta : array-like
        Starting parameters (not modified in place; a new array is rebound
        on every step).
    alpha : float
        Step size.
    num_iter : int
        Number of updates to perform.

    Returns
    -------
    tuple
        ``(theta, costs)`` where ``costs`` is a list holding the cost after
        each update.
    """
    # NOTE: the label says "shape" but the value printed is theta itself.
    print("initial theta shape is : ", theta)
    n_samples = y.shape[0]
    cost_history = []
    for _ in range(num_iter):
        residuals = np.dot(X, theta) - y
        gradient = np.dot(residuals, X) / n_samples
        theta = theta - (alpha * gradient)
        cost_history.append(computeCost(X, y, theta))
    print("theta is : ", theta)

    print("Cost is : " , computeCost(X, y, theta))
    return theta, cost_history

In [591]:
# 1-D feature column as a plain array. It is read from train_df rather than
# from X itself, so re-running this cell can never transpose a design matrix
# that has already been built (transposing a 1-D column is a no-op anyway).
X = train_df.x.values

In [592]:
# Design matrix: a column of ones (intercept term) next to the feature column.
# It is built from train_df.x instead of from X itself, so re-running the cell
# does not keep prepending additional columns of ones.
X = np.c_[np.ones(m), train_df.x]

In [593]:
# Sanity checks, one per output line: design-matrix shape, sum of all its
# entries, and whether any target value is NaN.
print(X.shape, np.sum(X), np.isnan(Y_train).any(), sep="\n")

(709, 2)
36429.0
False


In [594]:
# Settings for the gradient-descent runs.
alphas = [0.0005]      # learning rates to try
iterations = 10 ** 5   # number of updates per run
# One parameter per design-matrix column, all initialised to 1.0.
theta = np.ones((X.shape[1],))
print(theta.shape)

(2,)


In [595]:
# Cost at the current theta (the all-ones start when cells are run in order).
computeCost(X=X, y=Y_train, theta=theta)

38.14298112953331

In [596]:
# Per-learning-rate results: cost history and final parameters, keyed by alpha.
J_dic, theta_dic = {}, {}
for i, alpha in enumerate(alphas):
    # Every run starts again from an all-ones parameter vector.
    theta, J = gradientDescent(X, Y_train, np.ones(X.shape[1]), alpha, iterations)
    J_dic[alpha], theta_dic[alpha] = J, theta

initial theta shape is :  [1. 1.]
theta is :  [1.03462365 0.96512935]
Cost is :  36.1492102891318


In [597]:
#Plotting J against iterations data
# One cost curve per learning rate, written to Jplot.html.
output_file("Jplot.html")
p1 = figure()
p1.sizing_mode = 'scale_width'
# The x axis is the same for every curve, so build it once.
iteration_axis = list(range(1,iterations+1))
for color, alpha in zip(['red','green','blue','yellow','purple'],alphas):
    # legend_label replaces the old `legend=` glyph keyword, which newer
    # Bokeh releases no longer accept.
    p1.line(iteration_axis,J_dic[alpha],line_width=2,color=color,
            alpha=0.8,muted_color=color, muted_alpha=0.2,legend_label=str(alpha))
p1.xaxis.axis_label = "Iterations"
p1.yaxis.axis_label = "J"
p1.legend.location = "top_right"
# Clicking a legend entry mutes (fades) the corresponding curve.
p1.legend.click_policy="mute"
show(p1)

In [598]:
# Predict on the test set with the parameters learned for alpha = 0.0005.
test_df = pd.read_csv("test.csv")
# Surface rows whose target is missing; a bare expression in the middle of a
# cell is evaluated and thrown away, so the check has to print to be visible.
missing_y = test_df.loc[test_df.y.isnull()]
if not missing_y.empty:
    print("Rows with missing y in test set : ", len(missing_y))
m_test = test_df.y.shape[0]
# Same layout as the training design matrix: ones column, then the feature.
X_test = np.c_[np.ones(m_test), test_df.x]
predict = np.dot(X_test,theta_dic[0.0005])
# (actual, predicted) pairs in row order.
correct = list(zip(test_df.y.values, predict))
print(correct)

[(79.77515201, 75.34958354601447), (23.17727887, 21.302339987197406), (25.60926156, 22.267469336461996), (17.85738813, 20.337210637932817), (41.84986439, 35.779280226166264), (9.805234876, 15.511563891609864), (58.87465933, 60.87264330704562), (97.61793701, 92.7219118327771), (18.39512747, 20.337210637932817), (8.746747654, 5.860270398963959), (2.811415826, 4.895141049699368), (17.09537241, 19.372081288668227), (95.14907176, 93.68704118204168), (61.38800663, 60.87264330704562), (40.24701716, 35.779280226166264), (14.82248589, 15.511563891609864), (66.95806869, 63.768031354839394), (16.63507984, 14.546434542345272), (90.65513736, 85.00087703866038), (77.22982636, 67.62854875189774), (92.11906278, 86.93113573718955), (46.91387709, 50.256220465135115), (89.82634442, 86.93113573718955), (21.71380347, 27.09311608278495), (97.41206981, 94.65217053130628), (57.01631363, 57.01212590998725), (78.31056542, 77.27984224454364), (19.1315097, 21.302339987197406), (93.03483388, 90.79165313424792), (2

In [599]:
def slope_intercept(x,y,theta=None):
    """Return the fitted line's slope and intercept, rounded to 2 decimals.

    Parameters
    ----------
    x, y
        Not used in the result; kept so existing two-argument calls
        continue to work.
    theta : sequence, optional
        Parameters laid out as ``[intercept, slope]``. When omitted, the
        globally stored ``theta_dic[0.0005]`` is used.

    Returns
    -------
    tuple
        ``(slope, b)`` where ``b`` is the intercept.
    """
    if theta is None:
        # Default to the parameters learned with learning rate 0.0005.
        theta = theta_dic[0.0005]
    slope = round(theta[1],2)
    b = round(theta[0],2)
    return slope,b

In [600]:
# Unpack the rounded slope and intercept of the fitted line.
slope, b = slope_intercept(x=X, y=Y_train)

In [601]:
# Fitted y for every training x, computed in one vectorised expression rather
# than a per-element np.dot on scalars; converted back to a list so the
# variable keeps its original type.
reg_line = (slope * train_df.x.values + b).tolist()

In [602]:
#Plotting data
# Training points with the fitted regression line on top, written to plot2.html.
output_file("plot2.html")
p2 = figure()
p2.sizing_mode = 'scale_width'
# scatter() draws circle markers by default; circle(size=...) is deprecated
# in recent Bokeh releases in favour of scatter(size=...).
p2.scatter(train_df.x,train_df.y,size=10,color='red',alpha=0.5)
p2.line(train_df.x,reg_line)
p2.xaxis.axis_label = "x"
p2.yaxis.axis_label = "y"
show(p2)