In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [20]:
# Load the CSV dataset from disk and preview its first five rows.
data_frame = pd.read_csv(filepath_or_buffer="dataset.csv")
data_frame.head(n=5)

Unnamed: 0,Email,Address,Avatar,Avg. Session Length,Time on App,Time on Website,Length of Membership,Yearly Amount Spent
0,mstephenson@fernandez.com,"835 Frank Tunnel\nWrightmouth, MI 82180-9605",Violet,34.497268,12.655651,39.577668,4.082621,587.951054
1,hduke@hotmail.com,"4547 Archer Common\nDiazchester, CA 06566-8576",DarkGreen,31.926272,11.109461,37.268959,2.664034,392.204933
2,pallen@yahoo.com,"24645 Valerie Unions Suite 582\nCobbborough, D...",Bisque,33.000915,11.330278,37.110597,4.104543,487.547505
3,riverarebecca@gmail.com,"1414 David Throughway\nPort Jason, OH 22070-1220",SaddleBrown,34.305557,13.717514,36.721283,3.120179,581.852344
4,mstephens@davidson-herman.com,"14023 Rodriguez Passage\nPort Jacobville, PR 3...",MediumAquaMarine,33.330673,12.795189,37.536653,4.446308,599.406092


In [21]:
# Report how many rows the data frame holds.
print(data_frame.shape[0])

500


In [22]:
# Drop the email, address and avatar columns; they are not helpful for the
# linear regression model.
# Rebinding the name (instead of inplace=True) together with errors="ignore"
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# because the columns have already been removed.
data_frame = data_frame.drop(columns=["Email", "Address", "Avatar"], errors="ignore")
data_frame.head()

Unnamed: 0,Avg. Session Length,Time on App,Time on Website,Length of Membership,Yearly Amount Spent
0,34.497268,12.655651,39.577668,4.082621,587.951054
1,31.926272,11.109461,37.268959,2.664034,392.204933
2,33.000915,11.330278,37.110597,4.104543,487.547505
3,34.305557,13.717514,36.721283,3.120179,581.852344
4,33.330673,12.795189,37.536653,4.446308,599.406092


In [23]:
# Coerce every column to a numeric dtype; values that cannot be parsed become NaN.
# DataFrame.apply returns a NEW frame, so the result has to be assigned back,
# otherwise the conversion is silently discarded.
data_frame = data_frame.apply(pd.to_numeric, errors="coerce")
# Drop rows with missing values (including any NaN produced by the coercion above),
# then drop exact duplicate rows. Rebinding instead of inplace=True keeps the
# cell re-runnable and makes the data lineage explicit.
data_frame = data_frame.dropna().drop_duplicates()
print(len(data_frame))

500


In [24]:
# Split the frame into model inputs and the value to predict, by column position.
# NOTE(review): the selection is positional, so it depends on the column order
# of data_frame at this point — confirm if the upstream columns ever change.
features = data_frame.iloc[:, 0:4].values  # every row, column positions 0 through 3
target_column = data_frame.iloc[:, 4].values  # every row, column position 4
print(target_column)

[587.95105397 392.20493344 487.54750487 581.85234404 599.40609205
 637.10244792 521.57217476 549.90414611 570.20040896 427.1993849
 492.60601272 522.33740461 408.64035107 573.41586733 470.4527333
 461.7807422  457.84769594 407.70454755 452.31567548 605.0610388
 534.70574381 419.93877484 436.51560573 519.34098913 700.91709162
 423.17999168 619.89563986 486.83893477 529.53766534 554.72208383
 497.5866713  447.68790654 588.71260551 491.07322368 507.44183234
 521.88357317 347.77692663 490.73863214 478.17033405 537.84619527
 532.75178758 501.87443028 591.19717818 547.24434342 448.22982919
 549.86059046 593.91500297 563.67287336 479.73194909 416.35835358
 725.58481406 442.66725174 384.62657157 451.45744687 522.40414126
 483.67330802 520.89879445 453.16950235 496.65070807 547.36514059
 616.85152297 507.212569   613.59932337 483.15972078 540.26340041
 765.51846194 553.60153468 469.3108615  408.62018783 451.57568516
 444.96655165 595.8228367  418.1500811  534.7771881  578.24160506
 478.71935687

In [27]:
# Hold out 20% of the rows for testing (an 80/20 split) using scikit-learn.
# A fixed random_state makes the shuffle, and therefore the split, repeatable
# from run to run.
(
    feature_training,
    feature_testing,
    target_training,
    target_testing,
) = train_test_split(
    features,
    target_column,
    test_size=0.2,
    random_state=1,
)

# Sanity check: print the row count of each partition to confirm the 80/20 ratio.
print(len(target_training))
print(len(target_testing))

400
100


In [34]:
def closed_form_solution(feature_Matrix, target_Vector):
    """Return the ordinary-least-squares coefficients for a linear model.

    Solves ``min_theta ||feature_Matrix @ theta - target_Vector||^2``, i.e. the
    same problem as the normal equation ``theta = (X^T X)^-1 X^T y``, but
    without forming or explicitly inverting ``X^T X``. Explicit inversion loses
    precision on ill-conditioned data and fails outright when the features are
    collinear; ``np.linalg.lstsq`` handles both cases and returns the
    minimum-norm solution when ``X^T X`` is singular.

    Parameters
    ----------
    feature_Matrix : array-like of shape (n_samples, n_features)
        Design matrix. Include a column of ones if an intercept is wanted.
    target_Vector : array-like of shape (n_samples,) or (n_samples, n_targets)
        Observed target values, one row per sample.

    Returns
    -------
    numpy.ndarray
        Coefficients of shape (n_features,) or (n_features, n_targets).

    Raises
    ------
    ValueError
        If ``feature_Matrix`` is not 2-D, or the two inputs do not have the
        same number of rows.
    """
    design = np.asarray(feature_Matrix, dtype=float)
    targets = np.asarray(target_Vector, dtype=float)

    if design.ndim != 2:
        raise ValueError("feature_Matrix must be 2-dimensional")
    if targets.ndim not in (1, 2) or targets.shape[0] != design.shape[0]:
        raise ValueError(
            "feature_Matrix and target_Vector must have the same number of rows"
        )

    # rcond=None selects NumPy's machine-precision based cutoff for small
    # singular values (and avoids the FutureWarning on older NumPy releases).
    theta, _residuals, _rank, _singular_values = np.linalg.lstsq(
        design, targets, rcond=None
    )
    return theta

In [29]:
# Inspect the training feature matrix produced by the train/test split.
print(feature_training)

[[31.26064687 13.26676035 36.9711951   2.26725111]
 [32.14906052 10.04731474 37.18144731  3.53508843]
 [33.81173341 11.18680872 36.29889308  4.3019965 ]
 ...
 [35.74266981 10.88982828 35.56543624  6.11519895]
 [33.17720467 11.62277717 35.96889569  3.63409373]
 [32.86532717 11.98441752 37.0443614   3.45238858]]


In [31]:
# The closed-form solution needs an intercept (bias) term, so prepend a column
# of 1s to the training feature matrix.
feature_training_bias = np.hstack(
    (np.ones((len(feature_training), 1)), feature_training)
)
print(feature_training_bias)

[[ 1.         31.26064687 13.26676035 36.9711951   2.26725111]
 [ 1.         32.14906052 10.04731474 37.18144731  3.53508843]
 [ 1.         33.81173341 11.18680872 36.29889308  4.3019965 ]
 ...
 [ 1.         35.74266981 10.88982828 35.56543624  6.11519895]
 [ 1.         33.17720467 11.62277717 35.96889569  3.63409373]
 [ 1.         32.86532717 11.98441752 37.0443614   3.45238858]]


In [36]:
# Fit the model: solve for the coefficient vector theta on the training data
# (the first entry is the bias, because the column of 1s was prepended).
theta = closed_form_solution(
    feature_Matrix=feature_training_bias,
    target_Vector=target_training,
)
print(theta)

[-1.04773921e+03  2.57885426e+01  3.88515047e+01  2.56384674e-01
  6.14920499e+01]


In [37]:
# Evaluate on the held-out data: the test features need the same leading
# column of 1s that was added to the training features.
feature_testing_bias = np.hstack(
    (np.ones((len(feature_testing), 1)), feature_testing)
)

# Predictions are the matrix-vector product of the biased test features and theta.
tested = feature_testing_bias @ theta
print(tested)

[498.7228681  519.59228344 563.13904069 478.91915652 423.41266838
 508.9555034  478.76101633 460.0534163  519.38968483 438.96457911
 487.62519476 525.92536473 577.18547575 553.7066461  685.60452684
 473.39014425 314.1945817  458.030198   452.4612251  461.03168755
 456.28708189 332.06929726 504.03498428 401.56582625 766.63520004
 426.08349764 513.37904373 476.9128933  306.33083848 590.22648801
 505.28052114 389.98117113 463.78603542 315.72068062 465.83129961
 549.68040427 620.1610288  591.95902606 495.80365357 402.73437857
 507.05480493 547.73248629 422.68457762 558.79136007 442.02693944
 432.71267857 430.23835177 507.05828181 425.57805969 401.99227084
 553.16953956 353.36551475 639.65600563 478.48345842 452.01425715
 621.7991521  325.77117572 491.50118041 411.66626102 549.18504412
 587.96534365 548.30580216 567.82460456 589.07491155 424.81143949
 581.71582004 601.91487334 474.32407438 444.68380443 539.9210953
 479.48959778 573.69657395 478.43264187 572.0490947  390.0305955
 398.1973499

In [38]:
# Mean squared error between the model's predictions and the true test targets.
mse_test = np.square(tested - target_testing).mean()
print(mse_test)

78.29571004007693
