# We will apply 'Linear Regression with Gradient Descent' to predict the progression of diabetes in patients, and compare our implementation against scikit-learn's `LinearRegression`.

In [1]:
import numpy as np
from sklearn import datasets,linear_model, metrics

In [2]:
# Load scikit-learn's bundled diabetes dataset and keep a handle on its feature matrix.
diabetes = datasets.load_diabetes()
diabetes_X = diabetes.data  # feature matrix; its shape is the first thing printed below

# One print call, one item per line: matrix shape, the dataset's keys,
# the full description text, and the feature names.
print(
    diabetes_X.shape,
    list(diabetes),
    diabetes.DESCR,
    diabetes.feature_names,
    sep="\n",
)

(442, 10)
['data', 'target', 'DESCR', 'feature_names']
Diabetes dataset

Notes
-----

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

Data Set Characteristics:

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attributes:
    :Age:
    :Sex:
    :Body mass index:
    :Average blood pressure:
    :S1:
    :S2:
    :S3:
    :S4:
    :S5:
    :S6:

Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).

Source URL:
http://www4.stat.ncsu.edu/~boos/var.select/diabetes.html

For more information see:
Bradley Efr

In [3]:
# Show the full feature matrix, followed by a 120-character separator line.
print(diabetes_X, "=" * 120, sep="\n")

[[ 0.03807591  0.05068012  0.06169621 ... -0.00259226  0.01990842
  -0.01764613]
 [-0.00188202 -0.04464164 -0.05147406 ... -0.03949338 -0.06832974
  -0.09220405]
 [ 0.08529891  0.05068012  0.04445121 ... -0.00259226  0.00286377
  -0.02593034]
 ...
 [ 0.04170844  0.05068012 -0.01590626 ... -0.01107952 -0.04687948
   0.01549073]
 [-0.04547248 -0.04464164  0.03906215 ...  0.02655962  0.04452837
  -0.02593034]
 [-0.04547248 -0.04464164 -0.0730303  ... -0.03949338 -0.00421986
   0.00306441]]


In [4]:
# Hold out the last 20 rows of the feature matrix for testing; train on everything before them.
diabetes_X_train, diabetes_X_test = diabetes_X[:-20], diabetes_X[-20:]

In [5]:
# Show the training feature rows, followed by a 120-character separator line.
print(diabetes_X_train, "=" * 120, sep="\n")

[[ 0.03807591  0.05068012  0.06169621 ... -0.00259226  0.01990842
  -0.01764613]
 [-0.00188202 -0.04464164 -0.05147406 ... -0.03949338 -0.06832974
  -0.09220405]
 [ 0.08529891  0.05068012  0.04445121 ... -0.00259226  0.00286377
  -0.02593034]
 ...
 [-0.02004471 -0.04464164 -0.0547075  ... -0.03949338 -0.07408887
  -0.0052198 ]
 [ 0.02354575 -0.04464164 -0.03638469 ...  0.03430886 -0.03324879
   0.06105391]
 [ 0.03807591  0.05068012  0.0164281  ...  0.07120998  0.04976866
   0.01549073]]


In [6]:
# Show the held-out test feature rows, followed by a 120-character separator line.
print(diabetes_X_test, "=" * 120, sep="\n")

[[-0.07816532  0.05068012  0.07786339  0.05285819  0.07823631  0.0644473
   0.02655027 -0.00259226  0.04067226 -0.00936191]
 [ 0.0090156   0.05068012 -0.03961813  0.0287581   0.03833367  0.0735286
  -0.07285395  0.1081111   0.01556684 -0.04664087]
 [ 0.00175052  0.05068012  0.01103904 -0.01944209 -0.01670444 -0.00381907
  -0.04708248  0.03430886  0.02405258  0.02377494]
 [-0.07816532 -0.04464164 -0.04069594 -0.08141377 -0.10063757 -0.11279473
   0.02286863 -0.0763945  -0.02028875 -0.05078298]
 [ 0.03081083  0.05068012 -0.03422907  0.0436772   0.05759701  0.06883138
  -0.03235593  0.05755657  0.03546194  0.08590655]
 [-0.03457486  0.05068012  0.00564998 -0.00567061 -0.07311851 -0.06269098
  -0.00658447 -0.03949338 -0.04542096  0.03205916]
 [ 0.04897352  0.05068012  0.08864151  0.0872869   0.03558177  0.02154596
  -0.02499266  0.03430886  0.06604821  0.13146972]
 [-0.04183994 -0.04464164 -0.03315126 -0.02288496  0.04658939  0.04158746
   0.05600338 -0.02473293 -0.02595242 -0.03835666]
 [

In [7]:
# Split the targets the same way as the features: last 20 values for testing, the rest for training.
diabetes_Y_train, diabetes_Y_test = diabetes.target[:-20], diabetes.target[-20:]

In [8]:
# Show the training targets, followed by a 120-character separator line.
print(diabetes_Y_train, "=" * 120, sep="\n")

[151.  75. 141. 206. 135.  97. 138.  63. 110. 310. 101.  69. 179. 185.
 118. 171. 166. 144.  97. 168.  68.  49.  68. 245. 184. 202. 137.  85.
 131. 283. 129.  59. 341.  87.  65. 102. 265. 276. 252.  90. 100.  55.
  61.  92. 259.  53. 190. 142.  75. 142. 155. 225.  59. 104. 182. 128.
  52.  37. 170. 170.  61. 144.  52. 128.  71. 163. 150.  97. 160. 178.
  48. 270. 202. 111.  85.  42. 170. 200. 252. 113. 143.  51.  52. 210.
  65. 141.  55. 134.  42. 111.  98. 164.  48.  96.  90. 162. 150. 279.
  92.  83. 128. 102. 302. 198.  95.  53. 134. 144. 232.  81. 104.  59.
 246. 297. 258. 229. 275. 281. 179. 200. 200. 173. 180.  84. 121. 161.
  99. 109. 115. 268. 274. 158. 107.  83. 103. 272.  85. 280. 336. 281.
 118. 317. 235.  60. 174. 259. 178. 128.  96. 126. 288.  88. 292.  71.
 197. 186.  25.  84.  96. 195.  53. 217. 172. 131. 214.  59.  70. 220.
 268. 152.  47.  74. 295. 101. 151. 127. 237. 225.  81. 151. 107.  64.
 138. 185. 265. 101. 137. 143. 141.  79. 292. 178.  91. 116.  86. 122.
  72. 

In [9]:
# Show the held-out test targets, followed by a 120-character separator line.
print(diabetes_Y_test, "=" * 120, sep="\n")

[233.  91. 111. 152. 120.  67. 310.  94. 183.  66. 173.  72.  49.  64.
  48. 178. 104. 132. 220.  57.]


In [10]:
# Reference model: linear regression fitted with scikit-learn

In [11]:
# Create an unfitted scikit-learn LinearRegression estimator with its default settings;
# it is trained on the training split in the next cell.
regr = linear_model.LinearRegression()

In [12]:
# Fit the estimator on the training split. The call is left as the cell's last
# expression so the notebook displays the fitted estimator's repr.
regr.fit(X=diabetes_X_train, y=diabetes_Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [13]:
# Predict the target for each held-out test row with the fitted scikit-learn model,
# then show the predictions followed by a separator line.
diabetes_Y_prediction = regr.predict(diabetes_X_test)
print(diabetes_Y_prediction, "=" * 120, sep="\n")

[197.61846908 155.43979328 172.88665147 111.53537279 164.80054784
 131.06954875 259.12237761 100.47935157 117.0601052  124.30503555
 218.36632793  61.19831284 132.25046751 120.3332925   52.54458691
 194.03798088 102.57139702 123.56604987 211.0346317   52.60335674]


In [14]:
# Show the weights learned by scikit-learn; the custom `end` appends the
# separator line right after the coefficient array.
print("Coefficients: \n", regr.coef_, end="\n" + "=" * 120 + "\n")

Coefficients: 
 [ 3.03499549e-01 -2.37639315e+02  5.10530605e+02  3.27736980e+02
 -8.14131709e+02  4.92814588e+02  1.02848452e+02  1.84606489e+02
  7.43519617e+02  7.60951722e+01]


In [15]:
# Test-set mean squared error of the scikit-learn model. The result is kept in
# `mean_squared_error` because the final comparison cell reads it.
mean_squared_error = metrics.mean_squared_error(
    y_true=diabetes_Y_test,
    y_pred=diabetes_Y_prediction,
)
print("Mean squared error: %.2f" % mean_squared_error, "=" * 120, sep="\n")

Mean squared error: 2004.57


In [16]:
###### Our own implementation.
# Fit the linear model  Y ~ X.dot(w) + b  on the training split with
# full-batch gradient descent.

# train datasets
X = diabetes_X_train
Y = diabetes_Y_train

# Hyperparameters
learning_rate = 0.1
iteration = 100000     # number of full-batch parameter updates
report_every = 5000    # print the training cost every this many iterations

# train : initialization of weight vector and constant term.
# A fixed seed makes the random starting weights -- and therefore every number
# printed below -- reproducible across kernel restarts. RandomState is used
# (rather than default_rng) so the cell also runs on older NumPy releases.
rng = np.random.RandomState(42)
# One weight per feature column of the matrix we actually train on.
w = rng.uniform(low = -0.1, high = 1.0, size=X.shape[1])
b = 0.0
print(w.shape)

n_samples = len(X)  # loop-invariant, so computed once

# Train : Gradient Descent
for i in range(iteration):
    # calculate predictions
    Y_prediction = X.dot(w) + b

    # calculate error and cost (mean squared error of the current parameters,
    # i.e. measured before this iteration's update)
    error = Y - Y_prediction
    cost = np.mean(np.power(error, 2))

    # calculate gradients.
    # These are the gradients of (1/2) * MSE: d/dw = -(1/n) * X^T.error and
    # d/db = -(1/n) * sum(error). The gradient of the reported `cost` (plain MSE)
    # is exactly twice this, so the missing factor 2 only rescales the step size.
    w_grad = (-1.0/n_samples) * error.dot(X)
    b_grad = (-1.0/n_samples) * np.sum(error)

    # Update Parameters
    w = w - (learning_rate * w_grad)
    b = b - (learning_rate * b_grad)

    # output between iteration
    if i % report_every == 0:
        print("Iteration %d: %f" %(i,cost))
print("="*120)

(10,)
Iteration 0: 29457.766322
Iteration 5000: 3048.215994
Iteration 10000: 2941.405937
Iteration 15000: 2927.438598
Iteration 20000: 2924.730065
Iteration 25000: 2923.771511
Iteration 30000: 2923.171412
Iteration 35000: 2922.670053
Iteration 40000: 2922.206943
Iteration 45000: 2921.765227
Iteration 50000: 2921.339276
Iteration 55000: 2920.926691
Iteration 60000: 2920.526121
Iteration 65000: 2920.136610
Iteration 70000: 2919.757401
Iteration 75000: 2919.387856
Iteration 80000: 2919.027426
Iteration 85000: 2918.675632
Iteration 90000: 2918.332056
Iteration 95000: 2917.996325


In [17]:
# Evaluate the gradient-descent model on the held-out test split.
X, Y = diabetes_X_test, diabetes_Y_test

# Same forward pass and mean-squared-error formula as in training,
# now applied with the learned w and b.
Y_prediction = np.dot(X, w) + b
error = Y - Y_prediction
cost = np.power(error, 2).mean()

# Report the learned weights and the test MSE, then the true targets and the
# predictions, each framed by separator lines.
print("Coefficients: \n", w)
print("Mean Squared error: %.2f" % cost)
print("=" * 120, Y, "=" * 120, Y_prediction, "=" * 120, sep="\n")

Coefficients: 
 [   3.66006742 -234.66640652  519.39196825  325.58468095 -176.51757693
  -16.13796656 -179.84702279  108.16019983  502.92563023   78.96710669]
Mean Squared error: 1993.53
[233.  91. 111. 152. 120.  67. 310.  94. 183.  66. 173.  72.  49.  64.
  48. 178. 104. 132. 220.  57.]
[198.01150706 150.64326026 169.40862326 108.26928335 162.77618783
 132.31680964 260.65034248 100.65881006 115.12043393 121.49015083
 218.64795611  60.79988552 135.46325478 121.31493916  53.84695323
 192.83267373 103.81217522 126.11162785 208.97176938  45.95723966]


In [18]:
# Calculate percentage error: the gap between scikit-learn's test MSE and our
# own test MSE, relative to our own MSE.
# The denominator is the live `cost` value rather than a number copied from an
# earlier run's output, so the result stays correct whenever the model is retrained.
Error = (mean_squared_error - cost) / cost * 100
print("Error Percentage of sklearn and own implentation: %.3f"%Error)

Error Percentage of sklearn and own implentation: 0.554
