## http://scikit-learn.org/stable/modules/linear_model.html

## Include dependencies

In [176]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from sklearn import linear_model

## Load the dataset

In [177]:
diabetes = datasets.load_diabetes()

## Understand the dataset

In [None]:
print("Input dataset")
print(diabetes.data)
print("\n\nShape of data:\n " + str(diabetes.data.shape) + "\n\n")
print("Features:\n " + str(diabetes.feature_names) + "\n\n")
print("Target:\n " + str(diabetes.target.view()) + "\n\n")
print("Target length:\n " + str(len(diabetes.target)))

## Select 1 variable only for modeling

In [None]:
diabetes_X = diabetes.data[:, np.newaxis, 4]
print("Shape of input dataset: " + str(diabetes_X.shape))

## Prepare to Model

In [None]:
# Split the data into training/testing sets
diabetes_X_train = diabetes_X[:-20]
diabetes_X_test = diabetes_X[-20:]
print("train size: " + str(diabetes_X_train.shape))
print("test size: " + str(diabetes_X_test.shape))

# Split the targets into training/testing sets
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test = diabetes.target[-20:]

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(diabetes_X_train, diabetes_y_train)

## Trained model

In [None]:
print (regr)
print ("Co-efficient: (m) " + str(regr.coef_))
print ("Intercept: (c) " + str(regr.intercept_))
print ("Line equation: y=m*x+c")

## Predict the test set

In [155]:
diabetes_y_pred = regr.predict(diabetes_X_test)

In [None]:
print (len(diabetes_y_pred))
print(diabetes_y_pred)

## How good is our model?

In [None]:
print("Mean squared error: %.2f" % mean_squared_error(diabetes_y_test, diabetes_y_pred))
print('Coefficient of determination (R^2): %.2f' % r2_score(diabetes_y_test, diabetes_y_pred))

## Interpret and plot

In [None]:
# Plot outputs
plt.scatter(diabetes_X_test, diabetes_y_test,  color='black')
plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3)

plt.xticks(())
plt.yticks(())

plt.show()

## Can we do better? Let's use a little complicated model (Ridge Regression)

http://scikit-learn.org/stable/modules/linear_model.html#ridge-regression

In [159]:
reg = linear_model.Ridge (alpha = .5)
reg.fit(diabetes_X_train, diabetes_y_train);

In [None]:
print (reg)
print ("")
print ("Co-efficient: (m) " + str(reg.coef_))
print ("Intercept: (c) " + str(reg.intercept_))
print ("Line equation: y=m*x+c")

In [None]:
ridge_y_pred = regr.predict(diabetes_X_test)
print("Mean squared error: %.2f" % mean_squared_error(diabetes_y_test, ridge_y_pred))
print('Coefficient of determination (R^2): %.2f' % r2_score(diabetes_y_test, ridge_y_pred))

## It did not make any difference.
### Time to add more features

In [None]:
diabetes = datasets.load_diabetes()
diabetes_X = diabetes.data
print("Shape of input dataset: " + str(diabetes_X.shape))

In [None]:
X_train = diabetes_X[:-20]
X_test = diabetes_X[-20:]
print("train size: " + str(X_train.shape))
print("test size: " + str(X_test.shape))

# Split the targets into training/testing sets
y_train = diabetes.target[:-20]
y_test = diabetes.target[-20:]

In [None]:
reg = linear_model.LinearRegression()

# Train the model using the training sets
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

print (reg)
print ("")
print ("Co-efficient: (m) " + str(reg.coef_))
print ("Intercept: (c) " + str(reg.intercept_))
print ("Line equation: y=m*x+c")
print ("")
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
print('Coefficient of determination (R^2): %.2f' % r2_score(y_test, y_pred))


## More complicated models
https://xgboost.readthedocs.io/en/latest/tutorials/model.html