In [16]:
#subject7
from sklearn.neighbors import KNeighborsRegressor
from sklearn.datasets import load_boston, load_diabetes
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
import pandas as pd

In [17]:
X, Y = load_boston().data, load_boston().target
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, shuffle=True, random_state=0)

In [18]:
knnr = KNeighborsRegressor(n_neighbors=3)
knnr.fit(X_train, Y_train)
print(f"train accuracy:{knnr.score(X_train, Y_train)}")
print(f"test accuracy:{knnr.score(X_test, Y_test)}")

train accuracy:0.7779302460306765
test accuracy:0.5545587353916865


In [19]:
cv = KFold(n_splits=3, shuffle=True, random_state=0)
param = {
    "n_neighbors": list(range(1, 11, 1))
}

gridcv = GridSearchCV(knnr, param_grid=param, n_jobs=-1, cv=cv)
gridcv.fit(X_train, Y_train)
print(f"score: {gridcv.best_params_}")

score: {'n_neighbors': 5}


In [20]:
knnr = KNeighborsRegressor(**gridcv.best_params_)
knnr.fit(X_train, Y_train)
print(f"optimized train accuracy:{knnr.score(X_train, Y_train)}")
print(f"optimized test accuracy:{knnr.score(X_test, Y_test)}")

optimized train accuracy:0.6904639276249327
optimized test accuracy:0.5166480241894266


In [21]:
diabetes = load_diabetes()
X, Y = diabetes.data, diabetes.target
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, shuffle=True, random_state=0)
df = pd.DataFrame(data=X, columns=diabetes.feature_names)
print(diabetes.DESCR)
#age age in years
#sex
#bmi body mass index
#bp average blood pressure
#s1 tc, total serum cholesterol
#s2 ldl, low-density lipoproteins
#s3 hdl, high-density lipoproteins
#s4 tch, total cholesterol / HDL
#s5 ltg, possibly log of serum triglycerides level
#s6 glu, blood sugar level
df

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - Age
      - Sex
      - Body mass index
      - Average blood pressure
      - S1
      - S2
      - S3
      - S4
      - S5
      - S6

Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).

Source URL:
https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html

For more information see:
Bra

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068330,-0.092204
2,0.085299,0.050680,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018118,0.044485
439,0.041708,0.050680,-0.015906,0.017282,-0.037344,-0.013840,-0.024993,-0.011080,-0.046879,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044528,-0.025930


In [22]:
knnr = KNeighborsRegressor(n_neighbors=3)
knnr.fit(X_train, Y_train)
print(f"train accuracy:{knnr.score(X_train, Y_train)}")
print(f"test accuracy:{knnr.score(X_test, Y_test)}")

train accuracy:0.6974616776390717
test accuracy:0.20606948137763026


In [23]:
cv = KFold(n_splits=3, shuffle=True, random_state=0)
param = {
    "n_neighbors": list(range(1, 11, 1))
}

gridcv = GridSearchCV(knnr, param_grid=param, n_jobs=-1, cv=cv)
gridcv.fit(X_train, Y_train)
print(f"score: {gridcv.best_params_}")

score: {'n_neighbors': 10}


In [24]:
knnr = KNeighborsRegressor(**gridcv.best_params_)
knnr.fit(X_train, Y_train)
print(f"optimized train accuracy:{knnr.score(X_train, Y_train)}")
print(f"optimized test accuracy:{knnr.score(X_test, Y_test)}")

optimized train accuracy:0.5625711472015148
optimized test accuracy:0.3081655505021582


In [28]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(n_estimators=100, max_depth=5)
rfr.fit(X_train, Y_train)
print(f"train accuracy:{rfr.score(X_train, Y_train)}")
print(f"test accuracy:{rfr.score(X_test, Y_test)}")

train accuracy:0.7712237142089345
test accuracy:0.3312601398192775


In [26]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, Y_train)
print(f"train accuracy:{lr.score(X_train, Y_train)}")
print(f"test accuracy:{lr.score(X_test, Y_test)}")

train accuracy:0.5539411781927148
test accuracy:0.39289398450747565


In [27]:
from sklearn.svm import SVR
svr = SVR()
svr.fit(X_train, Y_train)
print(f"train accuracy:{svr.score(X_train, Y_train)}")
print(f"test accuracy:{svr.score(X_test, Y_test)}")
cv = KFold(n_splits=3, shuffle=True, random_state=0)
param = {
    'C': [1, 10, 100],
    'gamma': ["scale",0.01, 0.1, 0.2, 0.5, 1.0],
    'epsilon': [0.01, 0.1, 0.2]
}

gridcv = GridSearchCV(svr, param_grid=param, n_jobs=-1, cv=cv)
gridcv.fit(X_train, Y_train)
print(f"score: {gridcv.best_params_}")
svr = SVR()
svr.fit(X_train, Y_train)
print(f"train accuracy:{svr.score(X_train, Y_train)}")
print(f"test accuracy:{svr.score(X_test, Y_test)}")

train accuracy:0.1632964275114065
test accuracy:0.1369885101142122
score: {'C': 100, 'epsilon': 0.01, 'gamma': 1.0}
train accuracy:0.1632964275114065
test accuracy:0.1369885101142122
