## Regression

In [2]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [3]:
!ls ./dataset_0914/

[31mBostonHousing.csv[m[m             [31miris.csv[m[m
[31mBostonHousing_description.txt[m[m [31mliver.csv[m[m
[31mPimaIndiansDiabetes.csv[m[m       [31mprestige.csv[m[m
[31mcars.csv[m[m                      [31mucla_admit.csv[m[m
[31mcredit.csv[m[m


In [4]:
path = './dataset_0914'

In [5]:
boston_hs = pd.read_csv(f"{path}/BostonHousing.csv")
boston_hs

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [6]:
lstat = np.array(boston_hs['lstat']).reshape(506, 1)
medv = np.array(boston_hs['medv']).reshape(506, 1)

model = LinearRegression()

model.fit(lstat, medv)

test_predict = model.predict([[2.0], [3.0], [4.0], [5.0]])
test_predict

array([[32.65374217],
       [31.70369282],
       [30.75364346],
       [29.80359411]])

In [7]:
print(f"medv = {float(model.coef_)} * lstat + {float(model.intercept_)}")

medv = -0.9500493537579906 * lstat + 34.5538408793831


In [8]:
pred_y = model.predict(lstat)
print(mean_squared_error(medv, pred_y))

38.48296722989415


In [9]:
train_X, test_X, train_y, test_y = train_test_split(lstat, medv, test_size=0.2, random_state=20)

In [10]:
model = LinearRegression()

model.fit(train_X, train_y)

pred_y = model.predict(test_X)
pred_y

array([[22.44474592],
       [24.69597798],
       [21.30463698],
       [14.19344902],
       [12.42531397],
       [26.94721004],
       [12.61855277],
       [ 5.37209765],
       [22.32880264],
       [24.41578171],
       [25.79743916],
       [17.92295792],
       [28.98587941],
       [23.3529683 ],
       [25.45927125],
       [29.35303314],
       [27.34334958],
       [17.5074945 ],
       [17.20797435],
       [20.75390639],
       [31.90378534],
       [32.14533384],
       [26.41580333],
       [26.60904213],
       [21.87469145],
       [29.15013239],
       [25.014822  ],
       [28.29021972],
       [22.62832278],
       [28.58007792],
       [15.81665497],
       [21.27565116],
       [21.140384  ],
       [ 6.86003644],
       [29.7781585 ],
       [24.65733022],
       [29.98105925],
       [29.4399906 ],
       [24.7249638 ],
       [31.03421073],
       [12.37700427],
       [16.80217287],
       [18.29977359],
       [19.33360119],
       [25.74912946],
       [25

In [11]:
X = boston_hs[['lstat', 'ptratio', 'tax', 'rad']]
y = boston_hs['medv']

model = LinearRegression()

model.fit(X, y)


print(f"Weight: {model.coef_}")
print(f"Bias: {model.intercept_}")

Weight: [-0.81405588 -1.23038839 -0.01512154  0.33174887]
Bias: 58.545748661239884


In [12]:
test_X = [[2.0, 14, 296, 1], [3.0, 15, 222, 2], [4.0, 15, 250, 3]]

test_y = model.predict(test_X)
test_y

array([35.5479738 , 34.95427204, 34.04856204])

In [13]:
test_y = model.predict(X)
print(mean_squared_error(y, test_y))

31.801263706719894


---

## UCLA_admit

In [14]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [15]:
ucla_data = pd.read_csv(f"{path}/ucla_admit.csv")

ucla_X = ucla_data[['gre', 'gpa', 'rank']]
ucla_y = ucla_data['admit']

trn_X, test_X, trn_y, test_y = train_test_split(ucla_X, ucla_y, test_size=0.3, random_state=1234)

model = LogisticRegression()
model.fit(trn_X, trn_y)

LogisticRegression()

In [16]:
print(accuracy_score(trn_y, model.predict(trn_X)))
print(accuracy_score(test_y, model.predict(test_X)))

0.6714285714285714
0.7416666666666667


In [17]:
X = [[400, 3.5, 5], [550, 3.8, 2], [700, 4.0, 2]]
predict = model.predict(X)
predict

array([0, 0, 0])

In [18]:
ucla_X = ucla_data[['gre', 'gpa']]
ucla_y = ucla_data['admit']

trn_X, test_X, trn_y, test_y = train_test_split(ucla_X, ucla_y, test_size=0.3, random_state=1234)

model = LogisticRegression()
model.fit(trn_X, trn_y)

LogisticRegression()

In [19]:
print(accuracy_score(trn_y, model.predict(trn_X)))
print(accuracy_score(test_y, model.predict(test_X)))

0.625
0.825


---

## KNN, Classification

In [20]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler

In [21]:
path = './dataset_0914'
boston_hs = pd.read_csv(f"{path}/BostonHousing.csv")

In [22]:
bs_data = boston_hs[['indus', 'dis', 'medv']]
bs_data

Unnamed: 0,indus,dis,medv
0,2.31,4.0900,24.0
1,7.07,4.9671,21.6
2,7.07,4.9671,34.7
3,2.18,6.0622,33.4
4,2.18,6.0622,36.2
...,...,...,...
501,11.93,2.4786,22.4
502,11.93,2.2875,20.6
503,11.93,2.1675,23.9
504,11.93,2.3889,22.0


In [23]:
scaler = StandardScaler()
scaler.fit(bs_data)
BH = scaler.transform(bs_data)
BH[:5]

array([[-1.2879095 ,  0.1402136 ,  0.15968566],
       [-0.59338101,  0.55715988, -0.10152429],
       [-0.59338101,  0.55715988,  1.32424667],
       [-1.30687771,  1.07773662,  1.18275795],
       [-1.30687771,  1.07773662,  1.48750288]])

In [24]:
from sklearn.cluster import KMeans

BH_data = BH[:500]
BH_kmeans = KMeans(n_clusters=5, random_state=123).fit(BH_data)
BH_data = np.hstack((BH_data, BH_kmeans.labels_.reshape(-1, 1)))
BH_data

array([[-1.2879095 ,  0.1402136 ,  0.15968566,  2.        ],
       [-0.59338101,  0.55715988, -0.10152429,  2.        ],
       [-0.59338101,  0.55715988,  1.32424667,  1.        ],
       ...,
       [-0.21109853, -0.42894587, -0.46068796,  2.        ],
       [-0.21109853, -0.65883438, -0.14505928,  2.        ],
       [-0.21109853, -0.66320778, -0.54775795,  2.        ]])

In [25]:
# Center of cluseters
BH_kmeans.cluster_centers_

array([[ 1.17486908, -0.83795852, -0.7256677 ],
       [-1.04746396,  0.03299445,  1.61437165],
       [-0.40576204,  0.08002824, -0.12606471],
       [ 1.12397199, -1.0298077 ,  2.89477147],
       [-1.01177187,  1.71875715,  0.16656619]])

In [26]:
BH_test = BH[-5:]
pr = BH_kmeans.predict(BH_test)
BH_test = np.hstack((BH_test, pr.reshape(-1, 1)))
BH_test

array([[ 0.11573841, -0.62579623, -0.01445431,  2.        ],
       [ 0.11573841, -0.71663927, -0.21036176,  2.        ],
       [ 0.11573841, -0.77368357,  0.14880191,  2.        ],
       [ 0.11573841, -0.66843684, -0.0579893 ,  2.        ],
       [ 0.11573841, -0.61324648, -1.15724782,  0.        ]])

In [27]:
BH_before_scale = bs_data[:500]
BH_before_scale_label = pd.DataFrame({"label": BH_kmeans.labels_})
BH_before_scale = pd.concat([BH_before_scale, BH_before_scale_label], axis=1)
BH_before_scale
BH_before_scale.groupby('label').mean()

Unnamed: 0_level_0,indus,dis,medv
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,19.188827,2.032289,15.865363
1,3.95791,3.864451,37.365672
2,8.35586,3.963392,21.374522
3,18.84,1.62871,49.13
4,4.202529,7.410669,24.063218


## PimaIndianDiabetes

In [28]:
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

In [30]:
pid_data = pd.read_csv(f"{path}/PimaIndiansDiabetes.csv")
pid_data

pid_y = pid_data['diabetes']
pid_X = pid_data.drop(['diabetes'], axis=1)


scaler = StandardScaler()
scaler.fit(pid_X)
pid_X_scaled = scaler.transform(pid_X)
train_X, test_X, train_y, test_y = \
    train_test_split(pid_X, pid_y, test_size=0.3,\
                     random_state=123) 

model =  KNeighborsClassifier(n_neighbors=5)

model.fit(train_X, train_y)

tr_y = model.predict(train_X)

tr_acc = accuracy_score(train_y, tr_y)

print('Train Accuracy : {0:3f}'.format(tr_acc))

pred_y = model.predict(test_X)

acc = accuracy_score(test_y, pred_y)
print('Test Accuracy : {0:3f}'.format(acc))


f_acc = f1_score(test_y, pred_y, average='macro')
print('F1 score: {0:3f}'.format(f_acc))

pr_acc = precision_score(test_y, pred_y, average='macro')
print('Precision score: {0:3f}'.format(pr_acc))

recall_acc = recall_score(test_y, pred_y, average='macro')
print('Recall score: {0:3f}'.format(recall_acc))

Train Accuracy : 0.811918
Test Accuracy : 0.727273
F1 score: 0.701397
Precision score: 0.711538
Recall score: 0.696678


In [33]:
acc_list = []

for i in range(1, 11):
    model = KNeighborsClassifier(n_neighbors=i)
    model.fit(train_X, train_y)
    pred_y = model.predict(test_X)
    acc = accuracy_score(test_y, pred_y)
    acc_list.append(acc)
print(acc_list)
print(max(acc_list))

[0.696969696969697, 0.70995670995671, 0.70995670995671, 0.70995670995671, 0.7272727272727273, 0.70995670995671, 0.7489177489177489, 0.7272727272727273, 0.7748917748917749, 0.7532467532467533]
0.7748917748917749


In [34]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

In [39]:
pid_data = pd.read_csv(f"{path}/PimaIndiansDiabetes.csv")
pid_y = pid_data['diabetes']
pid_X = pid_data.drop(['diabetes'], axis=1)

In [49]:
kf = KFold(n_splits=10, random_state=123, shuffle=True)
model = KNeighborsClassifier(n_neighbors=5)
acc = []
fold = 0

for train_index, test_index in kf.split(pid_X):
    # Split 10 times from KFold
    print("fold:", fold)
    
    train_X, test_X = pid_X.iloc[train_index], pid_X.iloc[test_index]
    train_y, test_y = pid_y.iloc[train_index], pid_y.iloc[test_index]

    # Train the model using the training sets
    model.fit(train_X, train_y)

    # Make predictions using the testing set
    pred_y = model.predict(test_X)
    #print(pred_y)

    # model evaluation: accuracy #############
    accuracy = accuracy_score(test_y, pred_y)
    print('Accuracy : {0:3f}'.format(accuracy))
    acc.append(accuracy)
    fold += 1

print("10 fold :", acc)
print("mean accuracy :", np.mean(acc))


fold: 0
Accuracy : 0.792208
fold: 1
Accuracy : 0.701299
fold: 2
Accuracy : 0.714286
fold: 3
Accuracy : 0.610390
fold: 4
Accuracy : 0.727273
fold: 5
Accuracy : 0.727273
fold: 6
Accuracy : 0.714286
fold: 7
Accuracy : 0.740260
fold: 8
Accuracy : 0.644737
fold: 9
Accuracy : 0.763158
10 fold : [0.7922077922077922, 0.7012987012987013, 0.7142857142857143, 0.6103896103896104, 0.7272727272727273, 0.7272727272727273, 0.7142857142857143, 0.7402597402597403, 0.6447368421052632, 0.7631578947368421]
mean accuracy : 0.7135167464114833


In [51]:
kf = KFold(n_splits=10, random_state=123, shuffle=True)
model = KNeighborsClassifier(n_neighbors=5)
acc = []
fold = 0

for train_index, test_index in kf.split(pid_X):
    # Split 10 times from KFold
    print("fold:", fold)
    
    train_X, test_X = pid_X.iloc[train_index], pid_X.iloc[test_index]
    train_y, test_y = pid_y.iloc[train_index], pid_y.iloc[test_index]

    # Train the model using the training sets
    model.fit(train_X, train_y)

    # Make predictions using the testing set
    pred_y = model.predict(test_X)
    #print(pred_y)

    # model evaluation: accuracy #############
    accuracy = accuracy_score(test_y, pred_y)
    print('Accuracy : {0:3f}'.format(accuracy))
    acc.append(accuracy)
    fold += 1

print(f"highest test accuracy: {max(acc)}")
print(f"fold: {acc.index(max(acc))}")

fold: 0
Accuracy : 0.792208
fold: 1
Accuracy : 0.701299
fold: 2
Accuracy : 0.714286
fold: 3
Accuracy : 0.610390
fold: 4
Accuracy : 0.727273
fold: 5
Accuracy : 0.727273
fold: 6
Accuracy : 0.714286
fold: 7
Accuracy : 0.740260
fold: 8
Accuracy : 0.644737
fold: 9
Accuracy : 0.763158
highest test accuracy: 0.7922077922077922
fold: 0
