Bryan Chen

Machine Learning HW 2

In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the per-state table. Column 'train' holds the 2004 election result and
# column 'test' the 2000 result; 'obesity' is the single feature used below.
names = ['obesity', 'train', 'test', 'state']
df = pd.read_csv('obesity-election.data.txt', names=names, header=None)

# The same obesity array serves as both training and test input (the two names
# deliberately refer to one object); only the label column differs.
x_test = np.array(df['obesity'])
x_train = x_test
y_train, y_test = np.array(df['train']), np.array(df['test'])

# Filled in later by the hand-written KNN classifier.
predictions = []

In [None]:
# Candidate neighbourhood sizes: the odd numbers 1, 3, ..., 49.
neighbors = list(range(1, 50, 2))
cv_scores = []

# Score every candidate k with leave-one-out cross-validation on the
# single-feature training data (reshaped into a column for scikit-learn).
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(
        knn,
        x_train.reshape(-1, 1),
        y_train,
        cv=LeaveOneOut(),
        scoring='accuracy',
    )
    cv_scores.append(scores.mean())

# Despite its name, MSE holds the misclassification error (1 - accuracy).
MSE = [1 - score for score in cv_scores]

# First k that reaches the highest mean accuracy.
optimal_k = neighbors[int(np.argmax(cv_scores))]
print('The optimal number of neighbors is', optimal_k)

plt.plot(neighbors, MSE)
plt.xlabel('Number of Neighbors K')
plt.ylabel('Misclassification Error')
plt.show()

The optimal number of neighbors is 17. The cross-validation score for the optimal number is
0.68627 and the misclassification error is 0.31373. The accuracy of the classifier is 68%. Obesity
rate is not a good factor to predict the election result, and other factors should be taken into
consideration for a good model.

In [None]:
def predict(x_train, y_train, x_test, k):
    distances, targets = [], []

    for i in range(len(x_train)):
        distance = np.sqrt(np.sum(pow(x_test - x_train[i], 2)))
        distances.append([distance, i])
    distances.sort()

    for i in range(k):
        index = distances[i][1]
        targets.append(y_train[index])

    return Counter(targets).most_common(1)[0][0]


def k_nearest_neighbor(x_train, y_train, x_test, predictions, k):
    """Append one predicted label to ``predictions`` per sample in ``x_test``.

    Args:
        x_train: Training samples handed unchanged to ``predict``.
        y_train: Training labels handed unchanged to ``predict``.
        x_test: Indexable collection of query samples.
        predictions: List that receives the predicted labels, in the order
            of ``x_test``. It is modified in place; nothing is returned.
        k: Number of neighbours used for each vote.
    """
    # There is no fitting step: every query is classified directly against
    # the stored training data.
    n_queries = len(x_test)
    for query_index in range(n_queries):
        label = predict(x_train, y_train, x_test[query_index], k)
        predictions.append(label)

In [None]:
# Collect predictions in a fresh list on every run. Appending to the shared
# `predictions` name made this cell fail when re-run, because the name is
# rebound to an ndarray (which has no .append) at the end of the first run.
knn_predictions = []
try:
    k_nearest_neighbor(x_train=x_train, y_train=y_train, x_test=x_test, predictions=knn_predictions, k=optimal_k)
except ValueError:
    print('Can\'t have more neighbors than training samples!!')
else:
    # Scoring sits outside the try so that an unrelated ValueError from
    # accuracy_score is not reported as a "too many neighbors" problem.
    predictions = np.asarray(knn_predictions)
    accuracy = accuracy_score(y_test, predictions) * 100
    print('The accuracy of KNN classifier is %d%%' % accuracy)

State obesity_rate 2004_result prediction 2000_result (GT)
Alabama 0.301 R R R
Alaska 0.273 R R R
Arizona 0.233 R D R
Arkansas 0.281 R R R
California 0.231 D D D
Colorado 0.21 R D R
Connecticut 0.208 D D D
Delaware 0.221 D D D
D.C. 0.259 D R D
Florida 0.233 R D R
Georgia 0.275 R R R
Hawaii 0.207 D D D
Idaho 0.246 R D R
Illinois 0.253 D R D
Indiana 0.275 R R R
Iowa 0.263 R R D
Kansas 0.258 R R R
Kentucky 0.284 R R R
Louisiana 0.295 R R R
Maine 0.237 D D D
Maryland 0.252 D D D
Massachusetts 0.209 D D D
Michigan 0.277 D R D
Minnesota 0.248 D D D
Mississippi 0.344 R R R
Missouri 0.274 R R R
Montana 0.217 R D R
Nebraska 0.265 R R R
Nevada 0.236 R D R
New Hampshire 0.236 D D R
New Jersey 0.229 D D D
New Mexico 0.233 R D D
New York 0.235 D D D
North Carolina 0.271 R R R
North Dakota 0.259 R R R
Ohio 0.269 R R R
Oklahoma 0.281 R R R
Oregon 0.25 D D D
Pennsylvania 0.257 D R D
Rhode Island 0.214 D D D
South Carolina 0.292 R R R
South Dakota 0.261 R R R
Tennessee 0.29 R R R
Texas 0.272 R R R
Utah 0.218 R D R
Vermont 0.211 D D D
Virginia 0.252 R D R
Washington 0.245 D D D
West Virginia 0.306 R R R
Wisconsin 0.255 D R D
Wyoming 0.24 R D R

In [None]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
import graphviz

names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']

# Load the iris measurements; the file has no header row, so column names are
# supplied explicitly.
df = pd.read_csv('iris.data.txt', header=None, names=names)
df.head()  # not displayed: only a cell's last expression is rendered

predictions = []

# Create design matrix X (the four measurement columns) and target vector y.
# `.iloc` replaces `.ix`, which was removed in pandas 1.0; the end index of a
# positional slice is exclusive, so 0:4 selects columns 0..3.
X = np.array(df.iloc[:, 0:4])
y = np.array(df['class'])

# Entropy criterion so that splits are chosen by information gain.
clf = DecisionTreeClassifier(criterion="entropy")
clf = clf.fit(X, y)

The petal_width attribute was used as the first decision node of this tree generated by the
decision tree classifier.

In [None]:
# Labels shown in the rendered tree: feature names in design-matrix column
# order, class names in the order passed to export_graphviz.
feature_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
class_names = ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]

# out_file=None makes export_graphviz return the DOT source as a string.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=feature_names,
    class_names=class_names,
    filled=True,
    rounded=True,
    special_characters=True,
)

# Render the DOT source to a file named "DecisionTree".
graph = graphviz.Source(dot_data)
graph.render("DecisionTree")

Information gain calculations:

E(class) 
= -P(Iris-setosa)log_2 P(Iris-setosa) - P(Iris-versicolor)log_2 P(Iris-versicolor) - P(Iris-virginica)log_2 P(Iris-virginica)
= -((50/150)log_2(50/150)) * 3 
= 1.5850

E(class, pw0.8)
= P(low)E(low) + P(high)E(high)
= (50/150) * [-((50/50)log_2(50/50))-0-0]+ (100/150) * [0-((50/100)log_2(50/100))-
((50/100)log_2(50/100))]
= 0.6667

G(class, pw0.8)= 1.5850-0.6667 = 0.918

E(class, pw1.75)
= P(low)E(low) + P(high)E(high)
= (104/150) * [-((50/104)log_2(50/104))-((49/104)log_2(49/104))-((5/104)log_2(5/104))] +
(46/150) * [0-((1/46)log_2(1/46))-((45/46)log_2(45/46))]
= 0.8992

G(class, pw1.75)= 1.5850-0.8992= 0.686

E(class, pl4.95)
= P(low)E(low) + P(high)E(high)
= (104/150) * [-((50/104)log_2(50/104)) -((48/104)log_2(48/104)) - ((6/104)log_2(6/104))] +
(46/150) * [0-((2/46)log_2(2/46))-((44/46)log_2(44/46))]
= 0.9529

G(class, pl4.95)= 1.5850-0.9529= 0.632

E(class, pw1.65)
= P(low)E(low) + P(high)E(high)
= (102/150) * [-((50/102)log_2(50/102))-((48/102)log_2(48/102))-((4/102)log_2(4/102))] +
(48/150) * [0-((2/48)log_2(2/48))-((46/48)log_2(46/48))]
= 0.8954

G(class, pw1.65)= 1.5850-0.8954= 0.690

E(class, pw1.55)
= P(low)E(low) + P(high)E(high)
= (98/150) * [-((50/98)log_2(50/98))-((45/98)log_2(45/98))-((3/98)log_2(3/98))] + (52/150) * 
[0-((5/52)log_2(5/52))-((47/52)log_2(47/52))]
= 0.9194

G(class, pw1.55)= 1.5850-0.9194 = 0.666

E(class, sl6.95)
= P(low)E(low) + P(high)E(high)
= (137/150) * [-((50/137)log_2(50/137))-((49/137)log_2(49/137))-((38/137)log_2(38/137))] +
(13/150) * [0-((1/13)log_2(1/13))-((12/13)log_2(12/13))]
= 1.4719

G(class, sl6.95)= 1.5850-1.4719= 0.113

E(class, pl4.85)
= P(low)E(low) + P(high)E(high)
= (99/150) * [-((50/99)log_2(50/99))-((46/99)log_2(46/99))-((3/99)log_2(3/99))] + (51/150) *
[0-((4/51)log_2(4/51))-((47/51)log_2(47/51))]
= 0.9034

G(class, pl4.85)= 1.5850-0.9034 = 0.6816

E(class, pl5.95)
= P(low)E(low) + P(high)E(high)
= (83/150) * [-((50/83)log_2(50/83))-((26/83)log_2(26/83))-((7/83)log_2(7/83))] + (67/150) *
[0-((24/67)log_2(24/67))-((43/67)log_2(43/67))]
= 1.1209

G(class, pl5.95)= 1.5850-1.1209 = 0.4641

The best split is the first one, which tests whether petal_width is smaller than 0.8. Its
information gain is 0.918, which is close to 1 and is the highest among the 8 calculations
above.

In [None]:
from numpy import polyfit, polyval
from sklearn.metrics import mean_squared_error
from math import log

# Ten inputs drawn uniformly from [0, 1) and ten Gaussian noise terms
# (mean 0, standard deviation 0.5).
x = np.random.uniform(low=0, high=1, size=10)
noise = np.random.normal(loc=0, scale=0.5, size=10)

# Noisy targets y = x^2 + 0.1 x + noise, computed for all samples at once and
# kept as a plain list with one entry per sample.
y = list(x ** 2 + 0.1 * x + noise)

import numpy as np
from numpy import polyfit
from numpy import polyval
from sklearn.metrics import mean_squared_error
from math import log
import matplotlib.pyplot as plt
import pandas as pd

# Problem 4: fit polynomials of degree m = 0..8 to ten noisy samples of
# y = x^2 + 0.1 x and compare empirical risk with penalized risk.
X = np.random.uniform(0, 1, 10)
y = np.zeros(shape=(10,))
mu = 0
sigma = 0.5
for i in range(len(X)):
    # Draw a scalar noise term; drawing a size-1 array and assigning it to a
    # single array slot is deprecated in recent NumPy versions.
    theta = np.random.normal(mu, sigma)
    y[i] = np.square(X[i]) + 0.1 * X[i] + theta

n_samples = len(X)
xp = np.linspace(0, 1, 100)   # dense grid for drawing the fitted curve
rows = []
for i in range(9):
    pi = polyfit(X, y, i)
    MSE = mean_squared_error(y, polyval(pi, X))   # empirical risk Remp
    # Penalization factor r = 1 + p (1 - p)^-1 ln(n), where p is the ratio of
    # degrees of freedom (m + 1 coefficients) to the sample size. It was
    # previously computed from MSE, which made r almost independent of the
    # model complexity it is supposed to penalize.
    p = (i + 1) / n_samples
    r = 1 + p * (1 - p) ** -1 * log(n_samples)
    Rpen = r * MSE
    rows.append(str(i)+','+str(i+1)+','+'%.3f'%MSE+','+
                '%.3f'%r+','+'%.3f'%Rpen+'\r\n')

    # Select the figure before drawing; creating it after the plot calls
    # left an extra empty figure in the output.
    plt.figure(i+1)
    plt.title("DoF = "+ str(i))
    plt.xlabel('Samples')
    plt.ylabel('Outputs')
    plt.plot(X, y, "o")
    _ = plt.plot(xp, polyval(pi, xp), '-')
    plt.show()

# The with-block closes the file even if a write fails; newline='' keeps the
# explicit '\r\n' terminators from being translated a second time.
with open('problem4.csv', 'w', newline='') as f:
    f.write('m'+','+'DoF'+','+'Remp'+','+'r'+','+'Rpen'+'\r\n')
    f.writelines(rows)

print(pd.read_csv('problem4.csv'))

In [None]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge


# Problem 5: nested resampling. The outer 5-fold split estimates prediction
# error; the inner leave-one-out loop picks the polynomial degree m.
kf = KFold(n_splits = 5)
loo = LeaveOneOut()

optim_data = []

# Split X, the inputs that the current y was generated from. The lowercase
# `x` comes from a separate random draw and does not correspond to y.
for train_idx, test_idx in kf.split(X):
    x_train, x_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    min_error = None
    best_m = None
    for m in range(0,7):
        # Sum of absolute leave-one-out validation errors for degree m.
        error = 0
        for train_index,validation_index in loo.split(x_train):
            x_trn, x_val = x_train[train_index], x_train[validation_index]
            y_trn, y_val = y_train[train_index], y_train[validation_index]
            coefficient = np.polyfit(x_trn,y_trn,m)
            # np.polyval evaluates the fitted polynomial; no poly_predict
            # helper is defined in this notebook.
            error += abs(y_val[0] - np.polyval(coefficient, x_val[0]))
        # Track the running minimum. Comparing only against the m == 0 error
        # selected the last degree that beat the constant model rather than
        # the degree with the smallest error.
        if min_error is None or error < min_error:
            min_error = error
            best_m = m

    # Refit the selected degree on the whole training fold instead of reusing
    # the coefficients from the final leave-one-out split.
    best_model = np.polyfit(x_train, y_train, best_m)
    # Mean squared error on the held-out fold.
    test_error = np.divide(np.sum(np.square(np.polyval(best_model, x_test) - y_test)),len(x_test))
    optim_data.append([test_idx,best_m,test_error])

# Write one row per outer fold: fold number, selected degree, and the value
# stored as item[2] (labelled 'Prediction accuracy' in the header).
# The with-block closes the file even if a write fails; newline='' keeps the
# explicit '\r\n' terminators from being translated a second time.
with open('problem5.csv', 'w', newline='') as file_p5:
    file_p5.write('Fold #'+','+'Optimal m'+','+'Prediction accuracy'+'\r\n')
    for (idx,item) in enumerate(optim_data):
        file_p5.write(str(idx)+','+str(item[1])+','+'%.3f'%item[2]+'\r\n')

print(pd.read_csv('problem5.csv'))