In [5]:
import csv
import numpy as np

def import_data(filename):
    """Load a comma-separated file with one header row into numpy arrays.

    Parameters
    ----------
    filename : str
        Path of the CSV file. The first row is treated as column labels and
        every following row must contain only values parseable as floats.

    Returns
    -------
    labels : np.ndarray
        The header row (all column names, including the target column).
    X : np.ndarray
        All columns except the last one, as floats.
    y : np.ndarray
        The last column, as floats.
    """
    # The context manager guarantees the handle is closed even if parsing
    # fails; newline='' is what the csv module expects for file objects.
    with open(filename, 'rt', newline='') as raw_data:
        reader = csv.reader(raw_data, delimiter=',', quoting=csv.QUOTE_NONE)
        # csv.reader yields [] for blank lines (e.g. a trailing newline);
        # keeping them would make the array ragged and break the float cast.
        rows = [row for row in reader if row]
    labels = np.array(rows[0])
    data = np.array(rows[1:]).astype('float')
    X = data[:,:-1]
    y = data[:,-1]
    return labels,X,y

def square_error(a,b):
    """Return the mean squared error between predictions `a` and targets `b`.

    Both arguments may be numpy arrays, plain Python sequences, or scalars;
    they are converted to float arrays first so that list inputs (which do not
    support elementwise subtraction on their own) work as well.

    The sum of squared differences is divided by the length of the first axis
    of the difference.
    """
    err = np.atleast_1d(np.asarray(a, dtype=float) - np.asarray(b, dtype=float))
    err_avg = err*err/err.shape[0]
    return np.sum(err_avg)

# Read the training and test splits; each call yields (header labels, features, targets).
tr_labels, tr_X, tr_y = import_data("train.csv")
tst_labels, tst_X, tst_y = import_data("test.csv")
# Discard the first feature column of both splits (presumably a row identifier -- verify
# against the CSV header). The label arrays are left untouched.
tr_X, tst_X = tr_X[:,1:], tst_X[:,1:]

In [6]:
def linear_regression(X,y):
    """Fit ordinary least squares by solving the normal equations.

    Solves (X^T X) w = X^T y for w and returns the weight vector. No intercept
    column is added. np.linalg.solve raises numpy.linalg.LinAlgError when
    X^T X is singular.
    """
    X_t = np.transpose(X)
    gram = X_t @ X
    moment = X_t @ y
    return np.linalg.solve(gram, moment)

def lasso(X,y,lmd):
    """Solve the L2-regularized normal equations (X^T X + lmd * I) w = X^T y.

    Note: despite the function name, adding lmd times the identity to the Gram
    matrix is the closed form of ridge regression, not of the L1 lasso.

    X is the (samples x features) design matrix, y the target vector and lmd
    the regularization strength. Returns the weight vector w.
    """
    X_t = np.transpose(X)
    n_features = X.shape[1]
    regularized_gram = X_t @ X + lmd * np.eye(n_features)
    return np.linalg.solve(regularized_gram, X_t @ y)

def lasso_CV(X,y,lmd_lst=None,seed=None):
    """Choose the regularization strength for `lasso` on a single hold-out split.

    The rows of X/y are shuffled and split 80% / 20% into a fitting part and a
    validation part. For every candidate strength the model is fitted on the
    80% part and scored with `square_error` on the 20% part.

    Parameters
    ----------
    X, y : np.ndarray
        Design matrix and targets (indexed with integer arrays, so they must
        be numpy arrays).
    lmd_lst : sequence of float, optional
        Candidate strengths. Defaults to the geometric grid 100 / 1.1**i for
        i = 0..19.
    seed : int, optional
        Seed for the shuffle. When omitted the global numpy random state is
        used, so repeated calls may select different strengths.

    Returns
    -------
    min_lmd : float
        The candidate with the smallest validation error.
    err_lst : list
        Validation error for every candidate, in the order of `lmd_lst`.
    """
    n_samples = X.shape[0]
    n_trn = int(0.8*n_samples)

    if seed is None:
        perm = np.random.permutation(n_samples)
    else:
        # A private RandomState keeps the seeded split from disturbing the
        # global random stream.
        perm = np.random.RandomState(seed).permutation(n_samples)

    perm_trn = perm[:n_trn]
    perm_val = perm[n_trn:]
    X_trn = X[perm_trn]
    y_trn = y[perm_trn]
    X_val = X[perm_val]
    y_val = y[perm_val]

    if lmd_lst is None:
        lmd_lst = [100/pow(1.1,i) for i in range(20)]
    else:
        lmd_lst = list(lmd_lst)

    err_lst = [square_error(np.matmul(X_val,lasso(X_trn,y_trn,l)),y_val) for l in lmd_lst]
    min_lmd = lmd_lst[np.argmin(err_lst)]
    return min_lmd,err_lst

# Baseline: unregularized least squares, scored on the test split.
w = linear_regression(tr_X,tr_y)
y_pred = np.matmul(tst_X,w)
sqr_err = square_error(y_pred,tst_y)
print("Linear Regression:")
print(sqr_err)

# Select the regularization strength on a random hold-out part of the training data.
lmd,err_lst = lasso_CV(tr_X,tr_y)

#uncomment the following line to print the optimal value of lambda chosen
#print(lmd)
#print(err_lst)

# Refit on the full training split with the selected strength and score on the test split.
w = lasso(tr_X,tr_y,lmd)
y_pred = np.matmul(tst_X,w)
sqr_err = square_error(y_pred,tst_y)
print("Ridge Regression:")
print(sqr_err)

Linear Rigression:
0.005051850193760437
Ridge Regression:
0.00515486083368509


In [7]:
from sklearn.linear_model import LogisticRegression

#function that maps a real number to the closest integer
def closest_integer(num):
    """Round `num` to the nearest integer and return it as an int.

    The rounding is done on the magnitude and the sign is restored afterwards.
    Because the comparison is strict, exact halves stay at the lower
    magnitude, i.e. ties are rounded toward zero (2.5 -> 2, -2.5 -> -2).
    """
    negative = num < 0
    magnitude = -num if negative else num

    lower = int(magnitude)
    dist_down = magnitude - lower
    dist_up = lower + 1 - magnitude
    nearest = lower + 1 if dist_down > dist_up else lower

    return -nearest if negative else nearest

############################################################

#logistic regression with two classes
#binarize the training targets: 0 when below the 0.85 threshold, 1 otherwise
binary_tr_y = [0 if target < 0.85 else 1 for target in tr_y]

log_reg = LogisticRegression(solver='lbfgs', max_iter = 2000)
log_reg = log_reg.fit(tr_X,binary_tr_y)
pred = log_reg.predict_proba(tst_X)

#the test targets are binarized with the same threshold
binary_tst_y = [0 if target < 0.85 else 1 for target in tst_y]


# Uncomment the following line to print the fraction of correct classifications
#print(log_reg.score(tst_X,binary_tst_y))

############################################################

#Logistic Regression with more than 2 classes
#alph is the quantization resolution: every target t becomes the integer class label
#closest_integer(t*alph), and a label k is mapped back to the value k/alph.
#The number of distinct classes therefore depends on the range of the targets.
#It is set to 4, but change it to see how it affects the error.
alph = 4
mult_tr_y = [closest_integer(target*alph) for target in tr_y]

# NOTE(review): the multi_class argument is reported as deprecated in recent
# scikit-learn releases -- confirm against the installed version.
log_reg = LogisticRegression(solver='sag', max_iter = 7500, multi_class='multinomial').fit(tr_X,mult_tr_y)
pred = log_reg.predict_proba(tst_X)

mult_tst_y = [closest_integer(target*alph) for target in tst_y]

#Uncomment the following lines to print the fraction of correct classifications
#print("Fraction classified correctly:")
#print(log_reg.score(tst_X,mult_tst_y))

# The columns of predict_proba are ordered like log_reg.classes_, so the argmax
# is a column index and not the class label itself. Translate it through
# classes_ before scaling back; using the raw index shifts every prediction
# whenever the smallest label seen during training is not 0.
pred_max = log_reg.classes_[np.argmax(pred, axis=1)] / alph

print("Square error for logistic regression with more than two classes =")
print(square_error(pred_max,tst_y))

############################################################

#Decision Tree with more than 2 classes
from sklearn.tree import DecisionTreeClassifier
# Fit an entropy-criterion tree on the quantized class labels.
dtree = DecisionTreeClassifier(criterion = 'entropy').fit(tr_X,mult_tr_y)
y_pred = dtree.predict(tst_X)

#Uncomment the following line to print the fraction of correct classifications
#print(dtree.score(tst_X,mult_tst_y))

# predict() returns class labels, so dividing by alph maps them back to target values.
pred_avg = [label/alph for label in y_pred]
print("square error for decision trees with more than two classes =")
print(square_error(pred_avg,tst_y))

Logistic regression with more than two classes:
0.0637945
Decision tree with more than two classes:
0.005094499999999999


In [10]:
#regression decision tree
from sklearn.tree import DecisionTreeRegressor

# Fit a regression tree with default settings on the raw (continuous) targets.
dtree_reg = DecisionTreeRegressor().fit(tr_X,tr_y)

y_pred = dtree_reg.predict(tst_X)

err = square_error(y_pred,tst_y)
print('Square error for regression decision trees =',err)

#Uncomment the following line to print the R2 score
#print(dtree_reg.score(tst_X,tst_y))

0.0


In [11]:
#regression decision tree with random splits
from sklearn.tree import DecisionTreeRegressor

# Same regression tree, but split candidates are chosen with splitter='random'.
dtree_reg = DecisionTreeRegressor(splitter = 'random').fit(tr_X,tr_y)

y_pred = dtree_reg.predict(tst_X)

err = square_error(y_pred,tst_y)
print('Square error for regression decision trees with random splits =',err)

#Uncomment the following line to print the R2 score
#print(dtree_reg.score(tst_X,tst_y))

9.244463733058732e-35


In [12]:
#lasso regression (l_1 regularizer)
from sklearn.linear_model import LassoCV

# L1-regularized regression; the strength is picked by 5-fold cross-validation.
lasso_reg = LassoCV(cv=5).fit(tr_X,tr_y)

y_pred = lasso_reg.predict(tst_X)

err = square_error(y_pred,tst_y)
print('Square error for LASSO =',err)

#Uncomment the following line to print the R2 score
#print(lasso_reg.score(tst_X,tst_y))

0.0040237137591559585


In [13]:
#ridge regression (l_2 regularizer)
from sklearn.linear_model import RidgeCV

# L2-regularized regression; candidate strengths form the geometric grid
# 1000 / 1.1**k for k = 0..29.
ridge_reg = RidgeCV(alphas = [1000/1.1**k for k in range(30)]).fit(tr_X,tr_y)

y_pred = ridge_reg.predict(tst_X)

err = square_error(y_pred,tst_y)
print('Square error for RIDGE regression =',err)

#Uncomment the following line to print the R2 score
#print(ridge_reg.score(tst_X,tst_y))

0.004398690164702783


In [40]:
#random forests regression
from sklearn.ensemble import RandomForestRegressor

# Ensemble of 100 regression trees fitted on the raw targets.
rnd_forest = RandomForestRegressor(n_estimators=100).fit(tr_X,tr_y)
y_pred = rnd_forest.predict(tst_X)

err = square_error(y_pred,tst_y)
print('Square error for random forests =',err)

#Uncomment the following line to print the R2 score
#print(rnd_forest.score(tst_X,tst_y))

0.0009070556799999981


In [14]:
#k-means and averaging to do the predictions.
from sklearn.cluster import KMeans

num_clusters = 128

kmeans = KMeans(n_clusters=num_clusters)
kmeans = kmeans.fit(tr_X)

cluster_labels = kmeans.predict(tr_X)

# Scatter matrix kept for inspection: row i holds tr_y[i] in the column of its cluster.
tr_y_clusters = np.zeros((tr_y.shape[0],num_clusters))
tr_y_clusters[np.arange(tr_y.shape[0]),cluster_labels] = tr_y

# Per-cluster mean and variance of the training targets. Membership comes from
# the cluster labels themselves rather than from the non-zero entries of
# tr_y_clusters, so targets that are exactly 0 are counted correctly.
cluster_avg = []
cluster_var = []
for i in range(num_clusters):
    members = tr_y[cluster_labels == i]
    if members.size == 0:
        # No training point landed in this cluster: fall back to the overall
        # mean instead of dividing by zero.
        cluster_avg.append(np.sum(tr_y)/tr_y.shape[0])
        cluster_var.append(0.0)
    else:
        cluster_avg.append(np.sum(members)/members.size)
        cluster_var.append(np.sum((members-cluster_avg[i])**2)/members.size)

#print(cluster_avg)
#print(max(cluster_var))

# Predict each test point with the mean training target of its nearest cluster.
tst_cluster = kmeans.predict(tst_X)
y_pred = [cluster_avg[c] for c in tst_cluster]
err = square_error(y_pred,tst_y)
print('Square error for kmeans and averaging =',err)

0.00389748771789966


In [15]:
#mini-batch Kmeans with averaging
from sklearn.cluster import MiniBatchKMeans

num_clusters = 64

m_kmeans = MiniBatchKMeans(n_clusters=num_clusters)
m_kmeans = m_kmeans.fit(tr_X)

cluster_labels = m_kmeans.predict(tr_X)

# Scatter matrix kept for inspection: row i holds tr_y[i] in the column of its cluster.
tr_y_clusters = np.zeros((tr_y.shape[0],num_clusters))
tr_y_clusters[np.arange(tr_y.shape[0]),cluster_labels] = tr_y

# Per-cluster mean and variance of the training targets. Membership comes from
# the cluster labels themselves rather than from the non-zero entries of
# tr_y_clusters, so targets that are exactly 0 are counted correctly.
cluster_avg = []
cluster_var = []
for i in range(num_clusters):
    members = tr_y[cluster_labels == i]
    if members.size == 0:
        # No training point landed in this cluster: fall back to the overall
        # mean instead of dividing by zero.
        cluster_avg.append(np.sum(tr_y)/tr_y.shape[0])
        cluster_var.append(0.0)
    else:
        cluster_avg.append(np.sum(members)/members.size)
        cluster_var.append(np.sum((members-cluster_avg[i])**2)/members.size)

#print(cluster_avg)
#print(max(cluster_var))

# Predict each test point with the mean training target of its nearest cluster.
tst_cluster = m_kmeans.predict(tst_X)
y_pred = [cluster_avg[c] for c in tst_cluster]
err = square_error(y_pred,tst_y)
print('Square error for minibatch kmeans and averaging =',err)

0.004986348260724208


In [21]:
# Scratch check of numpy slicing and flattening on a 3x3 matrix of ones.
A = np.ones(shape=(3, 3))
print(A)
# first two columns
print(A[:, 0:2])
A[1][0] = 2
# row-major flattening: the modified entry shows up at position 3
print(A.reshape(-1))
# first column
print(A[:, 0])

[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]
[[1. 1.]
 [1. 1.]
 [1. 1.]]
[1. 1. 1. 2. 1. 1. 1. 1. 1.]
[1. 2. 1.]


In [15]:
# Scratch check: `*` between two 1-D arrays multiplies elementwise.
a = np.ones(3)
a[1] = 2
squared = a * a
print(squared)

[1. 4. 1.]


In [20]:
# Scratch check: int() truncates, and a (2,2) @ (2,1) product keeps the column shape.
print(int(2.5))
A, B = np.ones((2, 2)), np.ones((2, 1))
print(A,B)
print(B.shape)
print(A @ B)

2
[[1. 1.]
 [1. 1.]] [[1.]
 [1.]]
(2, 1)
[[2.]
 [2.]]
