## CPSC 474 Assignment 1

### Problem 1

In [3]:
import numpy as np
import pandas as pd
import sys

data = pd.read_csv("Data1.csv")

X = data.iloc[:,0:-1]
y = data.iloc[:, -1]

# LinearRegression Model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse, r2_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0)

lr_model = LinearRegression().fit(X_train, y_train)
y_preds = lr_model.predict(X_test)

print("Linear Regression RMSE", np.sqrt(mse(y_preds, y_test)))
print("Linear Regression R^2", r2_score(y_preds, y_test))


# NonLinearRegression Model
from sklearn.preprocessing import PolynomialFeatures

for n in range(17, 25):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0)
    
    poly_func = PolynomialFeatures(degree = n)
    
    X_train = poly_func.fit_transform(X_train)
    X_test = poly_func.fit_transform(X_test)
    
    nlr_model = LinearRegression().fit(X_train, y_train)
    y_preds = nlr_model.predict(X_test)

    print(f"Nonlinear Regression w/ Polynomial {n} RMSE", np.sqrt(mse(y_preds, y_test)))
    print(f"Nonlinear Regression w/ Polynomial {n} R^2", r2_score(y_preds, y_test))

Linear Regression RMSE 0.13638115651216065
Linear Regression R^2 0.5845057380278651


KeyboardInterrupt: 

### Problem 2

In [None]:
import numpy as np
import pandas as pd
import sys
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse, r2_score

data = pd.read_csv("Data1.csv")

X = data.iloc[:,0:-1]
y = data.iloc[:, -1]

def cal_cost(theta, X, y):
    m = len(y)

    predictions = X.dot(theta)

    cost = (1/2*m) * np.sum(np.square(predictions-y))

    return cost


def gradient_descent(X, y, theta, alpha, epoch):
    # alpha = 0.01
    # epoch = 100
    X = (X - X.mean()) / X.std() # scale data
    
    m = X.shape[0] # size of features
    X = np.concatenate((np.ones((m, 1)), X), axis=1) # add columns of 1 before X
    cost = np.zeros(epoch) # initialize cost
    n = X.shape[1]
    w = np.zeros(n) # initialize weight

    # m = len(y)
    # cost = np.zeros(epoch)
    # w = np.zeros(epoch, 2)

    for i in range(epoch):
        yhat = np.dot(X, w.T) # do prediction
        cost[i] = cal_cost(w.T, X, y) # calculate cost
        w[i, :] = theta.T
    
    return cost, w


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# LinearRegression Model
cost, w = gradient_descent(X_train, y_train, 1e-7, 50000)
X_test = np.concatenate((np.ones((X_test.shape[0], 1)), X_test), axis=1)
y_preds = np.dot(X_test, w.T)

print(cost[-1], w)
print("Linear Regression RMSE", np.sqrt(mse(y_preds, y_test)))
print("Linear Regression R^2", r2_score(y_preds, y_test))

# NonLinearRegression Model
from sklearn.preprocessing import PolynomialFeatures

for n in range(2, 12):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
    
    poly_func = PolynomialFeatures(degree = n)
    
    X_train = poly_func.fit_transform(X_train)
    X_test = poly_func.fit_transform(X_test)
    
    cost, w = gradient_descent(X_train, y_train, 0.01, 5000)
    X_test = np.concatenate((np.ones((X_test.shape[0], 1)), X_test), axis=1)
    y_preds = np.dot(X_test, w.T)
    # print(cost[-1], w)
    print(f"Nonlinear Regression w/ Polynomial {n} RMSE", np.sqrt(mse(y_preds, y_test)))
    print(f"Nonlinear Regression w/ Polynomial {n} R^2", r2_score(y_preds, y_test))

AttributeError: 'int' object has no attribute 'T'

### Problem 3

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse, r2_score
from sklearn.preprocessing import PolynomialFeatures


def linear_least_square_approach(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)
    lr_model = LinearRegression().fit(X_train, y_train)
    y_preds = lr_model.predict(X_test)

    print("Linear Regression RMSE", np.sqrt(mse(y_preds, y_test)))
    print("Linear Regression R^2", r2_score(y_preds, y_test))


# NonLinearRegression Model
def non_linear_least_square_approach(X_orig, y_orig):

    for n in range(2, 10):
        X = np.asarray(X_orig)
        y = np.asarray(y_orig)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

        poly_func = PolynomialFeatures(degree=n)

        X_train = poly_func.fit_transform(X_train)
        X_test = poly_func.fit_transform(X_test)

        nlr_model = LinearRegression().fit(X_train, y_train)
        y_preds = nlr_model.predict(X_test)

        print(f"Nonlinear Regression w/ Polynomial {n} RMSE", np.sqrt(mse(y_preds, y_test)))
        print(f"Nonlinear Regression w/ Polynomial {n} R^2", r2_score(y_preds, y_test))


# Normalization (minMax Scaler) scaling from 0 to 1
from sklearn.preprocessing import MinMaxScaler

data = pd.read_csv("Data1.csv")

# independent values
X = data.iloc[:, 0:-1]
# dependent values
y = data.iloc[:, -1]
print(data)
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(data)
scaled_data = pd.DataFrame(scaled_data, columns=['T', 'P', 'TC', 'SV', 'Idx'])
print(scaled_data)
linear_least_square_approach(X, y)
non_linear_least_square_approach(X, y)

print("\nUsing Scaled Data")
scaled_x = scaled_data.iloc[:, 0:-1]
scaled_y = scaled_data.iloc[:, -1]
linear_least_square_approach(scaled_x, scaled_y)
non_linear_least_square_approach(scaled_x, scaled_y)

### Problem 4

In [None]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error as mse, r2_score

data = pd.read_csv("Data1.csv")

X = data.iloc[:,0:-1]
y = data.iloc[:, -1]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0)

from sklearn.linear_model import Lasso

lasso = Lasso(alpha = 0.1)
lasso.fit(X_train, y_train)

print("score for testing lasso: ",lasso.score(X_test, y_test))
print("score for training lasso: ",lasso.score(X_train, y_train))

y_preds_L = lasso.predict(X_test)

print("Lasso Regression RMSE", np.sqrt(mse(y_preds_L, y_test)))
print("Lasso Regression R^2", r2_score(y_preds_L, y_test))

from sklearn.linear_model import Ridge

ridge = Ridge()
ridge.fit(X_train, y_train)

print("score for testing ridge: ",ridge.score(X_test, y_test))
print("score for training ridge: ",ridge.score(X_train, y_train))

y_preds_R = ridge.predict(X_test)

print("Ridge Regression RMSE", np.sqrt(mse(y_preds_R, y_test)))
print("Ridge Regression R^2", r2_score(y_preds_R, y_test))

from sklearn.linear_model import ElasticNet

EN = ElasticNet(alpha=0.01)
EN.fit(X_train, y_train)

print("score for testing EN: ",EN.score(X_test, y_test))
print("score for training EN: ",EN.score(X_train, y_train))

y_preds_E = EN.predict(X_test)

print("EN Regression RMSE", np.sqrt(mse(y_preds_E, y_test)))
print("EN Regression R^2", r2_score(y_preds_E, y_test))



### Problem 5

In [None]:
from numpy import mean
from numpy import std
from numpy import absolute
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

#data input
data = pd.read_csv("Data1.csv")

X = data.iloc[:,0:-1]
y = data.iloc[:, -1]

#split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0)

#models
lasso = Lasso(alpha = 0.1)
ridge = Ridge(alpha = 0.1)
EN = ElasticNet(alpha = 0.1)

#fit
lasso.fit(X_train, y_train)
ridge.fit(X_train, y_train)
EN.fit(X_train, y_train)

#evaluation methods
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

#cv Test calculation
score_Lasso_Test = cross_val_score(lasso, X_test, y_test, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
score_Ridge_Test = cross_val_score(ridge, X_test, y_test, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
score_EN_Test = cross_val_score(EN, X_test, y_test, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

score_Lasso_Test = absolute(score_Lasso_Test)
score_Ridge_Test = absolute(score_Ridge_Test)
score_EN_Test = absolute(score_EN_Test)

#cv Training
score_Lasso_Train = cross_val_score(lasso, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
score_Ridge_Train = cross_val_score(ridge, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
score_EN_Train = cross_val_score(EN, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

score_Lasso_Train = absolute(score_Lasso_Train)
score_Ridge_Train = absolute(score_Ridge_Train)
score_EN_Train = absolute(score_EN_Train)

#print test
print("-----------Test Values-----------")
print("Lasso Test Value: %.3f (%.3f)" % (mean(score_Lasso_Test), std(score_Lasso_Test)))
print("Ridge Test Value: %.3f (%.3f)" % (mean(score_Ridge_Test), std(score_Ridge_Test)))
print("EN Test Value: %.3f (%.3f)" % (mean(score_EN_Test), std(score_EN_Test)))

print()
print("-----------Train Values-----------")

#print train
print("Lasso Train Value: %.3f (%.3f)" % (mean(score_Lasso_Train), std(score_Lasso_Train)))
print("Ridge Train Value: %.3f (%.3f)" % (mean(score_Ridge_Train), std(score_Ridge_Train)))
print("EN Train Value: %.3f (%.3f)" % (mean(score_EN_Train), std(score_EN_Train)))

Lasso Value: 0.116 (0.000)
Ridge Value: 0.109 (0.000)
EN Value: 0.111 (0.000)
