In [1]:
!wget cs.uit.edu.vn/data2.txt

--2022-04-28 00:31:07--  http://cs.uit.edu.vn/data2.txt
Resolving cs.uit.edu.vn (cs.uit.edu.vn)... 118.69.123.142, 45.122.249.78
Connecting to cs.uit.edu.vn (cs.uit.edu.vn)|118.69.123.142|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1203726 (1.1M) [text/plain]
Saving to: ‘data2.txt’


2022-04-28 00:31:09 (723 KB/s) - ‘data2.txt’ saved [1203726/1203726]



In [2]:
!wget cs.uit.edu.vn/data4.txt

--2022-04-28 00:31:09--  http://cs.uit.edu.vn/data4.txt
Resolving cs.uit.edu.vn (cs.uit.edu.vn)... 118.69.123.142, 45.122.249.78
Connecting to cs.uit.edu.vn (cs.uit.edu.vn)|118.69.123.142|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 101704 (99K) [text/plain]
Saving to: ‘data4.txt’


2022-04-28 00:31:11 (143 KB/s) - ‘data4.txt’ saved [101704/101704]



In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [4]:
def load_data():
    train_df = pd.read_csv('/content/data2.txt', sep=',', names=['X', 'y'])
    test_df = pd.read_csv('/content/data4.txt', sep=',', names=['X', 'y'])

    X_train = train_df['X'].values.reshape(-1,1)
    y_train = train_df['y'].values

    X_test = test_df['X'].values.reshape(-1,1)
    y_test = test_df['y'].values
    return X_train, y_train, X_test, y_test

In [5]:
def data_preprocessing(X_train, y_train, X_test, y_test):
    scaler =  StandardScaler()

    y_train = y_train.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    X_train = scaler.fit_transform(X_train)
    y_train = scaler.fit_transform(y_train)
    X_test = scaler.fit_transform(X_test)
    y_test = scaler.fit_transform(y_test)

    y_train = y_train.ravel()
    y_test = y_test.ravel()

    return X_train, y_train, X_test, y_test

In [6]:
def fn_frac(X_orig, n):
    res = X_orig
    for i in range(2, n + 1):
        res = np.hstack((res, X_orig**(1 / i)))
    return res

def fn(X_train, X_orig, n):
    for i in range(2, n + 1):
        X_train = np.hstack((X_train, X_orig**(i)))
    return X_train

In [7]:
def evaluate(X_train, y_train, X_test, y_test):
    model = LinearRegression()
    model.fit(X_train, y_train)

    train_predicted = model.predict(X_train)
    test_predicted = model.predict(X_test)

    print("Training")
    print(f"MSE: {mean_squared_error(y_train, train_predicted)}")
    print(f"R2: {r2_score(y_train, train_predicted)}")
    print("-----------------")
    print("Testing")
    print(f"MSE: {mean_squared_error(y_test, test_predicted)}")
    print(f"R2: {r2_score(y_test, test_predicted)}")

# Scale data

In [8]:
X_train, y_train, X_test, y_test = load_data()
X_train, y_train, X_test, y_test = data_preprocessing(X_train, y_train, X_test, y_test)
evaluate(X_train, y_train, X_test, y_test)

Training
MSE: 0.0024075006654141642
R2: 0.9975924993345858
-----------------
Testing
MSE: 0.044401469822167854
R2: 0.9555985301778321


# $(X, X^{\frac{1}{2}})$

In [22]:
X_train, y_train, X_test, y_test = load_data()

XTrain = fn_frac(X_train, 2)
XTest = fn_frac(X_test, 2)

evaluate(XTrain, y_train, XTest, y_test)

Training
MSE: 0.000122740426460149
R2: 0.9979390204396336
-----------------
Testing
MSE: 0.05544716229521401
R2: 0.8302193799992671


# $(X, X^{\frac{1}{2}}, X^{\frac{1}{3}})$

In [10]:
X_train, y_train, X_test, y_test = load_data()

XTrain = fn_frac(X_train, 3)
XTest = fn_frac(X_test, 3)

evaluate(XTrain, y_train, XTest, y_test)

Training
MSE: 0.0001219634863432889
R2: 0.997952066326361
-----------------
Testing
MSE: 0.023985477361671104
R2: 0.9265558587868514


# $(X, X^{\frac{1}{2}}, X^{\frac{1}{3}},X^{\frac{1}{4}})$

In [11]:
X_train, y_train, X_test, y_test = load_data()

XTrain = fn_frac(X_train, 4)
XTest = fn_frac(X_test, 4)

evaluate(XTrain, y_train, XTest, y_test)

Training
MSE: 0.00012190909344768164
R2: 0.9979529796574396
-----------------
Testing
MSE: 0.0164689694971035
R2: 0.9495715968816543


# $(X, X^{\frac{1}{2}}, X^{\frac{1}{3}}, X^{\frac{1}{4}}, X^{\frac{1}{5}})$

In [12]:
X_train, y_train, X_test, y_test = load_data()

XTrain = fn_frac(X_train, 5)
XTest = fn_frac(X_test, 5)

evaluate(XTrain, y_train, XTest, y_test)

Training
MSE: 0.00012190367180570886
R2: 0.9979530706942202
-----------------
Testing
MSE: 0.014608514645569198
R2: 0.9552683569159208


# $(X, X^{\frac{1}{2}}, X^{\frac{1}{3}}, X^{\frac{1}{4}}, X^{\frac{1}{5}}, X^{\frac{1}{6}})$

In [20]:
X_train, y_train, X_test, y_test = load_data()

XTrain = fn_frac(X_train, 6)
XTest = fn_frac(X_test, 6)

evaluate(XTrain, y_train, XTest, y_test)

Training
MSE: 0.00012190234606398448
R2: 0.9979530929552363
-----------------
Testing
MSE: 0.014541449175686096
R2: 0.9554737130890063


# $(X, X^{\frac{1}{2}}, X^{\frac{1}{3}}, X^{\frac{1}{4}}, X^{\frac{1}{5}}, X^{\frac{1}{7}})$

In [53]:
X_train, y_train, X_test, y_test = load_data()

XTrain = fn_frac(X_train, 7)
XTest = fn_frac(X_test, 7)

evaluate(XTrain, y_train, XTest, y_test)

Training
MSE: 0.00012190136869554198
R2: 0.9979531093665885
-----------------
Testing
MSE: 0.018326240338585917
R2: 0.9438845863670804
