# Setup

First, let's import a few common modules, ensure Matplotlib plots figures inline, and prepare a function to save the figures. We also check that Python 3.5 or later is installed, as well as Scikit-Learn ≥0.20. You don't need to change any code in the following cell, just run it.

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"
from sklearn.preprocessing import StandardScaler

# Common imports
import numpy as np
import os
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline


# Seed NumPy's global random number generator so that the np.random.randn
# weight initialisations further down start from the same values on every
# run, keeping this notebook's output stable.
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Font sizes used by every figure: 14 for axis labels, 12 for x/y tick labels.
mpl.rcParams.update({
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
})

# Figures are written to <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>; the directory
# is created here (if it does not exist yet) before any figure is saved.
PROJECT_ROOT_DIR, CHAPTER_ID = ".", "training_linear_models"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current Matplotlib figure inside IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        Base name of the output file (without extension).
    tight_layout : bool
        When truthy, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str
        File extension; also passed to ``plt.savefig`` as ``format``.
    resolution : int
        Passed to ``plt.savefig`` as ``dpi``.
    """
    filename = ".".join((fig_id, fig_extension))
    target = os.path.join(IMAGES_PATH, filename)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

# Read the dataset 

In [2]:
# The file is comma-separated and starts with a one-line header, which is skipped.
data = np.loadtxt("hw03_data.txt", delimiter=",", skiprows=1, dtype="float")

# Standardise every column with scikit-learn's StandardScaler
# (fitting and transforming in a single call).
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Column 0 becomes X and column 1 becomes y; indexing with a list keeps each
# one as a 2-D column. Both the scaled and the raw versions are kept.
X, y = scaled_data[:, [0]], scaled_data[:, [1]]
X_unscaled, y_unscaled = data[:, [0]], data[:, [1]]

OSError: hw03_data.txt not found.

# Visualize the dataset

In [7]:
# Scatter the raw (unscaled) observations as blue dots.
axis_label_style = {"fontsize": 18}
plt.plot(X_unscaled, y_unscaled, "b.")
plt.xlabel("R&D Spending", **axis_label_style)
# rotation=0 keeps the y-axis label horizontal.
plt.ylabel("Profit", rotation=0, **axis_label_style)
plt.xticks(fontsize=8)

# Write the figure to disk, then display it.
save_fig("generated_data_plot")
plt.show()

NameError: name 'X_unscaled' is not defined

# Training with scikit-learn Linear Regression



In [60]:
# Imports needed for this step
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# a. Hold out 20% of the samples for testing; random_state=42 keeps the split
#    consistent with the lab.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# b. Fit the training data to the model
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# c. Report the fitted intercept and coefficient. y is a 2-D single-column
#    array, so intercept_ holds one entry and coef_ is a 1x1 array.
intercept = lin_reg.intercept_[0]
coefficient = lin_reg.coef_[0][0]
print("Intercept:", intercept)
print("Coefficient:", coefficient)

Intercept: -0.0011548333575354423
Coefficient: 0.9442141523225853


# Prediction
Now we can make predictions using the weights.

In [61]:
# Predict the targets of the held-out test points with the fitted model.
# The model trained above is `lin_reg`; no `regressor` is defined in this
# notebook, so referring to it fails on a fresh kernel.
y_pred = lin_reg.predict(X_test)

# Display the fitted intercept and coefficient
lin_reg.intercept_, lin_reg.coef_

(array([-0.00115483]), array([[0.94421415]]))

# Evaluation
Best possible $R^{2}$ score  is 1.0. <br>

In [66]:
# R² of the linear-regression predictions against the true test targets
# (r2_score is imported in the setup cell).
r2_score(y_true=y_test, y_pred=y_pred)

0.9134554891584079

# Linear regression using gradient descent
Solve the same problem with the gradient descent algorithm.

In [118]:
# Randomly generate the starting weights: one for the bias column, one for the feature
weight = np.random.randn(2, 1)

# Set m to the size of the training data
m = len(X_train)

# Add x0 = 1 for each instance (row count taken from the data, not hard-coded)
X_train_b = np.c_[np.ones((m, 1)), X_train]

# Set the learning rate
alpha = 0.2

# Define the number of iterations
n_iterations = 1000

# Batch gradient descent: every step uses the whole training set
for iteration in range(n_iterations):

    # Gradient of the cost (1/(2m)) * ||X_b . w - y||^2 with respect to w
    gradients = 1/m * X_train_b.T.dot(X_train_b.dot(weight) - y_train)

    # Move the weights against the gradient, scaled by the learning rate
    weight = weight - alpha * gradients

Print out the weight values

In [133]:
# Print the weights produced by the gradient-descent loop above: row 0
# multiplies the column of ones (the bias term), row 1 multiplies the feature.
print(weight)

[[-0.00115483]
 [ 0.94421415]]


Calculate Prediction Test

In [141]:
# Add x0 = 1 for each test point (row count taken from X_test, not hard-coded)
X_test_b = np.c_[np.ones((len(X_test), 1)), X_test]

# Run the dot product to predict the values
y_predict = X_test_b.dot(weight)

Calculate $R^{2}$ value

In [157]:
# Calculate the score of the gradient-descent predictions. These live in
# y_predict; y_pred holds the scikit-learn model's predictions and would only
# reproduce the score already computed for that model.
sklearn.metrics.r2_score(y_test, y_predict)

0.9134554891584079

# Improving the $R^{2}$ value

Reason: A lower $R^{2}$ value in linear regression can result from violated assumptions (such as linearity), outliers, multicollinearity, or underfitting. To improve it, check the data for these issues, consider adding more features, or explore more complex models such as polynomial regression if the relationship is non-linear.

In [3]:
# Load the raw (unscaled) data so this cell starts from the original values
data = np.loadtxt("hw03_data.txt", skiprows=1, dtype="float", delimiter=",")
X = data[:, [0]]
y = data[:, [1]]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Normalisation statistics are computed once, from the training set only,
# and reused for the test set and for undoing the scaling of the predictions.
X_mean, X_std = X_train.mean(), X_train.std()
y_mean, y_std = y_train.mean(), y_train.std()

# Normalize the training data
X_train_normalized = (X_train - X_mean) / X_std
y_train_normalized = (y_train - y_mean) / y_std

# Initialize weights (one for the bias column, one for the feature)
weight = np.random.randn(2, 1)

# Hyperparameters
alpha = 0.1  # learning rate
n_iterations = 1000
m = len(X_train_normalized)

# Add bias term to X_train_normalized
X_train_b = np.c_[np.ones((m, 1)), X_train_normalized]

# Gradient Descent on the mean-squared-error cost
for iteration in range(n_iterations):
    gradients = 2/m * X_train_b.T.dot(X_train_b.dot(weight) - y_train_normalized)
    weight = weight - alpha * gradients

# Prepare test data using the training-set statistics
X_test_normalized = (X_test - X_mean) / X_std
X_test_b = np.c_[np.ones((len(X_test), 1)), X_test_normalized]

# Predict, map the predictions back to the original scale of y, and score them
y_pred_normalized = X_test_b.dot(weight)
y_pred = (y_pred_normalized * y_std) + y_mean
r2 = r2_score(y_test, y_pred)
print("R-squared value:", r2)


OSError: hw03_data.txt not found.