In [1]:
# Fetch and unpack the UCI Bike Sharing dataset (provides hour.csv and day.csv).
# -nc skips the download when the archive is already present, so a re-run does
# not pile up 'bike+sharing+dataset.zip.1', '.2', ... copies.
!wget -nc https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip
# -o overwrites previously extracted files without asking, so a re-run never
# stops at unzip's replace-file prompt.
!unzip -o bike+sharing+dataset.zip

--2024-08-27 05:52:54--  https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘bike+sharing+dataset.zip’

bike+sharing+datase     [  <=>               ] 273.43K  1023KB/s    in 0.3s    

2024-08-27 05:52:54 (1023 KB/s) - ‘bike+sharing+dataset.zip’ saved [279992]

Archive:  bike+sharing+dataset.zip
  inflating: Readme.txt              
  inflating: day.csv                 
  inflating: hour.csv                


In [2]:
# %pip installs into the environment of the running kernel; the version is
# pinned to the one this notebook was executed with so re-runs are reproducible.
%pip install category_encoders==2.6.3


Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [5]:
import category_encoders as ce
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler


class LinearRegressionFromScratch:
    """Ordinary least-squares linear regression with an intercept term.

    After ``fit`` has run, ``theta`` holds the fitted coefficients: the
    intercept first, followed by one weight per feature column.
    """

    def __init__(self):
        # Populated by fit(); None means the model has not been fitted yet.
        self.theta = None

    @staticmethod
    def _add_intercept(X):
        """Return X as a float array with a leading column of ones (the bias)."""
        X = np.asarray(X, dtype=float)
        return np.c_[np.ones((X.shape[0], 1)), X]

    def fit(self, X, y):
        """Estimate the coefficients that minimise the squared error on (X, y).

        Parameters
        ----------
        X : array-like with one row per sample
            Feature matrix (a 1-D input is treated as a single feature column).
        y : array-like
            Target values, one per row of X.

        Returns
        -------
        self
            The fitted model, so calls can be chained.
        """
        X_b = self._add_intercept(X)
        y = np.asarray(y, dtype=float)
        # Solve the least-squares problem directly instead of forming and
        # inverting X_b.T @ X_b: the explicit inverse fails (or is numerically
        # unreliable) when feature columns are collinear, whereas lstsq returns
        # the minimum-norm solution in that case.
        self.theta = np.linalg.lstsq(X_b, y, rcond=None)[0]
        return self

    def predict(self, X):
        """Return the model's predictions for each row of X.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.theta is None:
            raise RuntimeError('LinearRegressionFromScratch must be fitted before calling predict().')
        return self._add_intercept(X).dot(self.theta)

# Mean Squared Error from Scratch
def mean_squared_error_scratch(y_true, y_pred):
    """Return the mean of the squared differences between y_true and y_pred.

    Both inputs are converted to float arrays first, so lists are accepted and
    values are compared by position (pandas index labels are ignored).

    Raises
    ------
    ValueError
        If both inputs are arrays of different shapes. Without this check a
        (n,) target against a (n, 1) prediction would silently broadcast to an
        n-by-n matrix and produce a meaningless score.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Scalars (ndim == 0) are still allowed to broadcast, e.g. a constant baseline.
    if y_true.ndim > 0 and y_pred.ndim > 0 and y_true.shape != y_pred.shape:
        raise ValueError(
            f'y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}'
        )
    return np.mean((y_true - y_pred) ** 2)

# R-squared from Scratch
def r2_score_scratch(y_true, y_pred):
    """Return the coefficient of determination, 1 - SS_res / SS_tot.

    Both inputs are converted to float arrays first, so lists are accepted and
    values are compared by position (pandas index labels are ignored).

    When y_true is constant (SS_tot == 0) the ratio is undefined; in that case
    1.0 is returned if the predictions are exact and 0.0 otherwise, instead of
    nan / -inf.

    Raises
    ------
    ValueError
        If both inputs are arrays of different shapes (which would otherwise
        broadcast silently).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Scalars (ndim == 0) are still allowed to broadcast, e.g. a constant baseline.
    if y_true.ndim > 0 and y_pred.ndim > 0 and y_true.shape != y_pred.shape:
        raise ValueError(
            f'y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}'
        )

    total_sum_of_squares = np.sum((y_true - np.mean(y_true)) ** 2)
    residual_sum_of_squares = np.sum((y_true - y_pred) ** 2)

    if total_sum_of_squares == 0:
        # Constant target: avoid dividing by zero.
        return 1.0 if residual_sum_of_squares == 0 else 0.0
    return 1 - (residual_sum_of_squares / total_sum_of_squares)

# Load the hourly records extracted from the downloaded archive
df = pd.read_csv('hour.csv')

# Feature engineering: pairwise interaction terms between the weather readings
df['temp_hum'] = df['temp'] * df['hum']
df['temp_windspeed'] = df['temp'] * df['windspeed']
df['hum_windspeed'] = df['hum'] * df['windspeed']

# Create the binary 'day_night' flag: 1 for hours 6-18 inclusive, 0 otherwise.
# Vectorised comparisons replace a per-row Python lambda.
df['day_night'] = ((df['hr'] >= 6) & (df['hr'] <= 18)).astype('int64')

# Remove columns that are not used as model inputs (reassign rather than
# mutating in place)
df = df.drop(columns=['instant', 'casual', 'registered', 'dteday'])

# Cast the discrete-valued columns to pandas' category dtype in one call
category_dtype_columns = ['season', 'holiday', 'weekday', 'weathersit',
                          'workingday', 'mnth', 'yr', 'hr']
df = df.astype({column: 'category' for column in category_dtype_columns})

# Target is the 'cnt' column; every other column is a candidate feature
y = df['cnt']
X = df.drop(columns=['cnt'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Columns that reach the model, grouped by how they are preprocessed.
# NOTE(review): hr, yr, mnth, weekday, holiday and workingday appear in neither
# list, so they are never passed to the model — confirm this is intended.
categorical_features = ['season', 'weathersit', 'day_night']
numerical_features = ['temp', 'hum', 'windspeed', 'temp_windspeed', 'temp_hum', 'hum_windspeed']

# Numerical preprocessing: mean imputation followed by min-max scaling
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
])

# Fit the numerical pipeline on the training rows only, then apply it to both splits
X_train[numerical_features] = numerical_pipeline.fit_transform(X_train[numerical_features])
X_test[numerical_features] = numerical_pipeline.transform(X_test[numerical_features])

# Categorical preprocessing: target encoding of the selected columns
categorical_pipeline = Pipeline(steps=[
    ('target_encoder', ce.TargetEncoder(cols=categorical_features)),
])

# The encoder is fitted with the training target and only applied to the test rows
X_train_encoded = categorical_pipeline.fit_transform(X_train[categorical_features], y_train)
X_test_encoded = categorical_pipeline.transform(X_test[categorical_features])

# Place scaled numerical and encoded categorical columns side by side; indices
# are reset so the two parts line up row by row
X_train_full = pd.concat(
    [part.reset_index(drop=True) for part in (X_train[numerical_features], X_train_encoded)],
    axis=1,
)
X_test_full = pd.concat(
    [part.reset_index(drop=True) for part in (X_test[numerical_features], X_test_encoded)],
    axis=1,
)

# Extract the underlying NumPy arrays for the from-scratch model
X_train_np, X_test_np = X_train_full.values, X_test_full.values
y_train_np, y_test_np = y_train.values, y_test.values

# Fit the from-scratch linear regression on the training split
model_scratch = LinearRegressionFromScratch()
model_scratch.fit(X_train_np, y_train_np)

# Score the held-out rows
y_pred_scratch = model_scratch.predict(X_test_np)

# Evaluate with the hand-written metrics
r2_scratch_custom = r2_score_scratch(y_test_np, y_pred_scratch)
mse_scratch_custom = mean_squared_error_scratch(y_test_np, y_pred_scratch)

# Report the results
print(f'Mean Squared Error (Scratch, Custom): {mse_scratch_custom}')
print(f'R-squared (Scratch, Custom): {r2_scratch_custom}')


Mean Squared Error (Scratch, Custom): 19399.326598415304
R-squared (Scratch, Custom): 0.38736540030096966


In [8]:
# Route each preprocessing pipeline to its own column group. Chaining the two
# as consecutive Pipeline steps would run the imputer/scaler over every column
# and hand its output (by default a plain array without column names) to the
# target encoder, which is configured with column names.
preprocessor = ColumnTransformer([
    ('num_preprocess', numerical_pipeline, numerical_features),
    ('cat_preprocess', categorical_pipeline, categorical_features),
])

# End-to-end estimator: column-wise preprocessing followed by the from-scratch model
final_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', LinearRegressionFromScratch())
])

In [9]:
from sklearn import set_config

# Ask scikit-learn to render estimators as diagrams in the notebook
set_config(display='diagram')

# Bare last expression so the notebook displays the pipeline
final_pipeline