In [1]:
!wget https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip

--2024-08-27 06:58:24--  https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘bike+sharing+dataset.zip’

bike+sharing+datase     [    <=>             ] 273.43K   276KB/s    in 1.0s    

2024-08-27 06:58:25 (276 KB/s) - ‘bike+sharing+dataset.zip’ saved [279992]



In [2]:
!unzip bike+sharing+dataset.zip

Archive:  bike+sharing+dataset.zip
  inflating: Readme.txt              
  inflating: day.csv                 
  inflating: hour.csv                


In [4]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [6]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from category_encoders import TargetEncoder

# Load dataset
df = pd.read_csv('hour.csv')

# Feature engineering
df['day_night'] = df['hr'].apply(lambda x: 'day' if 6 <= x <= 18 else 'night')
df.drop(['instant', 'casual', 'registered'], axis=1, inplace=True)
df['dteday'] = pd.to_datetime(df.dteday)
df['season'] = df.season.astype('category')
df['holiday'] = df.holiday.astype('category')
df['weekday'] = df.weekday.astype('category')
df['weathersit'] = df.weathersit.astype('category')
df['workingday'] = df.workingday.astype('category')
df['mnth'] = df.mnth.astype('category')
df['yr'] = df.yr.astype('category')
df['hr'] = df.hr.astype('category')
df.drop(columns=['dteday'], inplace=True)

# Separating features and target variable
X = df.drop(columns=['cnt']) # Features
y = df['cnt'] # Target

# Creating interaction features
X['temp_hum'] = X['temp'] * X['hum']
X['temp_windspeed'] = X['temp'] * X['windspeed']

# Numerical features
numerical_features = ['temp', 'hum', 'windspeed', 'temp_hum', 'temp_windspeed']
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values
    ('scaler', MinMaxScaler())  # Normalize
])
X[numerical_features] = numerical_pipeline.fit_transform(X[numerical_features])

# Categorical features
categorical_features = ['season', 'weathersit', 'day_night']
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values
    ('target_encode', TargetEncoder())  # Target encoding
])
X_encoded = categorical_pipeline.fit_transform(X[categorical_features], y)

# Combine all features
X = pd.concat([X.drop(columns=categorical_features), X_encoded], axis=1)

# Convert all column names to strings to avoid type issues
X.columns = X.columns.astype(str)

# Check for NaN values in the entire dataset
print(X.isnull().sum())  # Verify there are no NaNs

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verify the training data for NaNs before fitting
print(X_train.isnull().sum())
print(X_test.isnull().sum())

# Train Linear Regressor using the package
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Predictions
y_pred_linear = linear_model.predict(X_test)

# Evaluate performance
mse_linear = mean_squared_error(y_test, y_pred_linear)
r2_linear = r2_score(y_test, y_pred_linear)
print(f'Linear Regression - Mean Squared Error: {mse_linear}')
print(f'Linear Regression - R-squared: {r2_linear}')

yr                0
mnth              0
hr                0
holiday           0
weekday           0
workingday        0
temp              0
atemp             0
hum               0
windspeed         0
temp_hum          0
temp_windspeed    0
0                 0
1                 0
2                 0
dtype: int64
yr                0
mnth              0
hr                0
holiday           0
weekday           0
workingday        0
temp              0
atemp             0
hum               0
windspeed         0
temp_hum          0
temp_windspeed    0
0                 0
1                 0
2                 0
dtype: int64
yr                0
mnth              0
hr                0
holiday           0
weekday           0
workingday        0
temp              0
atemp             0
hum               0
windspeed         0
temp_hum          0
temp_windspeed    0
0                 0
1                 0
2                 0
dtype: int64
Linear Regression - Mean Squared Error: 14974.133860641094
Li

In [13]:
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# Ensure there are no missing values in the features and target variable
print("Checking for NaNs in X_train:")
print(pd.DataFrame(X_train).isna().sum())
print("Checking for NaNs in y_train:")
print(pd.Series(y_train).isna().sum())

# Convert data to numpy arrays
X_train_np = X_train.to_numpy()
X_test_np = X_test.to_numpy()
y_train_np = y_train.to_numpy()
y_test_np = y_test.to_numpy()

# Add a bias term (intercept) to the features
X_train_bias = np.c_[np.ones(X_train_np.shape[0]), X_train_np]  # Add bias term
X_test_bias = np.c_[np.ones(X_test_np.shape[0]), X_test_np]  # Add bias term

# Normalize features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_bias = scaler.fit_transform(X_train_bias)
X_test_bias = scaler.transform(X_test_bias)

# Initialize parameters
n_features = X_train_bias.shape[1]
weights = np.zeros(n_features)

# Hyperparameters
learning_rate = 0.01
n_iterations = 1000

# Gradient Descent
for iteration in range(n_iterations):
    predictions = X_train_bias.dot(weights)
    errors = predictions - y_train_np
    gradient = (2 / len(y_train_np)) * X_train_bias.T.dot(errors)
    weights -= learning_rate * gradient

    # Debugging: Check for NaN values in weights
    if np.any(np.isnan(weights)):
        raise ValueError("Weights contain NaN values during training.")

# Make predictions
y_pred_scratch = X_test_bias.dot(weights)

# Check and handle NaN values in predictions
if np.any(np.isnan(y_pred_scratch)):
    raise ValueError("Predictions contain NaN values.")

# Evaluate performance
mse_scratch = mean_squared_error(y_test_np, y_pred_scratch)
r2_scratch = r2_score(y_test_np, y_pred_scratch)
print(f'Linear Regression (Scratch) - Mean Squared Error: {mse_scratch}')
print(f'Linear Regression (Scratch) - R-squared: {r2_scratch}')

Checking for NaNs in X_train:
yr                0
mnth              0
hr                0
holiday           0
weekday           0
workingday        0
temp              0
atemp             0
hum               0
windspeed         0
temp_hum          0
temp_windspeed    0
0                 0
1                 0
2                 0
dtype: int64
Checking for NaNs in y_train:
0
Linear Regression (Scratch) - Mean Squared Error: 49549.98200732639
Linear Regression (Scratch) - R-squared: -0.5647983056603776


In [14]:
pip install mlflow

Collecting mlflow
  Downloading mlflow-2.15.1-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==2.15.1 (from mlflow)
  Downloading mlflow_skinny-2.15.1-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.3-py2.py3-none-any.whl.metadata (7.7 kB)
Collecting querystring-parser<2 (from mlflow)
  Downloading querystring_parser-1.2.4-py2.py3-none-any.whl.metadata (559 bytes)
Collecting gunicorn<23 (from mlflow)
  Downloading gunicorn-22.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.15.1->mlflow)
  Downloading databricks_sdk-0.31.0-py3-none-any.whl.metadata (37 kB)
Collecting gitpython<4,>=3.1.9 (from mlflow-skinny==2.15.1->mlflow)
  Downloading GitPython-3.1.43-py3-none-any.whl.m

In [18]:
import mlflow
import mlflow.sklearn

# Convert categorical columns to float64
X_test_float = X_test.copy()
categorical_columns = ['yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday']
X_test_float[categorical_columns] = X_test_float[categorical_columns].astype('float64')

# Set up MLflow tracking
mlflow.start_run()

# Log parameters, metrics, and models
mlflow.log_param("Model Type", "LinearRegression")
mlflow.log_param("Interaction Features", "temp_hum, temp_windspeed")

# Log metrics for package-based Linear Regression
mlflow.log_metric("MSE Package", mse_linear)
mlflow.log_metric("R2 Package", r2_linear)

# Log metrics for scratch Linear Regression
mlflow.log_metric("MSE Scratch", mse_scratch)
mlflow.log_metric("R2 Scratch", r2_scratch)

# Provide an input example for the model (first row of X_test_float)
input_example = X_test_float.iloc[[0]]

# Log the package-based model with input example and signature
mlflow.sklearn.log_model(linear_model, "model_package", input_example=input_example)

# End the MLflow run
mlflow.end_run()

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

In [20]:
!pip install scikit-learn category_encoders




In [21]:
# Import necessary libraries
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn import set_config
from category_encoders import TargetEncoder

# Numerical preprocessing pipeline
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean
    ('scaler', MinMaxScaler())  # Scale features to a range
])

# Categorical preprocessing pipeline
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with the most frequent value
    ('target_encode', TargetEncoder())  # Encode categorical variables using target encoding
])

# Combine both numerical and categorical pipelines using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ]
)

# Create a final pipeline that includes both preprocessing and model training
final_pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Preprocessing step
    ('model', LinearRegression())    # Linear regression model
])

# Set up the visualization of the pipeline
set_config(display='diagram')  # To display the pipeline as a diagram in Jupyter Notebook

# Visualize the final pipeline
final_pipeline