In [1]:
#Importing required libraries
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error,explained_variance_score


In [2]:
# Read the preprocessed dataset from disk into the working DataFrame
df = pd.read_csv(
    'processed_data.csv',
)

In [3]:
# Build the design matrix X and the target vector y.

# Parse the timestamp column, then derive one calendar column per entry below
# (new column name -> attribute read from the pandas `.dt` accessor).
df['datetime'] = pd.to_datetime(df['datetime'])
for column_name, dt_attribute in (
    ('hour', 'hour'),
    ('day', 'day'),
    ('month', 'month'),
    ('day_of_week', 'dayofweek'),
    ('year', 'year'),
):
    df[column_name] = getattr(df['datetime'].dt, dt_attribute)

# Every column is a feature unless it is listed here: the raw timestamp, the
# timezone column, and the 'price_log' / 'price_boxcox' columns.
# NOTE(review): if the CSV also carries an untransformed price column it would
# be kept as a feature and leak the target -- confirm against the data schema.
target = 'price_log'
non_feature_columns = ('datetime', 'price_log', 'price_boxcox', 'timezone')
features = [name for name in df.columns if name not in non_feature_columns]

# Target vector
y = df[target]

# Feature matrix: dummy-encode the categorical columns, dropping the first
# level of each encoded column (drop_first=True)
X = pd.get_dummies(df[features], drop_first=True)


In [4]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
# NOTE(review): the split is random over rows -- if the rows form a time
# series, confirm that mixing time periods between train and test is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
# Training the model: tune LinearRegression options with a cross-validated
# grid search, time the search, and report the per-combination CV error.

# Define the model
model = LinearRegression()

# Define hyperparameters to tune (every combination is evaluated: 2 x 2 = 4)
param_grid = {
    'fit_intercept': [True, False],
    'positive': [False, True]
}

# Perform Grid Search with 5-fold cross-validation.
# The scorer is the *negated* MSE, so GridSearchCV can maximise it; the sign is
# flipped back when the scores are printed below.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)

# Measure the training time with time.perf_counter(): it is monotonic and
# high-resolution, whereas time.time() is a wall clock that can jump if the
# system clock is adjusted mid-run.
# The timed span covers the entire grid search (all CV fits plus the final
# refit), not a single model fit.
start_time = time.perf_counter()  # Record the start time
grid_search.fit(X_train, y_train)  # Fitting the model
end_time = time.perf_counter()  # Record the end time

# Calculate the duration for training (seconds)
training_duration = end_time - start_time
print(f"Training Duration:{training_duration} seconds")

# Print the best hyperparameters found
print("Best Hyperparameters:")
print(grid_search.best_params_)

# Get the best model: the estimator refit on all of X_train with the best
# parameters (available because GridSearchCV's refit option defaults to True)
best_model = grid_search.best_estimator_

# Get cross-validation scores
cv_results = grid_search.cv_results_

# Extract mean and standard deviation of the validation score for each
# parameter combination (still in negated-MSE form at this point)
mean_test_scores = cv_results['mean_test_score']
std_test_scores = cv_results['std_test_score']

# Print the cross-validation scores; `-mean` turns the negated MSE back into
# a positive MSE, so lower printed values are better
print("Cross-Validation Scores:")
for mean, std, params in zip(mean_test_scores, std_test_scores, cv_results['params']):
    print(f"Mean Score: {-mean:.4f}, Std Dev: {std:.4f} for {params}")

Training Duration:1.165724277496338 seconds
Best Hyperparameters:
{'fit_intercept': True, 'positive': True}
Cross-Validation Scores:
Mean Score: 0.0178, Std Dev: 0.0068 for {'fit_intercept': True, 'positive': False}
Mean Score: 0.0168, Std Dev: 0.0057 for {'fit_intercept': True, 'positive': True}
Mean Score: 0.0178, Std Dev: 0.0068 for {'fit_intercept': False, 'positive': False}
Mean Score: 0.0168, Std Dev: 0.0057 for {'fit_intercept': False, 'positive': True}


In [6]:
# Score the held-out test features with the tuned estimator
y_pred = best_model.predict(X=X_test)


In [7]:
# Evaluating the model on the held-out test set.
# NOTE(review): the target column is 'price_log', so every metric below is on
# that column's scale (presumably log price -- confirm), not in price units.

# 1. Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error (MAE): {mae}')

# 2. Root Mean Squared Error (RMSE)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'Mean Squared Error(MSE):{mse}')

# 3. Mean Absolute Percentage Error (MAPE)
# MAPE divides by the true value, so it is undefined wherever the target is
# exactly 0. Those rows are excluded (and reported) instead of letting a
# single zero turn the whole metric into inf/NaN. With no zero targets the
# result is identical to the plain formula.
absolute_pct_error = np.abs((y_test - y_pred) / y_test)
nonzero_target = y_test != 0
zero_target_count = int((~nonzero_target).sum())
if zero_target_count:
    print(f'Note: {zero_target_count} zero-valued target(s) excluded from MAPE')
if nonzero_target.any():
    mape = np.mean(absolute_pct_error[nonzero_target]) * 100
else:
    mape = np.nan  # every target is zero: MAPE is undefined
print(f'Mean Absolute Percentage Error (MAPE): {mape}%')

# 4. Explained Variance Score
explained_variance = explained_variance_score(y_test, y_pred)
print(f'Explained Variance Score: {explained_variance}')

# 5. Adjusted R-squared (Adjusted R²)
n = len(y_test)  # number of samples
p = X_test.shape[1]  # number of predictors
r2 = r2_score(y_test, y_pred)
# The adjustment is only defined when n > p + 1; otherwise the denominator is
# zero or negative and the value would be a division error or meaningless.
# NOTE(review): n is the test-set size while p counts all dummy-expanded
# columns; a large gap between R² and adjusted R² means p is a large fraction
# of n -- confirm the feature count after one-hot encoding is intended.
residual_dof = n - p - 1
if residual_dof > 0:
    adjusted_r2 = 1 - (1 - r2) * (n - 1) / residual_dof
else:
    adjusted_r2 = np.nan
print(f'Adjusted R-squared: {adjusted_r2}')
print(f'R-squared:{r2}')



Mean Absolute Error (MAE): 0.08296233945567699
Root Mean Squared Error (RMSE): 0.11680026087606915
Mean Squared Error(MSE):0.013642300940717809
Mean Absolute Percentage Error (MAPE): 3.249988044144709%
Explained Variance Score: 0.9379623710455559
Adjusted R-squared: 0.5648623446327722
R-squared:0.9378374778046817
