# Bike Sharing Rental Demand Prediction
This notebook contains the complete end-to-end process from Data Cleaning to Model Enhancement.

## 1. Exploratory Data Analysis & Cleaning
In this step, we detect missing and non-numeric values, correct data types, and prepare the dataset for the later steps.

In [None]:
import matplotlib
# matplotlib.use('Agg') # Non-interactive backend
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import os

# Create the output directory for figures (no-op when it already exists).
os.makedirs('images', exist_ok=True)

# Load the raw dataset.
try:
    df = pd.read_csv('Dataset.csv', encoding='latin1')
    print("Dataset loaded successfully.")
except Exception as e:
    print(f"Error loading dataset: {e}")
    # Nothing below can run without the data. SystemExit is raised directly
    # because exit() is a helper installed by the `site` module for
    # interactive use and is not meant to be called from programs.
    raise SystemExit(1) from e

# 1. Inspect Data Types and Clean
print("\n--- Initial Info ---")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None").
df.info()

# Columns that should be numeric but may have been read as object.
numeric_candidates = ['temp', 'atemp', 'hum', 'windspeed', 'casual', 'registered']

print("\n--- Checking for non-numeric values ---")
for col in numeric_candidates:
    # Coerce once; anything that cannot be parsed becomes NaN.
    converted = pd.to_numeric(df[col], errors='coerce')

    # Only entries that were present but unparseable count as "non-numeric";
    # values that were already missing are excluded from the report.
    invalid_mask = converted.isna() & df[col].notna()
    n_errors = invalid_mask.sum()
    if n_errors > 0:
        print(f"Column '{col}' has {n_errors} non-numeric entries (will be converted to NaN).")
        # Show a few of the offending raw values.
        print(f"Examples: {df.loc[invalid_mask, col].unique()[:5]}")

    # Apply the conversion.
    df[col] = converted

# Convert dteday to datetime; unparseable dates become NaT.
df['dteday'] = pd.to_datetime(df['dteday'], errors='coerce')

# Report the columns that contain missing values after conversion.
print("\n--- Missing Values After Cleaning ---")
missing_counts = df.isnull().sum()
print(missing_counts[missing_counts > 0])

# No imputation is performed in this step: missing values are only reported
# above and are written out unchanged below.

# 2. Outlier Detection (Boxplots)
print("\n--- Generating Outlier Boxplots ---")
plt.figure(figsize=(15, 10))
# Use the cleaned numeric columns plus 'cnt', keeping only those present in df.
plot_cols = [c for c in numeric_candidates + ['cnt'] if c in df.columns]

for i, col in enumerate(plot_cols, 1):
    plt.subplot(3, 3, i)
    sns.boxplot(y=df[col].dropna())
    plt.title(f'Boxplot of {col}')

plt.tight_layout()
plt.savefig('images/outliers_boxplot_cleaned.png')
print("Saved images/outliers_boxplot_cleaned.png")

# 3. Correlation Matrix
print("\n--- Generating Correlation Matrix ---")
# Select only numeric columns.
numeric_df = df.select_dtypes(include=[np.number])
if not numeric_df.empty:
    # The figure is created only when there is something to draw, so the
    # empty case does not leave a blank figure behind.
    plt.figure(figsize=(12, 10))
    sns.heatmap(numeric_df.corr(), annot=True, fmt='.2f', cmap='coolwarm')
    plt.title('Correlation Matrix')
    plt.savefig('images/correlation_matrix.png')
    print("Saved images/correlation_matrix.png")
else:
    print("No numeric columns for correlation matrix.")

# Save cleaned data for next steps.
# NOTE(review): the later sections read 'cleaned_bike_data.csv', not this
# file name - confirm which step produces that file.
df.to_csv('cleaned_dataset.csv', index=False)
print("\nSaved cleaned dataset to 'cleaned_dataset.csv'")


## 2. Data Visualization
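Visualizing patterns in demand across different metrics. Note: this cell loads `processed_bike_data.csv` and `models/best_random_forest.joblib`, which are written by Sections 3 and 5, so run those sections first.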

In [None]:
import pandas as pd
import numpy as np
import matplotlib
# matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
from sklearn.model_selection import train_test_split

# Make sure the figure output directory is available.
if not os.path.exists('images'):
    os.makedirs('images')

# Inputs: the cleaned table drives the descriptive plots; the processed
# table and the saved random forest drive the model-based plots.
df_clean = pd.read_csv('cleaned_bike_data.csv')
df_proc = pd.read_csv('processed_bike_data.csv')
model = joblib.load('models/best_random_forest.joblib')

# --- 1. Rental Distribution ---
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_clean['cnt'], kde=True, color='purple', ax=ax)
ax.set_title('Distribution of Total Bike Rentals')
ax.set_xlabel('Rental Count')
fig.savefig('images/dist_cnt.png')

# --- 2. Hourly Demand ---
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x='hr', y='cnt', data=df_clean, ci=None, marker='o', ax=ax)
ax.set_title('Average Hourly Demand Pattern')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Average Count')
fig.savefig('images/hourly_trend.png')

# --- 3. Seasonal Demand ---
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='season', y='cnt', data=df_clean, palette='viridis', ax=ax)
ax.set_title('Rental Counts by Season')
fig.savefig('images/seasonal_demand.png')

# --- 4. Weather Situation Impact ---
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='weathersit', y='cnt', data=df_clean, ax=ax)
ax.set_title('Impact of Weather Status on Rentals')
fig.savefig('images/weather_impact.png')

# --- 5. Temperature Influence ---
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='temp', y='cnt', data=df_clean, alpha=0.3, color='orange', ax=ax)
ax.set_title('Normalized Temperature vs Rental Count')
fig.savefig('images/temp_vs_count.png')

# --- 6. Working Day vs Holiday ---
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='workingday', y='cnt', hue='holiday', data=df_clean, ax=ax)
ax.set_title('Rentals: Working Day vs Holiday comparison')
fig.savefig('images/workingday_holiday.png')

# --- 7. Actual vs Predicted ---
# Rebuild the feature matrix and redo the 80/20 split with random_state=42
# to obtain a held-out set for the loaded model.
target = 'cnt'
y = df_proc[target]
X = df_proc.drop(columns=['casual', 'registered', 'cnt'], errors='ignore')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_pred = model.predict(X_test)

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5, color='teal')
# Dashed diagonal marks where predicted equals actual.
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'r--', lw=2)
ax.set_title('Actual vs Predicted Count')
ax.set_xlabel('Actual Count')
ax.set_ylabel('Predicted Count')
fig.savefig('images/actual_vs_predicted.png')

# --- 8. Feature Importance (Rich version) ---
# Pair each feature column with the model's importance score, highest first.
feature_names = X.columns
importances = model.feature_importances_
feature_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(x='Importance', y='Feature', data=feature_df.head(20), palette='magma', ax=ax)
ax.set_title('Top 20 Drivers of Bike Demand')
fig.savefig('images/top_drivers.png')

print("Rich visualizations generated for PPT.")


## 3. Feature Engineering
Transforming time variables into cyclic features and encoding categorical data.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Create the output directory for figures (no-op when it already exists).
os.makedirs('images', exist_ok=True)

# Load data
df = pd.read_csv('cleaned_bike_data.csv')

# --- 0. PRE-PROCESSING / FIXING CATEGORICALS ---
print("--- Fixing Categorical Data ---")

# Parse dteday first: month and year are re-derived from it below.
df['dteday'] = pd.to_datetime(df['dteday'])

# Rebuild 'mnth' from dteday, so placeholder values such as '?' in the
# original column do not matter as long as the date itself is valid.
df['mnth'] = df['dteday'].dt.month
print("Fixed 'mnth' using dteday.")

# Rebuild 'yr' from dteday: 2011 -> 0, 2012 -> 1.
# Any other year is absent from the mapping and therefore becomes NaN.
df['yr'] = df['dteday'].dt.year.map({2011: 0, 2012: 1})
print("Fixed 'yr' using dteday.")

# Fix 'holiday' ('No', 'Yes', '?').
# Every value that is not a recognised label ('?', missing cells, stray
# text) is replaced with the most frequent recognised label. Replacing only
# the literal '?' would leave other unknowns to turn into NaN in .map() and
# make .astype(int) fail.
holiday_labels = {'No': 0, 'Yes': 1}
known_holiday = df['holiday'].isin(list(holiday_labels))
mode_holiday = df.loc[known_holiday, 'holiday'].mode()[0]
df['holiday'] = (
    df['holiday'].where(known_holiday, mode_holiday).map(holiday_labels).astype(int)
)
print(f"Fixed 'holiday' (imputed '?' with '{mode_holiday}').")

# Fix 'workingday' ('No work', 'Working Day', '?') the same way.
workingday_labels = {'No work': 0, 'Working Day': 1}
known_working = df['workingday'].isin(list(workingday_labels))
mode_working = df.loc[known_working, 'workingday'].mode()[0]
df['workingday'] = (
    df['workingday'].where(known_working, mode_working).map(workingday_labels).astype(int)
)
print(f"Fixed 'workingday' (imputed '?' with '{mode_working}').")

# 'weekday' is left as loaded here.
# NOTE(review): the day numbering convention (which day is 0) is not
# established by this notebook - confirm against the dataset description.

# --- 1. Feature Engineering ---

# A. Categorical Encoding (Season, Weathersit)
# 'weathersit' may also contain '?'; when the column was read as object,
# replace that placeholder with the most frequent other value.
if df['weathersit'].dtype == object:
    mode_weather = df.loc[df['weathersit'] != '?', 'weathersit'].mode()[0]
    df['weathersit'] = df['weathersit'].replace('?', mode_weather)
    # Print the remaining categories so they can be checked by eye;
    # pd.get_dummies below treats them as categories whether they are
    # numeric strings or words.
    print("Unique weathersit:", df['weathersit'].unique())

# One-Hot Encoding (the first level of each variable is dropped).
# NOTE(review): 'season' is not checked for '?' here - confirm it is clean.
df = pd.get_dummies(df, columns=['season', 'weathersit'], prefix=['season', 'weather'], drop_first=True)
print("Applied One-Hot Encoding.")
# B. Cyclic Encoding
def encode_cyclic(df, col, max_val):
    """Add sine and cosine encodings of a periodic column to ``df``.

    The angle ``2 * pi * df[col] / max_val`` is computed and its sine and
    cosine are stored in two new columns named ``<col>_sin`` and
    ``<col>_cos``. The frame is modified in place.

    Args:
        df: DataFrame that contains ``col``.
        col: Name of the column to encode.
        max_val: Divisor used as the length of one full cycle.

    Returns:
        The same ``df`` object, with the two columns added.
    """
    angle = 2 * np.pi * df[col] / max_val
    for suffix, trig in (('_sin', np.sin), ('_cos', np.cos)):
        df[col + suffix] = trig(angle)
    return df

# Cyclic source columns as (name, cycle length, fallback value).
# Each column is coerced to integer first; values that cannot be parsed are
# replaced with the fallback before the cast.
cyclic_specs = [('hr', 24, 0), ('mnth', 12, 1), ('weekday', 7, 0)]

for cyc_col, _, fallback in cyclic_specs:
    df[cyc_col] = pd.to_numeric(df[cyc_col], errors='coerce').fillna(fallback).astype(int)

# Add the <col>_sin / <col>_cos pair for each cyclic column.
for cyc_col, period, _ in cyclic_specs:
    df = encode_cyclic(df, cyc_col, period)
print("Applied Cyclic Encoding.")

# C. Scaling
# Min-max scale the continuous weather columns. The scaler is fitted on the
# whole table, i.e. before any train/test split.
scale_cols = ['temp', 'atemp', 'hum', 'windspeed']
scaler = MinMaxScaler()
df[scale_cols] = scaler.fit_transform(df[scale_cols])
print("Scaled features.")

# D. Cleanup
# Drop the raw date and the 'instant' column (ignored if absent).
drop_cols = ['dteday', 'instant']
df.drop(columns=drop_cols, inplace=True, errors='ignore')

# Save
df.to_csv('processed_bike_data.csv', index=False)
print("Saved processed_bike_data.csv")

# Correlation
# Restrict the matrix to numeric and boolean columns so that a leftover
# non-numeric column cannot make DataFrame.corr() raise.
corr_input = df.select_dtypes(include=['number', 'bool'])
plt.figure(figsize=(12, 10))
sns.heatmap(corr_input.corr(), annot=False, cmap='coolwarm')
plt.title('Correlation Matrix (Processed)')
plt.tight_layout()
plt.savefig('images/correlation_matrix_processed.png')
print("Saved correlation matrix.")


## 4. Model Building
Comparing Decision Tree, Random Forest, and Gradient Boosting.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import os

# Create the output directories. This cell writes both model files and
# comparison plots, so 'images' is needed here as well as 'models'.
for out_dir in ('models', 'images'):
    os.makedirs(out_dir, exist_ok=True)

# Load data
print("Loading data...")
try:
    df = pd.read_csv('processed_bike_data.csv')
except FileNotFoundError:
    print("Error: processed_bike_data.csv not found.")
    # Stop the cell/script. SystemExit is raised directly because exit() is
    # a helper installed by the `site` module for interactive use only.
    raise SystemExit(1)

# Define Target and Features
# Target is 'cnt'.
# 'casual' and 'registered' must be dropped because cnt = casual + registered
# (keeping them would leak the target into the features).
target = 'cnt'
drop_cols = ['casual', 'registered', 'cnt']
# Note: 'cnt' is in drop_cols only to define X; it is still used for y.

X = df.drop(columns=drop_cols, errors='ignore')
y = df[target]

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Split data: 80% train / 20% test, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Train set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Initialize Models
models = {
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42)
}

# Per-model metrics, keyed by display name.
results = {}

print("\n--- Model Training & Evaluation ---")
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)

    # Predictions on the held-out set
    y_pred = model.predict(X_test)

    # Evaluation
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    results[name] = {"MAE": mae, "RMSE": rmse, "R2": r2}

    print(f"{name} Results:")
    print(f"  MAE:  {mae:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  R2:   {r2:.4f}")

    # Save model under a file name derived from the display name,
    # e.g. "Random Forest" -> models/random_forest.joblib
    filename = f"models/{name.replace(' ', '_').lower()}.joblib"
    joblib.dump(model, filename)
    print(f"  Saved model to {filename}")

# Comparison Visualization
print("\n--- Generating Comparison Plot ---")
# One row per model, one column per metric.
results_df = pd.DataFrame(results).T
print(results_df)

plt.figure(figsize=(10, 6))
results_df['R2'].plot(kind='barh', color='skyblue')
plt.title('Model Comparison - R2 Score')
plt.xlabel('R2 Score')
plt.xlim(0, 1)
plt.tight_layout()
plt.savefig('images/model_comparison_r2.png')

plt.figure(figsize=(10, 6))
results_df['RMSE'].plot(kind='barh', color='salmon')
plt.title('Model Comparison - RMSE (Lower is Better)')
plt.xlabel('RMSE')
plt.tight_layout()
plt.savefig('images/model_comparison_rmse.png')

print("\nModel building complete. Results saved.")


## 5. Hyperparameter Tuning
Optimizing the Random Forest model for better accuracy.

In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Imports needed by this cell beyond the ones listed above it: `os` for the
# output directories and seaborn for the importance plot.
import os
import seaborn as sns

# This cell writes a model file and a plot, so make sure both target
# directories exist even when the cell is run on its own.
for out_dir in ('models', 'images'):
    os.makedirs(out_dir, exist_ok=True)

# Load
try:
    df = pd.read_csv('processed_bike_data.csv')
    print("Data loaded.")
except FileNotFoundError:
    print("Run preprocessing first.")
    # Stop the cell/script. SystemExit is raised directly because exit() is
    # a helper installed by the `site` module for interactive use only.
    raise SystemExit(1)

# Same target / feature definition as in model building: 'casual' and
# 'registered' are excluded together with the target itself.
target = 'cnt'
drop_cols = ['casual', 'registered', 'cnt']
X = df.drop(columns=drop_cols, errors='ignore')
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate values sampled by the randomized search.
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf = RandomForestRegressor(random_state=42)

# 20 sampled configurations, 3-fold cross-validation each, all cores.
print("Starting Randomized Search...")
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_dist,
                               n_iter=20, cv=3, verbose=2, random_state=42, n_jobs=-1)

rf_random.fit(X_train, y_train)

print("Best Parameters:", rf_random.best_params_)

best_rf = rf_random.best_estimator_

# Evaluation on the held-out test set
y_pred = best_rf.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\n--- Optimized Model Performance ---")
print(f"MAE:  {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R2:   {r2:.4f}")

# Save best model
joblib.dump(best_rf, 'models/best_random_forest.joblib')
print("Saved best_random_forest.joblib")

# Feature Importance: pair each feature column with its importance score
# and sort from most to least important.
importances = best_rf.feature_importances_
feature_names = X.columns
feature_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_df = feature_df.sort_values(by='Importance', ascending=False)

print("\nTop 5 Features:")
print(feature_df.head())

# Plot top 15
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=feature_df.head(15))
plt.title('Feature Importance (Optimized RF)')
plt.tight_layout()
plt.savefig('images/feature_importance.png')
print("Saved feature_importance.png")
