In [1]:
import datetime
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import zipfile
import io
import json 

In [2]:
from sklearn import datasets,ensemble,model_selection
from scipy.stats import anderson_ksamp

In [3]:
import requests

# Location of the zipped Bike Sharing dataset on the UCI archive.
DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip"

# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces HTTP errors here rather than as a confusing
# BadZipFile error when the response body is not an archive.
response = requests.get(DATA_URL, timeout=60)
response.raise_for_status()
content = response.content

# Unpack the archive in memory; both the archive and the member file handle
# are closed when the with-block exits.
with zipfile.ZipFile(io.BytesIO(content)) as arc, arc.open("hour.csv") as hour_csv:
    raw_data = pd.read_csv(hour_csv, header=0, sep=',', parse_dates=['dteday'])

🔹 zipfile.ZipFile(io.BytesIO(content))
- content: This is the binary content of the ZIP file, typically downloaded using requests.get(...).content.
- io.BytesIO(content): Wraps the binary content in a file-like object so it can be read by zipfile.
- zipfile.ZipFile(...): Opens the ZIP archive in memory, allowing access to its contents.
🔹 arc.open("hour.csv")
- Accesses the file named "hour.csv" inside the ZIP archive.
- Returns a file-like object that can be read by pandas.
🔹 pd.read_csv(...)
- Reads the CSV file into a pandas DataFrame.
- header=0: Uses the first row as column headers.
- sep=',': Specifies comma as the delimiter.
- parse_dates=['dteday']: Automatically parses the 'dteday' column as datetime objects.


In [4]:
# Preview the first rows of the loaded hourly data as a sanity check.
raw_data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [5]:
# Summarize column dtypes and non-null counts to confirm the load and date parsing.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   instant     17379 non-null  int64         
 1   dteday      17379 non-null  datetime64[ns]
 2   season      17379 non-null  int64         
 3   yr          17379 non-null  int64         
 4   mnth        17379 non-null  int64         
 5   hr          17379 non-null  int64         
 6   holiday     17379 non-null  int64         
 7   weekday     17379 non-null  int64         
 8   workingday  17379 non-null  int64         
 9   weathersit  17379 non-null  int64         
 10  temp        17379 non-null  float64       
 11  atemp       17379 non-null  float64       
 12  hum         17379 non-null  float64       
 13  windspeed   17379 non-null  float64       
 14  casual      17379 non-null  int64         
 15  registered  17379 non-null  int64         
 16  cnt         17379 non-

In [6]:
# Columns compared with the two-sample KS test and fed to the model as numeric inputs.
numerical_features = [
    'temp',
    'atemp',
    'hum',
    'windspeed',
    'mnth',
    'hr',
    'weekday',
]

# Columns compared with the chi-square test and fed to the model alongside the numeric ones.
categorical_features = [
    'season',
    'holiday',
    'workingday',
]

In [7]:
# Show the earliest and latest dates covered by the dataset.
date_column = raw_data['dteday']
(date_column.min(), date_column.max())

(Timestamp('2011-01-01 00:00:00'), Timestamp('2012-12-31 00:00:00'))

In [8]:
# Index the frame by date so the period slices below can use .loc with date strings.
# The guard makes the cell safe to re-run: once 'dteday' has become the index it is
# no longer a column, and calling set_index again would raise KeyError.
if raw_data.index.name != 'dteday':
    raw_data = raw_data.set_index('dteday')

In [9]:
# Display the frame to confirm that 'dteday' is now the index.
raw_data 

Unnamed: 0_level_0,instant,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
dteday,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2011-01-01,1,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000,3,13,16
2011-01-01,2,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000,8,32,40
2011-01-01,3,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000,5,27,32
2011-01-01,4,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000,3,10,13
2011-01-01,5,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-31,17375,1,1,12,19,0,1,1,2,0.26,0.2576,0.60,0.1642,11,108,119
2012-12-31,17376,1,1,12,20,0,1,1,2,0.26,0.2576,0.60,0.1642,8,81,89
2012-12-31,17377,1,1,12,21,0,1,1,1,0.26,0.2576,0.60,0.1642,7,83,90
2012-12-31,17378,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,13,48,61


In [10]:
# Inclusive (start, end) bounds of the two periods being compared for drift.
reference_window = ('2011-01-01 00:00:00', '2011-01-28 23:00:00')
current_window = ('2011-01-29 00:00:00', '2011-02-28 23:00:00')

# Baseline period: used for drift comparison and for training the model.
reference = raw_data.loc[reference_window[0]:reference_window[1]]
# Follow-up period: checked for drift against the baseline.
current = raw_data.loc[current_window[0]:current_window[1]]

In [13]:
# Two-sample Kolmogorov-Smirnov test per numerical feature.
# Null hypothesis: the reference and current samples come from the same distribution.
from scipy import stats

# Number of features for which the null hypothesis is rejected at the 5% level.
counter = 0
for feature in numerical_features:
    ks_result = stats.ks_2samp(reference[feature], current[feature])
    if ks_result.pvalue < 0.05:
        counter += 1

In [14]:
# Percentage of numerical features flagged as drifted. The denominator is taken
# from the feature list so it stays correct if the list is edited.
counter / len(numerical_features) * 100

71.42857142857143

In [21]:
from scipy.stats import chi2_contingency

# Number of categorical features for which the chi-square test rejects "no drift".
rejected_chi = 0 

def drift_chisq(sample_1,sample_2):
    """Return the chi-square p-value comparing two sets of category counts.

    Parameters
    ----------
    sample_1, sample_2 : pandas.Series or sequence of counts
        Per-category counts for each sample, e.g. the result of
        ``Series.value_counts()``. For a Series the index holds the category
        labels; a plain sequence is treated positionally.

    Returns
    -------
    float
        The p-value from ``scipy.stats.chi2_contingency`` for the 2 x k table
        built from the two samples.

    Notes
    -----
    ``value_counts()`` orders categories by frequency, so the two inputs may
    list the same categories in different orders, or one may lack a category
    entirely. The counts are therefore aligned on the union of category
    labels (missing categories count as 0) before the table is built, so
    each column of the table refers to a single category.
    """
    counts_1 = pd.Series(sample_1)
    counts_2 = pd.Series(sample_2)
    categories = counts_1.index.union(counts_2.index)
    aligned_1 = counts_1.reindex(categories, fill_value=0)
    aligned_2 = counts_2.reindex(categories, fill_value=0)
    # Index 1 of the chi2_contingency result is the p-value.
    return chi2_contingency([aligned_1.tolist(), aligned_2.tolist()])[1]

# Chi-square drift check per categorical feature: count rejections at the 5% level.
for feature in categorical_features:
    reference_counts = reference[feature].value_counts()
    current_counts = current[feature].value_counts()
    chi_pvalue = drift_chisq(reference_counts, current_counts)

    if chi_pvalue < .05:
        rejected_chi += 1

    

In [31]:
# Number of categorical features whose chi-square test rejected the null hypothesis.
rejected_chi

0

In [38]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the reference period for evaluation. The fixed random_state
# makes the split, and therefore the reference metrics logged to MLflow,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    reference[numerical_features + categorical_features],
    reference['cnt'],
    test_size=.3,
    random_state=0,
)

In [45]:
# Model building: fit a random forest on the reference training split,
# then predict on the held-out reference split.
from sklearn.ensemble import RandomForestRegressor

regressor_model = RandomForestRegressor(random_state=0).fit(X_train, y_train)
preds_test = regressor_model.predict(X_test)

In [46]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Baseline scores on the held-out part of the reference period.
ref_mae = mean_absolute_error(y_test, preds_test)
ref_mse = mean_squared_error(y_test, preds_test)
ref_r2 = r2_score(y_test, preds_test)

for metric_label, metric_value in (("MAE", ref_mae), ("MSE", ref_mse), ("R2", ref_r2)):
    print(metric_label, metric_value)

MAE 13.359193548387097
MSE 411.5931532258064
R2 0.8303827253204109


In [43]:
import mlflow

In [44]:
# Select the MLflow experiment that the runs below are logged under; per the
# saved log output, it is created on first use if it does not exist yet.
# NOTE(review): the name is spelled "Bicyle"; it is a runtime identifier, so
# correcting the spelling would log to a different experiment.
mlflow.set_experiment("Bicyle-Sharing")

2025/11/03 16:32:33 INFO mlflow.tracking.fluent: Experiment with name 'Bicyle-Sharing' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/Users/Rahil/OneDrive/Documents/GitHub/Web_API/mlruns/422079373624276455', creation_time=1762167753027, experiment_id='422079373624276455', last_update_time=1762167753027, lifecycle_stage='active', name='Bicyle-Sharing', tags={}>

In [47]:
# Log the baseline: reference-period metrics plus the fitted model artifact.
with mlflow.start_run():
    mlflow.set_tag('mlflow.runName', 'Refrence_run')

    for metric_label, metric_value in (("MAE", ref_mae), ("MSE", ref_mse), ("R2", ref_r2)):
        mlflow.log_metric(metric_label, metric_value)

    mlflow.sklearn.log_model(regressor_model, "regres_model")



In [48]:
# Launch the MLflow tracking UI from the notebook via a shell command.
# NOTE(review): the saved output shows ^C, so this cell appears to run until it is interrupted manually.
!mlflow ui

^C


In [None]:
# After starting the MLflow UI in the cell above, open http://127.0.0.1:5000 in a browser to inspect the logged runs manually.

In [None]:
# Follow-up period that the batches below are sliced from.
current = raw_data.loc['2011-01-29 00:00:00':'2011-02-28 23:00:00']

# Inclusive (start, end) bounds of each evaluation batch within the follow-up period.
experiment_batches = [
    ('2011-01-29 00:00:00', '2011-02-07 23:00:00'),
    ('2011-02-08 00:00:00', '2011-02-14 23:00:00'),
    ('2011-02-15 00:00:00', '2011-02-21 23:00:00'),
]

In [56]:
# Display the follow-up period slice to check its date coverage.
current

Unnamed: 0_level_0,instant,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
dteday,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2011-01-29,619,1,0,1,0,0,6,0,1,0.22,0.1970,0.64,0.3582,2,26,28
2011-01-29,620,1,0,1,1,0,6,0,1,0.22,0.2273,0.64,0.1940,0,20,20
2011-01-29,621,1,0,1,2,0,6,0,1,0.22,0.2273,0.64,0.1642,0,15,15
2011-01-29,622,1,0,1,3,0,6,0,1,0.20,0.2121,0.64,0.1343,3,5,8
2011-01-29,623,1,0,1,4,0,6,0,1,0.16,0.1818,0.69,0.1045,1,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2011-02-28,1333,1,0,2,19,0,1,1,3,0.44,0.4394,0.88,0.6119,1,79,80
2011-02-28,1334,1,0,2,20,0,1,1,3,0.44,0.4394,0.88,0.6119,0,45,45
2011-02-28,1335,1,0,2,21,0,1,1,2,0.38,0.3939,0.87,0.3881,2,78,80
2011-02-28,1336,1,0,2,22,0,1,1,3,0.34,0.3030,0.93,0.4179,4,72,76


In [80]:
# Score the trained model on each follow-up batch and log one MLflow run per batch.
for batch_start, batch_end in experiment_batches:
    with mlflow.start_run():
        mlflow.set_tag('mlflow.runName','Run'+str(batch_start)+":"+str(batch_end))
        current_data = current.loc[batch_start:batch_end]

        current_x = current_data[numerical_features + categorical_features]
        current_y   = current_data['cnt']
        current_pred = regressor_model.predict(current_x)

        mae= mean_absolute_error(current_y, current_pred)
        mse= mean_squared_error(current_y, current_pred)
        r2= r2_score(current_y, current_pred)

        # Log this batch's own metrics. Logging ref_mae/ref_mse/ref_r2 here
        # would record the reference scores for every run and hide any
        # degradation between batches.
        mlflow.log_metric("MAE",mae)
        mlflow.log_metric("MSE",mse)
        mlflow.log_metric("R2",r2)
        

In [None]:
# Relaunch the MLflow tracking UI to compare the per-batch runs with the reference run.
!mlflow ui

In [76]:
# Follow-up period that the batches below are sliced from.
current = raw_data.loc['2011-01-29 00:00:00':'2011-02-28 23:00:00']

# Inclusive (start, end) bounds of the batches to evaluate in this pass.
experiment_batches_2 = [
    ('2011-02-08 00:00:00', '2011-02-14 23:00:00'),
    ('2011-02-15 00:00:00', '2011-02-21 23:00:00'),
]

In [77]:
# Score the trained model on each batch in experiment_batches_2 and log one MLflow run per batch.
for batch_start, batch_end in experiment_batches_2:
    with mlflow.start_run():
        mlflow.set_tag('mlflow.runName','Run_'+str(batch_start)+":"+str(batch_end))
        current_data = current.loc[batch_start:batch_end]

        current_x = current_data[numerical_features + categorical_features]
        current_y   = current_data['cnt']
        current_pred = regressor_model.predict(current_x)

        mae= mean_absolute_error(current_y, current_pred)
        mse= mean_squared_error(current_y, current_pred)
        r2= r2_score(current_y, current_pred)

        # Log this batch's own metrics. Logging ref_mae/ref_mse/ref_r2 here
        # would record the reference scores for every run and hide any
        # degradation between batches.
        mlflow.log_metric("MAE",mae)
        mlflow.log_metric("MSE",mse)
        mlflow.log_metric("R2",r2)
        

In [78]:
# Launch the MLflow tracking UI to inspect the runs logged above.
# NOTE(review): the saved output shows ^C, so this cell appears to run until it is interrupted manually.
!mlflow ui

^C
