## Importing Packages

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import pickle

## Load the Data

In [None]:
merged = pd.read_csv('/content/drive/MyDrive/CSCI 5502 Data Mining/01 Project/Data/01_Preprocessed_Data/02_Master_Analysis_Data_With_Temp_10_31_12_07.csv')
merged.drop(columns='Unnamed: 0', inplace=True)
print("Does the DataFrame have nulls?", merged.isna().any().any())
merged.head()

Does the DataFrame have nulls? False


Unnamed: 0,cu_class_status,station_last_updated,status_last_reported,station_id,station_name,station_address,station_longitude,station_latitude,station_is_returning,station_is_renting,...,next_avl_dock_time,bike_wait_time,dock_wait_time,datetime_temperature,temperature_2m,precipitation_probability,rain,snowfall,snow_depth,visibility
0,Regular,1698732017,1698732020,bcycle_boulder_1855,Folsom & Colorado,SE corner of Folsom & Colorado,-105.26385,40.00811,1,1,...,2023-10-31 06:06:00,84.0,366.0,2023-10-31 00:00:00,1.8935,0,0.0,0,0.03,48700
1,Regular,1698732017,1698732020,bcycle_boulder_2763,20th & Pearl,1986 20th St.,-105.26952,40.01988,1,1,...,2023-10-31 08:39:00,603.0,519.0,2023-10-31 00:00:00,1.8935,0,0.0,0,0.03,48700
2,Regular,1698732017,1698732020,bcycle_boulder_1858,15th & Pearl,15th Street & Pearl Street,-105.27584,40.01872,1,1,...,2023-10-31 12:21:00,537.0,741.0,2023-10-31 00:00:00,1.8935,0,0.0,0,0.03,48700
3,Regular,1698732017,1698732020,bcycle_boulder_1859,11th & Pearl,11th Street & Pearl Street,-105.28116,40.01747,1,1,...,2023-10-31 14:30:00,24.0,870.0,2023-10-31 00:00:00,1.8935,0,0.0,0,0.03,48700
4,Regular,1698732017,1698732020,bcycle_boulder_1860,13th & Spruce,13th Street & Spruce Street,-105.2789,40.01909,1,1,...,2023-10-31 13:39:00,606.0,819.0,2023-10-31 00:00:00,1.8935,0,0.0,0,0.03,48700


In [None]:
# Station Name and ID mapping
all_id_names= merged[['station_id','station_name']].value_counts().reset_index()
id=all_id_names.station_id.values
name= all_id_names.station_name.values

stations_name_mapping=dict()
for i in range(len(id)):
  stations_name_mapping[id[i]]= name[i]
stations_name_mapping

{'bcycle_boulder_2756': 'Broadway & Baseline',
 'bcycle_boulder_2132': 'CU Recreation Center',
 'bcycle_boulder_2760': '13th & College',
 'bcycle_boulder_1872': '19th @ Boulder Creek',
 'bcycle_boulder_1855': 'Folsom & Colorado',
 'bcycle_boulder_2759': '21st & Arapahoe',
 'bcycle_boulder_3318': '18th & Euclid',
 'bcycle_boulder_2875': '35th & Colorado',
 'bcycle_boulder_2198': 'Twenty Ninth Street North',
 'bcycle_boulder_2144': 'Broadway & Euclid',
 'bcycle_boulder_3589': '30th & Marine',
 'bcycle_boulder_1943': 'The Village',
 'bcycle_boulder_1858': '15th & Pearl',
 'bcycle_boulder_2763': '20th & Pearl',
 'bcycle_boulder_1873': '28th & Canyon',
 'bcycle_boulder_2757': '29th & Pearl',
 'bcycle_boulder_2022': '14th & Canyon',
 'bcycle_boulder_2764': 'Folsom & Pearl',
 'bcycle_boulder_3894': 'Center for Community @ Regent Drive',
 'bcycle_boulder_2762': '30th & Glenwood',
 'bcycle_boulder_1871': 'Broadway & Alpine',
 'bcycle_boulder_2765': '28th & Boulder Creek',
 'bcycle_boulder_1860'

# Model Version 1:
* Top 10 stations
* Hours bucketed
* bikes_Avl, docks_avl, snow_depth, snowfall, rain- Considered in input
* Only 4 output class (with very low less than 20mins)
* One hot encoding
* Model- RF- Default

In [None]:
merged_df = merged

#Selecting top 10 popular stations
selected_stations = ['bcycle_boulder_3894', 'bcycle_boulder_2132', 'bcycle_boulder_2771','bcycle_boulder_4657','bcycle_boulder_1855','bcycle_boulder_2760','bcycle_boulder_2767','bcycle_boulder_4091','bcycle_boulder_2144','bcycle_boulder_2756']
df= merged_df.copy()
df= df[df['station_id'].isin(selected_stations)]

# Calculatig all bikes/docks available flag
df['all_bikes_avl_flag'] = (df['station_capacity'] == df['bikes_available']).astype(int)
df['all_docks_avl_flag'] = (df['station_capacity'] == df['docks_available']).astype(int)

# Select only the required columns (include month_rnd to split train data and remove it befor modelling)
req_cols= ['month_rnd','station_id','cu_class_status','docks_available','bikes_available','all_bikes_avl_flag',
           'all_docks_avl_flag','day_of_week_rnd','hour_rnd','temperature_2m','precipitation_probability','rain',
           'snowfall','snow_depth','visibility','bike_wait_time']
df= df[req_cols]

# Function to assign categories based on bike_wait_time
def categorize_wait_time(time):
    if time < 20:
        return "Very Low"
    elif 20 <= time < 40:
        return "Low"
    elif 40 <= time < 60:
        return "High"
    else:
        return "Very High"

# Apply the function to create a new column 'wait_time_category'
df['wait_time'] = df['bike_wait_time'].apply(categorize_wait_time)

#One hot encoding
columns_to_encode = ['station_id','cu_class_status','day_of_week_rnd']
df = pd.get_dummies(df,columns = columns_to_encode)

# Normalization
cols_to_normalize= ['temperature_2m','precipitation_probability','visibility']
scaler = MinMaxScaler()
df[cols_to_normalize] = scaler.fit_transform(df[cols_to_normalize])


#split the Dec data for testing and drop unwanted columns
train_data= df[df['month_rnd'] < 12].drop(columns= ['month_rnd','bike_wait_time'])
test_data= df[df['month_rnd'] == 12].drop(columns= ['month_rnd','bike_wait_time'])


X_bike= train_data.drop(columns= ['wait_time'])
Y_bike = train_data[['wait_time']]

# Split train data into 80:20
  #X_train, X_test, y_train, y_test = train_test_split(X_bike, Y_bike, test_size=0.2)

# December data for evaluation
test_x= test_data.drop(columns= ['wait_time'])
test_y= test_data[['wait_time']]


# Initialize the RandomForestClassifier
model = RandomForestClassifier()

# Train the classifier
model.fit(X_bike, Y_bike)

# Predict and evaluate the model on test data
print('-------------------Testing on November Data (same data used for training)-------------------')
y_pred = model.predict(X_bike)
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(Y_bike, y_pred)))
#print('Confusion Matrix')
#print(confusion_matrix(y_test, y_pred))
#print('Classification Report')
#print(classification_report(y_test, y_pred))

# Predict and evaluate the model on December data
pred_y = model.predict(test_x)

# 1. Test on the entire dataset
print('-------------------Testing on the entire December dataset-------------------')
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(test_y, pred_y)))
#print('Confusion Matrix')
#print(confusion_matrix(test_y, pred_y))
#print('Classification Report')
print(classification_report(test_y, pred_y))

# 2. Test on the popular times dataset
test_data['pred']= pred_y

## Filters the data for popular times
filter_regular_class= (test_data['cu_class_status_Regular']==1)
filter_mrng_to_eve= (test_data['hour_rnd'] >= 9) & (test_data['hour_rnd'] <= 20)

filted_data= test_data[filter_regular_class & filter_mrng_to_eve]
actual= filted_data['wait_time']
predicted= filted_data['pred']

print("-------------------Testing on December data's Popular Time-------------------")
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(actual, predicted)))


# 3. Check the accuracy of the December's popular time split by station IDs
print("Accuracy Split By Stations")
id=[]
name=[]
accuracy=[]

for i in selected_stations:
  col_name='station_id_'+i
  data= filted_data[filted_data[col_name]==1]

  actual= data['wait_time']
  predicted= data['pred']
  acc= accuracy_score(actual, predicted)

  id.append(i)
  name.append(stations_name_mapping[i])
  accuracy.append(acc)

accuracy_by_stations= pd.DataFrame({'Station_ID':id, 'Station_Name':name, 'Accuracy': accuracy})
accuracy_by_stations.sort_values(by='Accuracy', inplace=True, ascending=False)
display(accuracy_by_stations)

  model.fit(X_bike, Y_bike)


-------------------Testing on November Data (same data used for training)-------------------

Accuracy: 0.82

-------------------Testing on the entire December dataset-------------------

Accuracy: 0.62

              precision    recall  f1-score   support

        High       0.00      0.00      0.00      1248
         Low       0.21      0.05      0.08      3539
   Very High       0.58      0.74      0.65      5390
    Very Low       0.67      0.82      0.74     10504

    accuracy                           0.62     20681
   macro avg       0.36      0.40      0.37     20681
weighted avg       0.53      0.62      0.56     20681

-------------------Testing on December data's Popular Time-------------------

Accuracy: 0.73

Accuracy Split By Stations


Unnamed: 0,Station_ID,Station_Name,Accuracy
0,bcycle_boulder_3894,Center for Community @ Regent Drive,0.892193
2,bcycle_boulder_2771,Williams Village,0.836408
1,bcycle_boulder_2132,CU Recreation Center,0.822002
4,bcycle_boulder_1855,Folsom & Colorado,0.817949
6,bcycle_boulder_2767,18th & Colorado,0.731738
7,bcycle_boulder_4091,Timber Ridge @ Adams Circle,0.715548
5,bcycle_boulder_2760,13th & College,0.670777
9,bcycle_boulder_2756,Broadway & Baseline,0.637037
8,bcycle_boulder_2144,Broadway & Euclid,0.631307
3,bcycle_boulder_4657,Farrand Field,0.523121


In [None]:
# feature_importance
feature_importances = model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X_bike.columns,
    'Importance': feature_importances
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

                           Feature  Importance
4                         hour_rnd    0.291494
5                   temperature_2m    0.123163
10                      visibility    0.111645
0                  docks_available    0.107988
1                  bikes_available    0.093272
21        cu_class_status_No Class    0.028622
22         cu_class_status_Regular    0.025993
16  station_id_bcycle_boulder_2767    0.015834
18  station_id_bcycle_boulder_3894    0.015119
11  station_id_bcycle_boulder_1855    0.014448
14  station_id_bcycle_boulder_2756    0.013462
12  station_id_bcycle_boulder_2132    0.012603
19  station_id_bcycle_boulder_4091    0.012158
3               all_docks_avl_flag    0.011646
6        precipitation_probability    0.011549
20  station_id_bcycle_boulder_4657    0.011500
13  station_id_bcycle_boulder_2144    0.011481
15  station_id_bcycle_boulder_2760    0.011343
25        day_of_week_rnd_Saturday    0.010848
17  station_id_bcycle_boulder_2771    0.009368
29       day_

# Model Version 2:
* Top 10 stations
* Hours bucketed
* bikes_Avl, docks_avl- considered in input
* Only 4 output class (with very low less than 20mins)
* One hot encoding
* Model- RF- Default

----
* snow_depth, snowfall, rain- Not considered in input

In [None]:
merged_df = merged

#Selecting top 10 popular stations
selected_stations = ['bcycle_boulder_3894', 'bcycle_boulder_2132', 'bcycle_boulder_2771','bcycle_boulder_4657','bcycle_boulder_1855','bcycle_boulder_2760','bcycle_boulder_2767','bcycle_boulder_4091','bcycle_boulder_2144','bcycle_boulder_2756']
df= merged_df.copy()
df= df[df['station_id'].isin(selected_stations)]

# Calculatig all bikes/docks available flag
df['all_bikes_avl_flag'] = (df['station_capacity'] == df['bikes_available']).astype(int)
df['all_docks_avl_flag'] = (df['station_capacity'] == df['docks_available']).astype(int)

# Select only the required columns (include month_rnd to split train data and remove it befor modelling)
req_cols= ['month_rnd','station_id','cu_class_status','docks_available','bikes_available','all_bikes_avl_flag',
           'all_docks_avl_flag','day_of_week_rnd','hour_rnd','temperature_2m','precipitation_probability',
           'visibility','bike_wait_time'] #'snow_depth', 'snowfall', 'rain'
df= df[req_cols]

# Function to assign categories based on bike_wait_time
def categorize_wait_time(time):
    if time < 20:
        return "Very Low"
    elif 20 <= time < 40:
        return "Low"
    elif 40 <= time < 60:
        return "High"
    else:
        return "Very High"

# Apply the function to create a new column 'wait_time_category'
df['wait_time'] = df['bike_wait_time'].apply(categorize_wait_time)

#One hot encoding
columns_to_encode = ['station_id','cu_class_status','day_of_week_rnd']
df = pd.get_dummies(df,columns = columns_to_encode)

# Normalization
cols_to_normalize= ['temperature_2m','precipitation_probability','visibility']
scaler = MinMaxScaler()
df[cols_to_normalize] = scaler.fit_transform(df[cols_to_normalize])


#split the Dec data for testing and drop unwanted columns
train_data= df[df['month_rnd'] < 12].drop(columns= ['month_rnd','bike_wait_time'])
test_data= df[df['month_rnd'] == 12].drop(columns= ['month_rnd','bike_wait_time'])


X_bike= train_data.drop(columns= ['wait_time'])
Y_bike = train_data[['wait_time']]

# Split train data into 80:20
  #X_train, X_test, y_train, y_test = train_test_split(X_bike, Y_bike, test_size=0.2)

# December data for evaluation
test_x= test_data.drop(columns= ['wait_time'])
test_y= test_data[['wait_time']]


# Initialize the RandomForestClassifier
model = RandomForestClassifier()

# Train the classifier
model.fit(X_bike, Y_bike)

# Predict and evaluate the model on test data
print('-------------------Testing on November Data (same data used for training)-------------------')
y_pred = model.predict(X_bike)
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(Y_bike, y_pred)))
#print('Confusion Matrix')
#print(confusion_matrix(y_test, y_pred))
#print('Classification Report')
#print(classification_report(y_test, y_pred))

# Predict and evaluate the model on December data
pred_y = model.predict(test_x)

# 1. Test on the entire dataset
print('-------------------Testing on the entire December dataset-------------------')
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(test_y, pred_y)))
#print('Confusion Matrix')
#print(confusion_matrix(test_y, pred_y))
#print('Classification Report')
print(classification_report(test_y, pred_y))

# 2. Test on the popular times dataset
test_data['pred']= pred_y

## Filters the data for popular times
filter_regular_class= (test_data['cu_class_status_Regular']==1)
filter_mrng_to_eve= (test_data['hour_rnd'] >= 9) & (test_data['hour_rnd'] <= 20)

filted_data= test_data[filter_regular_class & filter_mrng_to_eve]
actual= filted_data['wait_time']
predicted= filted_data['pred']

print("-------------------Testing on December data's Popular Time-------------------")
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(actual, predicted)))


# 3. Check the accuracy of the December's popular time split by station IDs
print("Accuracy Split By Stations")
id=[]
name=[]
accuracy=[]

for i in selected_stations:
  col_name='station_id_'+i
  data= filted_data[filted_data[col_name]==1]

  actual= data['wait_time']
  predicted= data['pred']
  acc= accuracy_score(actual, predicted)

  id.append(i)
  name.append(stations_name_mapping[i])
  accuracy.append(acc)

accuracy_by_stations= pd.DataFrame({'Station_ID':id, 'Station_Name':name, 'Accuracy': accuracy})
accuracy_by_stations.sort_values(by='Accuracy', inplace=True, ascending=False)
display(accuracy_by_stations)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['all_bikes_avl_flag'] = (df['station_capacity'] == df['bikes_available']).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['all_docks_avl_flag'] = (df['station_capacity'] == df['docks_available']).astype(int)
  model.fit(X_bike, Y_bike)


-------------------Testing on November Data (same data used for training)-------------------

Accuracy: 0.82

-------------------Testing on the entire December dataset-------------------

Accuracy: 0.62

              precision    recall  f1-score   support

        High       0.05      0.00      0.01      1248
         Low       0.24      0.07      0.11      3539
   Very High       0.59      0.74      0.66      5390
    Very Low       0.67      0.81      0.73     10504

    accuracy                           0.62     20681
   macro avg       0.39      0.41      0.38     20681
weighted avg       0.54      0.62      0.56     20681

-------------------Testing on December data's Popular Time-------------------

Accuracy: 0.73

Accuracy Split By Stations


Unnamed: 0,Station_ID,Station_Name,Accuracy
0,bcycle_boulder_3894,Center for Community @ Regent Drive,0.892193
2,bcycle_boulder_2771,Williams Village,0.822878
1,bcycle_boulder_2132,CU Recreation Center,0.822002
4,bcycle_boulder_1855,Folsom & Colorado,0.803846
6,bcycle_boulder_2767,18th & Colorado,0.739295
7,bcycle_boulder_4091,Timber Ridge @ Adams Circle,0.704947
8,bcycle_boulder_2144,Broadway & Euclid,0.666235
5,bcycle_boulder_2760,13th & College,0.665845
9,bcycle_boulder_2756,Broadway & Baseline,0.646914
3,bcycle_boulder_4657,Farrand Field,0.526012


In [None]:
# feature_importance
feature_importances = model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X_bike.columns,
    'Importance': feature_importances
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

                           Feature  Importance
4                         hour_rnd    0.299315
5                   temperature_2m    0.122362
7                       visibility    0.111403
0                  docks_available    0.108148
1                  bikes_available    0.090648
18        cu_class_status_No Class    0.029154
19         cu_class_status_Regular    0.027066
13  station_id_bcycle_boulder_2767    0.016604
15  station_id_bcycle_boulder_3894    0.015166
8   station_id_bcycle_boulder_1855    0.014418
11  station_id_bcycle_boulder_2756    0.013289
9   station_id_bcycle_boulder_2132    0.012974
6        precipitation_probability    0.012171
17  station_id_bcycle_boulder_4657    0.012032
16  station_id_bcycle_boulder_4091    0.011860
12  station_id_bcycle_boulder_2760    0.011658
10  station_id_bcycle_boulder_2144    0.011541
22        day_of_week_rnd_Saturday    0.011479
3               all_docks_avl_flag    0.011424
26       day_of_week_rnd_Wednesday    0.009724
14  station_i

# Model Version 3:
* Top 10 stations
* Hours bucketed
* Only 4 output class (with very low less than 20mins)
* One hot encoding
* Model- RF- Default

----
* bikes_Avl, docks_avl, snow_depth, snowfall, rain- Not considered in input

In [None]:
merged_df = merged

#Selecting top 10 popular stations
selected_stations = ['bcycle_boulder_3894', 'bcycle_boulder_2132', 'bcycle_boulder_2771','bcycle_boulder_4657','bcycle_boulder_1855','bcycle_boulder_2760','bcycle_boulder_2767','bcycle_boulder_4091','bcycle_boulder_2144','bcycle_boulder_2756']
df= merged_df.copy()
df= df[df['station_id'].isin(selected_stations)]

# Calculatig all bikes/docks available flag
df['all_bikes_avl_flag'] = (df['station_capacity'] == df['bikes_available']).astype(int)
df['all_docks_avl_flag'] = (df['station_capacity'] == df['docks_available']).astype(int)

# Select only the required columns (include month_rnd to split train data and remove it befor modelling)
req_cols= ['month_rnd','station_id','cu_class_status','all_bikes_avl_flag',
           'all_docks_avl_flag','day_of_week_rnd','hour_rnd','temperature_2m','precipitation_probability',
           'visibility','bike_wait_time'] #'snow_depth', 'snowfall', 'rain', 'docks_available', 'bikes_available'
df= df[req_cols]

# Function to assign categories based on bike_wait_time
def categorize_wait_time(time):
    if time < 20:
        return "Very Low"
    elif 20 <= time < 40:
        return "Low"
    elif 40 <= time < 60:
        return "High"
    else:
        return "Very High"

# Apply the function to create a new column 'wait_time_category'
df['wait_time'] = df['bike_wait_time'].apply(categorize_wait_time)

#One hot encoding
columns_to_encode = ['station_id','cu_class_status','day_of_week_rnd']
df = pd.get_dummies(df,columns = columns_to_encode)

# Normalization
cols_to_normalize= ['temperature_2m','precipitation_probability','visibility']
scaler = MinMaxScaler()
df[cols_to_normalize] = scaler.fit_transform(df[cols_to_normalize])


#split the Dec data for testing and drop unwanted columns
train_data= df[df['month_rnd'] < 12].drop(columns= ['month_rnd','bike_wait_time'])
test_data= df[df['month_rnd'] == 12].drop(columns= ['month_rnd','bike_wait_time'])


X_bike= train_data.drop(columns= ['wait_time'])
Y_bike = train_data[['wait_time']]

# Split train data into 80:20
  #X_train, X_test, y_train, y_test = train_test_split(X_bike, Y_bike, test_size=0.2)

# December data for evaluation
test_x= test_data.drop(columns= ['wait_time'])
test_y= test_data[['wait_time']]


# Initialize the RandomForestClassifier
model = RandomForestClassifier()

# Train the classifier
model.fit(X_bike, Y_bike)

# Predict and evaluate the model on test data
print('-------------------Testing on November Data (same data used for training)-------------------')
y_pred = model.predict(X_bike)
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(Y_bike, y_pred)))
#print('Confusion Matrix')
#print(confusion_matrix(y_test, y_pred))
#print('Classification Report')
#print(classification_report(y_test, y_pred))

# Predict and evaluate the model on December data
pred_y = model.predict(test_x)

# 1. Test on the entire dataset
print('-------------------Testing on the entire December dataset-------------------')
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(test_y, pred_y)))
#print('Confusion Matrix')
#print(confusion_matrix(test_y, pred_y))
#print('Classification Report')
print(classification_report(test_y, pred_y))

# 2. Test on the popular times dataset
test_data['pred']= pred_y

## Filters the data for popular times
filter_regular_class= (test_data['cu_class_status_Regular']==1)
filter_mrng_to_eve= (test_data['hour_rnd'] >= 9) & (test_data['hour_rnd'] <= 20)

filted_data= test_data[filter_regular_class & filter_mrng_to_eve]
actual= filted_data['wait_time']
predicted= filted_data['pred']

print("-------------------Testing on December data's Popular Time-------------------")
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(actual, predicted)))


# 3. Check the accuracy of the December's popular time split by station IDs
print("Accuracy Split By Stations")
id=[]
name=[]
accuracy=[]

for i in selected_stations:
  col_name='station_id_'+i
  data= filted_data[filted_data[col_name]==1]

  actual= data['wait_time']
  predicted= data['pred']
  acc= accuracy_score(actual, predicted)

  id.append(i)
  name.append(stations_name_mapping[i])
  accuracy.append(acc)

accuracy_by_stations= pd.DataFrame({'Station_ID':id, 'Station_Name':name, 'Accuracy': accuracy})
accuracy_by_stations.sort_values(by='Accuracy', inplace=True, ascending=False)
display(accuracy_by_stations)

  model.fit(X_bike, Y_bike)


-------------------Testing on November Data (same data used for training)-------------------

Accuracy: 0.77

-------------------Testing on the entire December dataset-------------------

Accuracy: 0.63

              precision    recall  f1-score   support

        High       0.08      0.00      0.01      1248
         Low       0.23      0.04      0.08      3539
   Very High       0.58      0.78      0.66      5390
    Very Low       0.68      0.82      0.74     10504

    accuracy                           0.63     20681
   macro avg       0.39      0.41      0.37     20681
weighted avg       0.54      0.63      0.56     20681

-------------------Testing on December data's Popular Time-------------------

Accuracy: 0.73

Accuracy Split By Stations


Unnamed: 0,Station_ID,Station_Name,Accuracy
0,bcycle_boulder_3894,Center for Community @ Regent Drive,0.890954
2,bcycle_boulder_2771,Williams Village,0.846248
1,bcycle_boulder_2132,CU Recreation Center,0.822002
4,bcycle_boulder_1855,Folsom & Colorado,0.819231
7,bcycle_boulder_4091,Timber Ridge @ Adams Circle,0.768551
6,bcycle_boulder_2767,18th & Colorado,0.714106
9,bcycle_boulder_2756,Broadway & Baseline,0.665432
5,bcycle_boulder_2760,13th & College,0.653514
8,bcycle_boulder_2144,Broadway & Euclid,0.620957
3,bcycle_boulder_4657,Farrand Field,0.488439


In [None]:
# feature_importance
feature_importances = model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X_bike.columns,
    'Importance': feature_importances
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

                           Feature  Importance
2                         hour_rnd    0.350279
3                   temperature_2m    0.134708
5                       visibility    0.123613
1               all_docks_avl_flag    0.041127
16        cu_class_status_No Class    0.036927
17         cu_class_status_Regular    0.030734
11  station_id_bcycle_boulder_2767    0.023716
6   station_id_bcycle_boulder_1855    0.022066
15  station_id_bcycle_boulder_4657    0.020877
7   station_id_bcycle_boulder_2132    0.020387
9   station_id_bcycle_boulder_2756    0.020282
13  station_id_bcycle_boulder_3894    0.020226
14  station_id_bcycle_boulder_4091    0.018886
12  station_id_bcycle_boulder_2771    0.018181
10  station_id_bcycle_boulder_2760    0.018165
8   station_id_bcycle_boulder_2144    0.017971
4        precipitation_probability    0.013834
20        day_of_week_rnd_Saturday    0.012819
24       day_of_week_rnd_Wednesday    0.010282
23         day_of_week_rnd_Tuesday    0.009459
22        day

# Model Version 4:
* Top 10 stations
* Hours bucketed
* Only 4 output class (with very low less than 20mins)
* Model- RF- Default

----
* bikes_Avl, docks_avl, snow_depth, snowfall, rain- Not considered in input
* Label encoding

In [None]:
merged_df = merged

#Selecting top 10 popular stations
selected_stations = ['bcycle_boulder_3894', 'bcycle_boulder_2132', 'bcycle_boulder_2771','bcycle_boulder_4657','bcycle_boulder_1855','bcycle_boulder_2760','bcycle_boulder_2767','bcycle_boulder_4091','bcycle_boulder_2144','bcycle_boulder_2756']
df= merged_df.copy()
df= df[df['station_id'].isin(selected_stations)]

# Calculatig all bikes/docks available flag
df['all_bikes_avl_flag'] = (df['station_capacity'] == df['bikes_available']).astype(int)
df['all_docks_avl_flag'] = (df['station_capacity'] == df['docks_available']).astype(int)

# Select only the required columns (include month_rnd to split train data and remove it befor modelling)
req_cols= ['month_rnd','station_id','cu_class_status','all_bikes_avl_flag',
           'all_docks_avl_flag','day_of_week_rnd','hour_rnd','temperature_2m','precipitation_probability',
           'visibility','bike_wait_time'] #'snow_depth', 'snowfall', 'rain', 'docks_available', 'bikes_available'
df= df[req_cols]

# Function to assign categories based on bike_wait_time
def categorize_wait_time(time):
    if time < 20:
        return "Very Low"
    elif 20 <= time < 40:
        return "Low"
    elif 40 <= time < 60:
        return "High"
    else:
        return "Very High"

# Apply the function to create a new column 'wait_time_category'
df['wait_time'] = df['bike_wait_time'].apply(categorize_wait_time)

# Label Encoding
columns_to_encode = ['cu_class_status', 'day_of_week_rnd', 'station_id']
label_encoder = LabelEncoder()
for col in columns_to_encode:
    df[col + '_encoded'] = label_encoder.fit_transform(df[col])
df.drop(columns=columns_to_encode, inplace=True)

# Normalization
cols_to_normalize= ['temperature_2m','precipitation_probability','visibility']
scaler = MinMaxScaler()
df[cols_to_normalize] = scaler.fit_transform(df[cols_to_normalize])


#split the Dec data for testing and drop unwanted columns
train_data= df[df['month_rnd'] < 12].drop(columns= ['month_rnd','bike_wait_time'])
test_data= df[df['month_rnd'] == 12].drop(columns= ['month_rnd','bike_wait_time'])


X_bike= train_data.drop(columns= ['wait_time'])
Y_bike = train_data[['wait_time']]

# Split train data into 80:20
  #X_train, X_test, y_train, y_test = train_test_split(X_bike, Y_bike, test_size=0.2)

# December data for evaluation
test_x= test_data.drop(columns= ['wait_time'])
test_y= test_data[['wait_time']]


# Initialize the RandomForestClassifier
model = RandomForestClassifier()

# Train the classifier
model.fit(X_bike, Y_bike)

# Predict and evaluate the model on test data
print('-------------------Testing on November Data (same data used for training)-------------------')
y_pred = model.predict(X_bike)
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(Y_bike, y_pred)))
#print('Confusion Matrix')
#print(confusion_matrix(y_test, y_pred))
#print('Classification Report')
#print(classification_report(y_test, y_pred))

# Predict and evaluate the model on December data
pred_y = model.predict(test_x)

# 1. Test on the entire dataset
print('-------------------Testing on the entire December dataset-------------------')
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(test_y, pred_y)))
#print('Confusion Matrix')
#print(confusion_matrix(test_y, pred_y))
#print('Classification Report')
print(classification_report(test_y, pred_y))

# 2. Test on the popular times dataset
test_data['pred']= pred_y

## Filters the data for popular times
filter_regular_class= (test_data['cu_class_status_encoded']==1)
filter_mrng_to_eve= (test_data['hour_rnd'] >= 9) & (test_data['hour_rnd'] <= 20)

filted_data= test_data[filter_regular_class & filter_mrng_to_eve]
actual= filted_data['wait_time']
predicted= filted_data['pred']

print("-------------------Testing on December data's Popular Time-------------------")
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(actual, predicted)))

# 3. Check the accuracy of the December's popular time split by station IDs
print("Accuracy Split By Stations")
id=[]
accuracy=[]

for i in filted_data['station_id_encoded'].unique():
  data= filted_data[filted_data["station_id_encoded"]==i]

  actual= data['wait_time']
  predicted= data['pred']
  acc= accuracy_score(actual, predicted)

  id.append(i)
  accuracy.append(acc)

accuracy_by_stations= pd.DataFrame({'Station_ID':id, 'Accuracy': accuracy})
accuracy_by_stations.sort_values(by='Accuracy', inplace=True, ascending=False)
display(accuracy_by_stations)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['all_bikes_avl_flag'] = (df['station_capacity'] == df['bikes_available']).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['all_docks_avl_flag'] = (df['station_capacity'] == df['docks_available']).astype(int)
  model.fit(X_bike, Y_bike)


-------------------Testing on November Data (same data used for training)-------------------

Accuracy: 0.77

-------------------Testing on the entire December dataset-------------------

Accuracy: 0.63

              precision    recall  f1-score   support

        High       0.22      0.01      0.02      1248
         Low       0.20      0.04      0.07      3539
   Very High       0.58      0.80      0.67      5390
    Very Low       0.69      0.82      0.75     10504

    accuracy                           0.63     20681
   macro avg       0.42      0.42      0.38     20681
weighted avg       0.55      0.63      0.57     20681

-------------------Testing on December data's Popular Time-------------------

Accuracy: 0.74

Accuracy Split By Stations


Unnamed: 0,Station_ID,Accuracy
8,7,0.890954
5,6,0.846248
3,1,0.822002
4,0,0.819231
9,8,0.736749
6,5,0.7267
0,4,0.675709
2,2,0.65718
1,3,0.650617
7,9,0.533237


In [None]:
# feature_importance
feature_importances = model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X_bike.columns,
    'Importance': feature_importances
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

                     Feature  Importance
2                   hour_rnd    0.331614
8         station_id_encoded    0.304359
3             temperature_2m    0.103797
5                 visibility    0.092691
6    cu_class_status_encoded    0.067380
7    day_of_week_rnd_encoded    0.045851
1         all_docks_avl_flag    0.037662
4  precipitation_probability    0.011692
0         all_bikes_avl_flag    0.004955


# Model Version 5:
* Hours bucketed
* Only 4 output class (with very low less than 20mins)
* Model- RF- Default

----
* bikes_Avl, docks_avl, snow_depth, snowfall, rain- Not considered in input
* Label encoding
* Only C4C station selected

In [None]:
merged_df = merged

#Selecting top 10 popular stations
selected_stations = ['bcycle_boulder_3894'] ## 'bcycle_boulder_2132', 'bcycle_boulder_2771','bcycle_boulder_4657','bcycle_boulder_1855','bcycle_boulder_2760','bcycle_boulder_2767','bcycle_boulder_4091','bcycle_boulder_2144','bcycle_boulder_2756']
df= merged_df.copy()
df= df[df['station_id'].isin(selected_stations)]

# Calculatig all bikes/docks available flag
df['all_bikes_avl_flag'] = (df['station_capacity'] == df['bikes_available']).astype(int)
df['all_docks_avl_flag'] = (df['station_capacity'] == df['docks_available']).astype(int)

# Select only the required columns (include month_rnd to split train data and remove it befor modelling)
req_cols= ['month_rnd','station_id','cu_class_status','all_bikes_avl_flag',
           'all_docks_avl_flag','day_of_week_rnd','hour_rnd','temperature_2m','precipitation_probability',
           'visibility','bike_wait_time'] #'snow_depth', 'snowfall', 'rain', 'docks_available', 'bikes_available'
df= df[req_cols]

# Function to assign categories based on bike_wait_time
def categorize_wait_time(time):
    if time < 20:
        return "Very Low"
    elif 20 <= time < 40:
        return "Low"
    elif 40 <= time < 60:
        return "High"
    else:
        return "Very High"

# Apply the function to create a new column 'wait_time_category'
df['wait_time'] = df['bike_wait_time'].apply(categorize_wait_time)

# Label Encoding
columns_to_encode = ['cu_class_status', 'day_of_week_rnd', 'station_id']
label_encoder = LabelEncoder()
for col in columns_to_encode:
    df[col + '_encoded'] = label_encoder.fit_transform(df[col])
df.drop(columns=columns_to_encode, inplace=True)

# Normalization
cols_to_normalize= ['temperature_2m','precipitation_probability','visibility']
scaler = MinMaxScaler()
df[cols_to_normalize] = scaler.fit_transform(df[cols_to_normalize])


#split the Dec data for testing and drop unwanted columns
train_data= df[df['month_rnd'] < 12].drop(columns= ['month_rnd','bike_wait_time'])
test_data= df[df['month_rnd'] == 12].drop(columns= ['month_rnd','bike_wait_time'])


X_bike= train_data.drop(columns= ['wait_time'])
Y_bike = train_data[['wait_time']]

# Split train data into 80:20
  #X_train, X_test, y_train, y_test = train_test_split(X_bike, Y_bike, test_size=0.2)

# December data for evaluation
test_x= test_data.drop(columns= ['wait_time'])
test_y= test_data[['wait_time']]


# Initialize the RandomForestClassifier
model = RandomForestClassifier()

# Train the classifier
model.fit(X_bike, Y_bike)

# Predict and evaluate the model on test data
print('-------------------Testing on November Data (same data used for training)-------------------')
y_pred = model.predict(X_bike)
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(Y_bike, y_pred)))
#print('Confusion Matrix')
#print(confusion_matrix(y_test, y_pred))
#print('Classification Report')
#print(classification_report(y_test, y_pred))

# Predict and evaluate the model on December data
pred_y = model.predict(test_x)

# 1. Test on the entire dataset
print('-------------------Testing on the entire December dataset-------------------')
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(test_y, pred_y)))
#print('Confusion Matrix')
#print(confusion_matrix(test_y, pred_y))
#print('Classification Report')
print(classification_report(test_y, pred_y))

# 2. Test on the popular times dataset
test_data['pred']= pred_y

## Filters the data for popular times
filter_regular_class= (test_data['cu_class_status_encoded']==1)
filter_mrng_to_eve= (test_data['hour_rnd'] >= 9) & (test_data['hour_rnd'] <= 20)

filted_data= test_data[filter_regular_class & filter_mrng_to_eve]
actual= filted_data['wait_time']
predicted= filted_data['pred']

print("-------------------Testing on December data's Popular Time-------------------")
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(actual, predicted)))

# 3. Check the accuracy of the December's popular time split by station IDs
print("Accuracy Split By Stations")
id=[]
accuracy=[]

for i in filted_data['station_id_encoded'].unique():
  data= filted_data[filted_data["station_id_encoded"]==i]

  actual= data['wait_time']
  predicted= data['pred']
  acc= accuracy_score(actual, predicted)

  id.append(i)
  accuracy.append(acc)

accuracy_by_stations= pd.DataFrame({'Station_ID':id, 'Accuracy': accuracy})
accuracy_by_stations.sort_values(by='Accuracy', inplace=True, ascending=False)
display(accuracy_by_stations)


# feature_importance
feature_importances = model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X_bike.columns,
    'Importance': feature_importances
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

  model.fit(X_bike, Y_bike)


-------------------Testing on November Data (same data used for training)-------------------

Accuracy: 0.78

-------------------Testing on the entire December dataset-------------------

Accuracy: 0.74

              precision    recall  f1-score   support

        High       0.00      0.00      0.00        78
         Low       0.28      0.04      0.07       272
   Very High       0.48      0.78      0.59       271
    Very Low       0.83      0.90      0.86      1580

    accuracy                           0.74      2201
   macro avg       0.40      0.43      0.38      2201
weighted avg       0.69      0.74      0.70      2201

-------------------Testing on December data's Popular Time-------------------

Accuracy: 0.89

Accuracy Split By Stations


Unnamed: 0,Station_ID,Accuracy
0,0,0.890954


                     Feature  Importance
2                   hour_rnd    0.336151
3             temperature_2m    0.197186
5                 visibility    0.179413
6    cu_class_status_encoded    0.117040
7    day_of_week_rnd_encoded    0.099965
1         all_docks_avl_flag    0.043194
4  precipitation_probability    0.026334
0         all_bikes_avl_flag    0.000717
8         station_id_encoded    0.000000


# Model Version 6:
* Hours bucketed
* Model- RF- Default

----
* bikes_Avl, docks_avl, snow_depth, snowfall, rain- Not considered in input
* Label encoding
* Only C4C station selected
* Only 4 output class (with very low less than 15mins)

In [None]:
merged_df = merged

#Selecting top 10 popular stations
selected_stations = ['bcycle_boulder_3894'] ## 'bcycle_boulder_2132', 'bcycle_boulder_2771','bcycle_boulder_4657','bcycle_boulder_1855','bcycle_boulder_2760','bcycle_boulder_2767','bcycle_boulder_4091','bcycle_boulder_2144','bcycle_boulder_2756']
df= merged_df.copy()
df= df[df['station_id'].isin(selected_stations)]

# Calculatig all bikes/docks available flag
df['all_bikes_avl_flag'] = (df['station_capacity'] == df['bikes_available']).astype(int)
df['all_docks_avl_flag'] = (df['station_capacity'] == df['docks_available']).astype(int)

# Select only the required columns (include month_rnd to split train data and remove it befor modelling)
req_cols= ['month_rnd','station_id','cu_class_status','all_bikes_avl_flag',
           'all_docks_avl_flag','day_of_week_rnd','hour_rnd','temperature_2m','precipitation_probability',
           'visibility','bike_wait_time'] #'snow_depth', 'snowfall', 'rain', 'docks_available', 'bikes_available'
df= df[req_cols]

# Function to assign categories based on bike_wait_time
def categorize_wait_time(time):
    if time < 15:
        return "Very Low"
    elif 15 <= time < 30:
        return "Low"
    elif 30 <= time < 60:
        return "High"
    else:
        return "Very High"

# Apply the function to create a new column 'wait_time_category'
df['wait_time'] = df['bike_wait_time'].apply(categorize_wait_time)

# Label Encoding
columns_to_encode = ['cu_class_status', 'day_of_week_rnd', 'station_id']
label_encoder = LabelEncoder()
for col in columns_to_encode:
    df[col + '_encoded'] = label_encoder.fit_transform(df[col])
df.drop(columns=columns_to_encode, inplace=True)

# Normalization
cols_to_normalize= ['temperature_2m','precipitation_probability','visibility']
scaler = MinMaxScaler()
df[cols_to_normalize] = scaler.fit_transform(df[cols_to_normalize])


#split the Dec data for testing and drop unwanted columns
train_data= df[df['month_rnd'] < 12].drop(columns= ['month_rnd','bike_wait_time'])
test_data= df[df['month_rnd'] == 12].drop(columns= ['month_rnd','bike_wait_time'])


X_bike= train_data.drop(columns= ['wait_time'])
Y_bike = train_data[['wait_time']]

# Split train data into 80:20
  #X_train, X_test, y_train, y_test = train_test_split(X_bike, Y_bike, test_size=0.2)

# December data for evaluation
test_x= test_data.drop(columns= ['wait_time'])
test_y= test_data[['wait_time']]


# Initialize the RandomForestClassifier
model = RandomForestClassifier()

# Train the classifier
model.fit(X_bike, Y_bike)

# Predict and evaluate the model on test data
print('-------------------Testing on November Data (same data used for training)-------------------')
y_pred = model.predict(X_bike)
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(Y_bike, y_pred)))
#print('Confusion Matrix')
#print(confusion_matrix(y_test, y_pred))
#print('Classification Report')
#print(classification_report(y_test, y_pred))

# Predict and evaluate the model on December data
pred_y = model.predict(test_x)

# 1. Test on the entire dataset
print('-------------------Testing on the entire December dataset-------------------')
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(test_y, pred_y)))
#print('Confusion Matrix')
#print(confusion_matrix(test_y, pred_y))
#print('Classification Report')
print(classification_report(test_y, pred_y))

# 2. Test on the popular times dataset
test_data['pred']= pred_y

## Filters the data for popular times
filter_regular_class= (test_data['cu_class_status_encoded']==1)
filter_mrng_to_eve= (test_data['hour_rnd'] >= 9) & (test_data['hour_rnd'] <= 20)

filted_data= test_data[filter_regular_class & filter_mrng_to_eve]
actual= filted_data['wait_time']
predicted= filted_data['pred']

print("-------------------Testing on December data's Popular Time-------------------")
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(actual, predicted)))

# 3. Check the accuracy of the December's popular time split by station IDs
print("Accuracy Split By Stations")
id=[]
accuracy=[]

for i in filted_data['station_id_encoded'].unique():
  data= filted_data[filted_data["station_id_encoded"]==i]

  actual= data['wait_time']
  predicted= data['pred']
  acc= accuracy_score(actual, predicted)

  id.append(i)
  accuracy.append(acc)

accuracy_by_stations= pd.DataFrame({'Station_ID':id, 'Accuracy': accuracy})
accuracy_by_stations.sort_values(by='Accuracy', inplace=True, ascending=False)
display(accuracy_by_stations)


# feature_importance
feature_importances = model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X_bike.columns,
    'Importance': feature_importances
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['all_bikes_avl_flag'] = (df['station_capacity'] == df['bikes_available']).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['all_docks_avl_flag'] = (df['station_capacity'] == df['docks_available']).astype(int)
  model.fit(X_bike, Y_bike)


-------------------Testing on November Data (same data used for training)-------------------

Accuracy: 0.71

-------------------Testing on the entire December dataset-------------------

Accuracy: 0.61

              precision    recall  f1-score   support

        High       0.12      0.09      0.10       175
         Low       0.25      0.02      0.03       432
   Very High       0.42      0.72      0.53       271
    Very Low       0.71      0.85      0.77      1323

    accuracy                           0.61      2201
   macro avg       0.38      0.42      0.36      2201
weighted avg       0.54      0.61      0.54      2201

-------------------Testing on December data's Popular Time-------------------

Accuracy: 0.77

Accuracy Split By Stations


Unnamed: 0,Station_ID,Accuracy
0,0,0.774473


                     Feature  Importance
2                   hour_rnd    0.322781
3             temperature_2m    0.214176
5                 visibility    0.188305
6    cu_class_status_encoded    0.105807
7    day_of_week_rnd_encoded    0.094983
1         all_docks_avl_flag    0.045755
4  precipitation_probability    0.027211
0         all_bikes_avl_flag    0.000982
8         station_id_encoded    0.000000


# Model Version 7:
* Hours bucketed
* Only 4 output class (with very low less than 20mins)
* Model- RF- Default

----
* bikes_Avl, docks_avl, snow_depth, snowfall, rain- Not considered in input
* Label encoding
* Only Farrand Field station selected


In [None]:
merged_df = merged

#Selecting top 10 popular stations
selected_stations = ['bcycle_boulder_4657']
#['bcycle_boulder_3894' 'bcycle_boulder_2132', 'bcycle_boulder_2771','bcycle_boulder_4657','bcycle_boulder_1855','bcycle_boulder_2760','bcycle_boulder_2767','bcycle_boulder_4091','bcycle_boulder_2144','bcycle_boulder_2756']

df= merged_df.copy()
df= df[df['station_id'].isin(selected_stations)]

# Calculatig all bikes/docks available flag
df['all_bikes_avl_flag'] = (df['station_capacity'] == df['bikes_available']).astype(int)
df['all_docks_avl_flag'] = (df['station_capacity'] == df['docks_available']).astype(int)

# Select only the required columns (include month_rnd to split train data and remove it befor modelling)
req_cols= ['month_rnd','station_id','cu_class_status','all_bikes_avl_flag',
           'all_docks_avl_flag','day_of_week_rnd','hour_rnd','temperature_2m','precipitation_probability',
           'visibility','bike_wait_time'] #'snow_depth', 'snowfall', 'rain', 'docks_available', 'bikes_available'
df= df[req_cols]

# Function to assign categories based on bike_wait_time
def categorize_wait_time(time):
    if time < 20:
        return "Very Low"
    elif 20 <= time < 40:
        return "Low"
    elif 40 <= time < 60:
        return "High"
    else:
        return "Very High"

# Apply the function to create a new column 'wait_time_category'
df['wait_time'] = df['bike_wait_time'].apply(categorize_wait_time)

# Label Encoding
columns_to_encode = ['cu_class_status', 'day_of_week_rnd', 'station_id']
label_encoder = LabelEncoder()
for col in columns_to_encode:
    df[col + '_encoded'] = label_encoder.fit_transform(df[col])
df.drop(columns=columns_to_encode, inplace=True)

# Normalization
cols_to_normalize= ['temperature_2m','precipitation_probability','visibility']
scaler = MinMaxScaler()
df[cols_to_normalize] = scaler.fit_transform(df[cols_to_normalize])


#split the Dec data for testing and drop unwanted columns
train_data= df[df['month_rnd'] < 12].drop(columns= ['month_rnd','bike_wait_time'])
test_data= df[df['month_rnd'] == 12].drop(columns= ['month_rnd','bike_wait_time'])


X_bike= train_data.drop(columns= ['wait_time'])
Y_bike = train_data[['wait_time']]

# Split train data into 80:20
  #X_train, X_test, y_train, y_test = train_test_split(X_bike, Y_bike, test_size=0.2)

# December data for evaluation
test_x= test_data.drop(columns= ['wait_time'])
test_y= test_data[['wait_time']]


# Initialize the RandomForestClassifier
model = RandomForestClassifier()

# Train the classifier
model.fit(X_bike, Y_bike)

# Predict and evaluate the model on test data
print('-------------------Testing on November Data (same data used for training)-------------------')
y_pred = model.predict(X_bike)
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(Y_bike, y_pred)))
#print('Confusion Matrix')
#print(confusion_matrix(y_test, y_pred))
#print('Classification Report')
#print(classification_report(y_test, y_pred))

# Predict and evaluate the model on December data
pred_y = model.predict(test_x)

# 1. Test on the entire dataset
print('-------------------Testing on the entire December dataset-------------------')
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(test_y, pred_y)))
#print('Confusion Matrix')
#print(confusion_matrix(test_y, pred_y))
#print('Classification Report')
print(classification_report(test_y, pred_y))

# 2. Test on the popular times dataset
test_data['pred']= pred_y

## Filters the data for popular times
filter_regular_class= (test_data['cu_class_status_encoded']==1)
filter_mrng_to_eve= (test_data['hour_rnd'] >= 9) & (test_data['hour_rnd'] <= 20)

filted_data= test_data[filter_regular_class & filter_mrng_to_eve]
actual= filted_data['wait_time']
predicted= filted_data['pred']

print("-------------------Testing on December data's Popular Time-------------------")
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(actual, predicted)))

# 3. Check the accuracy of the December's popular time split by station IDs
print("Accuracy Split By Stations")
id=[]
accuracy=[]

for i in filted_data['station_id_encoded'].unique():
  data= filted_data[filted_data["station_id_encoded"]==i]

  actual= data['wait_time']
  predicted= data['pred']
  acc= accuracy_score(actual, predicted)

  id.append(i)
  accuracy.append(acc)

accuracy_by_stations= pd.DataFrame({'Station_ID':id, 'Accuracy': accuracy})
accuracy_by_stations.sort_values(by='Accuracy', inplace=True, ascending=False)
display(accuracy_by_stations)


# feature_importance
feature_importances = model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X_bike.columns,
    'Importance': feature_importances
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['wait_time'] = df['bike_wait_time'].apply(categorize_wait_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col + '_encoded'] = label_encoder.fit_transform(df[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col + '_encoded'] = label_encoder.fit_transform(df[col])
A value is trying 

-------------------Testing on November Data (same data used for training)-------------------

Accuracy: 0.76

-------------------Testing on the entire December dataset-------------------

Accuracy: 0.53

              precision    recall  f1-score   support

        High       0.00      0.00      0.00       129
         Low       0.16      0.08      0.10       317
   Very High       0.61      0.69      0.64       905
    Very Low       0.51      0.64      0.57       735

    accuracy                           0.53      2086
   macro avg       0.32      0.35      0.33      2086
weighted avg       0.47      0.53      0.50      2086

-------------------Testing on December data's Popular Time-------------------

Accuracy: 0.53

Accuracy Split By Stations


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Station_ID,Accuracy
0,0,0.531792


                     Feature  Importance
2                   hour_rnd    0.340071
3             temperature_2m    0.226311
5                 visibility    0.173070
7    day_of_week_rnd_encoded    0.100012
6    cu_class_status_encoded    0.073257
1         all_docks_avl_flag    0.059467
4  precipitation_probability    0.024986
0         all_bikes_avl_flag    0.002826
8         station_id_encoded    0.000000


# Model Version 8:
* Top 10 stations
* Hours bucketed
* Only 4 output class (with very low less than 20mins)

----
* bikes_Avl, docks_avl, snow_depth, snowfall, rain- Not considered in input
* Label encoding
* Model- XGB - Default

In [None]:
merged_df = merged

#Selecting top 10 popular stations
selected_stations = ['bcycle_boulder_3894', 'bcycle_boulder_2132', 'bcycle_boulder_2771','bcycle_boulder_4657','bcycle_boulder_1855','bcycle_boulder_2760','bcycle_boulder_2767','bcycle_boulder_4091','bcycle_boulder_2144','bcycle_boulder_2756']
df= merged_df.copy()
df= df[df['station_id'].isin(selected_stations)]

# Calculatig all bikes/docks available flag
df['all_bikes_avl_flag'] = (df['station_capacity'] == df['bikes_available']).astype(int)
df['all_docks_avl_flag'] = (df['station_capacity'] == df['docks_available']).astype(int)

# Select only the required columns (include month_rnd to split train data and remove it befor modelling)
req_cols= ['month_rnd','station_id','cu_class_status','all_bikes_avl_flag',
           'all_docks_avl_flag','day_of_week_rnd','hour_rnd','temperature_2m','precipitation_probability',
           'visibility','bike_wait_time'] #'snow_depth', 'snowfall', 'rain', 'docks_available', 'bikes_available'
df= df[req_cols]

# Function to assign categories based on bike_wait_time
def categorize_wait_time(time):
    if time < 20:
        return "Very Low"
    elif 20 <= time < 40:
        return "Low"
    elif 40 <= time < 60:
        return "High"
    else:
        return "Very High"

# Apply the function to create a new column 'wait_time_category'
df['wait_time'] = df['bike_wait_time'].apply(categorize_wait_time)

# Label Encoding
columns_to_encode = ['cu_class_status', 'day_of_week_rnd', 'station_id']
label_encoder = LabelEncoder()
for col in columns_to_encode:
    df[col + '_encoded'] = label_encoder.fit_transform(df[col])
df.drop(columns=columns_to_encode, inplace=True)

# Normalization
cols_to_normalize= ['temperature_2m','precipitation_probability','visibility']
scaler = MinMaxScaler()
df[cols_to_normalize] = scaler.fit_transform(df[cols_to_normalize])


#split the Dec data for testing and drop unwanted columns
train_data= df[df['month_rnd'] < 12].drop(columns= ['month_rnd','bike_wait_time'])
test_data= df[df['month_rnd'] == 12].drop(columns= ['month_rnd','bike_wait_time'])


X_bike= train_data.drop(columns= ['wait_time'])
Y_bike = train_data[['wait_time']]

# Split train data into 80:20
  #X_train, X_test, y_train, y_test = train_test_split(X_bike, Y_bike, test_size=0.2)

# December data for evaluation
test_x= test_data.drop(columns= ['wait_time'])
test_y= test_data[['wait_time']]


# XG Boost Classifier
model = GradientBoostingClassifier() #(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)  # Replace with best_params_gb


# Train the classifier
model.fit(X_bike, Y_bike)

# Predict and evaluate the model on test data
print('-------------------Testing on November Data (same data used for training)-------------------')
y_pred = model.predict(X_bike)
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(Y_bike, y_pred)))
#print('Confusion Matrix')
#print(confusion_matrix(y_test, y_pred))
#print('Classification Report')
#print(classification_report(y_test, y_pred))

# Predict and evaluate the model on December data
pred_y = model.predict(test_x)

# 1. Test on the entire dataset
print('-------------------Testing on the entire December dataset-------------------')
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(test_y, pred_y)))
#print('Confusion Matrix')
#print(confusion_matrix(test_y, pred_y))
#print('Classification Report')
print(classification_report(test_y, pred_y))

# 2. Test on the popular times dataset
test_data['pred']= pred_y

## Filters the data for popular times
filter_regular_class= (test_data['cu_class_status_encoded']==1)
filter_mrng_to_eve= (test_data['hour_rnd'] >= 9) & (test_data['hour_rnd'] <= 20)

filted_data= test_data[filter_regular_class & filter_mrng_to_eve]
actual= filted_data['wait_time']
predicted= filted_data['pred']

print("-------------------Testing on December data's Popular Time-------------------")
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(actual, predicted)))

# 3. Check the accuracy of the December's popular time split by station IDs
print("Accuracy Split By Stations")
id=[]
accuracy=[]

for i in filted_data['station_id_encoded'].unique():
  data= filted_data[filted_data["station_id_encoded"]==i]

  actual= data['wait_time']
  predicted= data['pred']
  acc= accuracy_score(actual, predicted)

  id.append(i)
  accuracy.append(acc)

accuracy_by_stations= pd.DataFrame({'Station_ID':id, 'Accuracy': accuracy})
accuracy_by_stations.sort_values(by='Accuracy', inplace=True, ascending=False)
display(accuracy_by_stations)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['wait_time'] = df['bike_wait_time'].apply(categorize_wait_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col + '_encoded'] = label_encoder.fit_transform(df[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col + '_encoded'] = label_encoder.fit_transform(df[col])
  y = column_or_1d

-------------------Testing on November Data (same data used for training)-------------------

Accuracy: 0.67

-------------------Testing on the entire December dataset-------------------

Accuracy: 0.65



  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        High       0.00      0.00      0.00      1248
         Low       0.06      0.00      0.00      3539
   Very High       0.59      0.84      0.70      5390
    Very Low       0.69      0.85      0.76     10504

    accuracy                           0.65     20681
   macro avg       0.33      0.42      0.37     20681
weighted avg       0.51      0.65      0.57     20681

-------------------Testing on December data's Popular Time-------------------

Accuracy: 0.75

Accuracy Split By Stations


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Station_ID,Accuracy
8,7,0.89715
5,6,0.846248
3,1,0.822002
4,0,0.819231
9,8,0.768551
6,5,0.748111
0,4,0.683107
1,3,0.680247
2,2,0.652005
7,9,0.530347


# Model Version 9:
* Top 10 stations
* Hours bucketed
* Only 4 output class (with very low less than 20mins)

----
* bikes_Avl, docks_avl, snow_depth, snowfall, rain- Not considered in input
* Label encoding
* Model- XGB - Tune


Best parameter= {'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 100}

In [None]:
merged_df = merged

#Selecting top 10 popular stations
selected_stations = ['bcycle_boulder_3894' 'bcycle_boulder_2132', 'bcycle_boulder_2771','bcycle_boulder_4657','bcycle_boulder_1855','bcycle_boulder_2760','bcycle_boulder_2767','bcycle_boulder_4091','bcycle_boulder_2144','bcycle_boulder_2756']

df= merged_df.copy()
df= df[df['station_id'].isin(selected_stations)]

# Calculatig all bikes/docks available flag
df['all_bikes_avl_flag'] = (df['station_capacity'] == df['bikes_available']).astype(int)
df['all_docks_avl_flag'] = (df['station_capacity'] == df['docks_available']).astype(int)

# Select only the required columns (include month_rnd to split train data and remove it befor modelling)
req_cols= ['month_rnd','station_id','cu_class_status','all_bikes_avl_flag',
           'all_docks_avl_flag','day_of_week_rnd','hour_rnd','temperature_2m','precipitation_probability',
           'visibility','bike_wait_time'] #'snow_depth', 'snowfall', 'rain', 'docks_available', 'bikes_available'
df= df[req_cols]

# Function to assign categories based on bike_wait_time
def categorize_wait_time(time):
    if time < 20:
        #return "Very Low"
        return 0
    elif 20 <= time < 40:
        #return "Low"
        return 1
    elif 40 <= time < 60:
        #return "High"
        return 2
    else:
        #return "Very High"
        return 3

# Apply the function to create a new column 'wait_time_category'
df['wait_time'] = df['bike_wait_time'].apply(categorize_wait_time)

# Label Encoding
columns_to_encode = ['cu_class_status', 'day_of_week_rnd', 'station_id']
label_encoder = LabelEncoder()
for col in columns_to_encode:
    df[col + '_encoded'] = label_encoder.fit_transform(df[col])
df.drop(columns=columns_to_encode, inplace=True)

# Normalization
cols_to_normalize= ['temperature_2m','precipitation_probability','visibility']
scaler = MinMaxScaler()
df[cols_to_normalize] = scaler.fit_transform(df[cols_to_normalize])


#split the Dec data for testing and drop unwanted columns
train_data= df[df['month_rnd'] < 12].drop(columns= ['month_rnd','bike_wait_time'])
test_data= df[df['month_rnd'] == 12].drop(columns= ['month_rnd','bike_wait_time'])


X_bike= train_data.drop(columns= ['wait_time'])
Y_bike = train_data[['wait_time']]

# Split train data into 80:20
  #X_train, X_test, y_train, y_test = train_test_split(X_bike, Y_bike, test_size=0.2)

# December data for evaluation
test_x= test_data.drop(columns= ['wait_time'])
test_y= test_data[['wait_time']]


# Train the classifier
model.fit(X_bike, Y_bike)


# Hyperparameter Tuning
param_grid = {
  'n_estimators': [100, 500],
  'max_depth':range(3,10,2),
  'min_child_weight':range(1,6,2)
}


model = XGBClassifier() #(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)  # Replace with best_params_gb
grid = GridSearchCV(estimator=model, param_grid= param_grid, scoring='accuracy', verbose = 10, n_jobs = 1)
grid.fit(X_bike, Y_bike)

print("Best Parameter:", grid.best_params_)
print("Model After Tuning:",grid.best_estimator_)
print("Best Score", grid.best_score_)

model= grid

# Predict and evaluate the model on test data
print('-------------------Testing on November Data (same data used for training)-------------------')
y_pred = model.predict(X_bike)
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(Y_bike, y_pred)))
#print('Confusion Matrix')
#print(confusion_matrix(y_test, y_pred))
#print('Classification Report')
#print(classification_report(y_test, y_pred))

# Predict and evaluate the model on December data
pred_y = model.predict(test_x)

# 1. Test on the entire dataset
print('-------------------Testing on the entire December dataset-------------------')
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(test_y, pred_y)))
#print('Confusion Matrix')
#print(confusion_matrix(test_y, pred_y))
#print('Classification Report')
print(classification_report(test_y, pred_y))

# 2. Test on the popular times dataset
test_data['pred']= pred_y

## Filters the data for popular times
filter_regular_class= (test_data['cu_class_status_encoded']==1)
filter_mrng_to_eve= (test_data['hour_rnd'] >= 9) & (test_data['hour_rnd'] <= 20)

filted_data= test_data[filter_regular_class & filter_mrng_to_eve]
actual= filted_data['wait_time']
predicted= filted_data['pred']

print("-------------------Testing on December data's Popular Time-------------------")
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(actual, predicted)))

# 3. Check the accuracy of the December's popular time split by station IDs
print("Accuracy Split By Stations")
id=[]
accuracy=[]

for i in filted_data['station_id_encoded'].unique():
  data= filted_data[filted_data["station_id_encoded"]==i]

  actual= data['wait_time']
  predicted= data['pred']
  acc= accuracy_score(actual, predicted)

  id.append(i)
  accuracy.append(acc)

accuracy_by_stations= pd.DataFrame({'Station_ID':id, 'Accuracy': accuracy})
accuracy_by_stations.sort_values(by='Accuracy', inplace=True, ascending=False)
display(accuracy_by_stations)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['wait_time'] = df['bike_wait_time'].apply(categorize_wait_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col + '_encoded'] = label_encoder.fit_transform(df[col])


Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5; 1/24] START max_depth=3, min_child_weight=1, n_estimators=100..........
[CV 1/5; 1/24] END max_depth=3, min_child_weight=1, n_estimators=100;, score=0.594 total time=   1.6s
[CV 2/5; 1/24] START max_depth=3, min_child_weight=1, n_estimators=100..........
[CV 2/5; 1/24] END max_depth=3, min_child_weight=1, n_estimators=100;, score=0.238 total time=   1.6s
[CV 3/5; 1/24] START max_depth=3, min_child_weight=1, n_estimators=100..........
[CV 3/5; 1/24] END max_depth=3, min_child_weight=1, n_estimators=100;, score=0.255 total time=   1.5s
[CV 4/5; 1/24] START max_depth=3, min_child_weight=1, n_estimators=100..........
[CV 4/5; 1/24] END max_depth=3, min_child_weight=1, n_estimators=100;, score=0.327 total time=   1.6s
[CV 5/5; 1/24] START max_depth=3, min_child_weight=1, n_estimators=100..........
[CV 5/5; 1/24] END max_depth=3, min_child_weight=1, n_estimators=100;, score=0.582 total time=   1.6s
[CV 1/5; 2/24] START ma

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Station_ID,Accuracy
4,5,0.846248
3,0,0.819231
7,6,0.768551
5,4,0.740554
0,3,0.683107
1,2,0.680247
2,1,0.652005
6,7,0.530347


# Model Version 10:
* Top 10 stations
* Hours bucketed
* Only 4 output class (with very low less than 20mins)
----
* bikes_Avl, docks_avl, snow_depth, snowfall, rain- Not considered in input
* Label encoding
* Model- RF - Tuned with max number of parameters


Best parameter= {'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}

Final Model= RandomForestClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=200)

In [None]:
# Data
merged_df = merged

#Selecting top 10 popular stations
selected_stations = ['bcycle_boulder_3894', 'bcycle_boulder_2132', 'bcycle_boulder_2771','bcycle_boulder_4657','bcycle_boulder_1855','bcycle_boulder_2760','bcycle_boulder_2767','bcycle_boulder_4091','bcycle_boulder_2144','bcycle_boulder_2756']
df= merged_df.copy()
df= df[df['station_id'].isin(selected_stations)]

# Calculatig all bikes/docks available flag
df['all_bikes_avl_flag'] = (df['station_capacity'] == df['bikes_available']).astype(int)
df['all_docks_avl_flag'] = (df['station_capacity'] == df['docks_available']).astype(int)

# Select only the required columns (include month_rnd to split train data and remove it befor modelling)
req_cols= ['month_rnd','station_id','cu_class_status','all_bikes_avl_flag',
           'all_docks_avl_flag','day_of_week_rnd','hour_rnd','temperature_2m','precipitation_probability',
           'visibility','bike_wait_time'] #'snow_depth', 'snowfall', 'rain', 'docks_available', 'bikes_available'
df= df[req_cols]

# Function to assign categories based on bike_wait_time
def categorize_wait_time(time):
    if time < 20:
        return "Very Low"
    elif 20 <= time < 40:
        return "Low"
    elif 40 <= time < 60:
        return "High"
    else:
        return "Very High"

# Apply the function to create a new column 'wait_time_category'
df['wait_time'] = df['bike_wait_time'].apply(categorize_wait_time)

# Label Encoding and creating a dictionary for station_id enocder mapping
columns_to_encode = ['cu_class_status', 'day_of_week_rnd', 'station_id']
label_encoder = LabelEncoder()
for col in columns_to_encode:
    df[col + '_encoded'] = label_encoder.fit_transform(df[col])

mapping= df[['station_id','station_id_encoded']].value_counts().reset_index()
id= mapping.station_id.values
encoded= mapping.station_id_encoded.values
station_dict=dict()
for i in range(len(id)):
  station_dict[encoded[i]]= id[i]

df.drop(columns=columns_to_encode, inplace=True)

# Normalization
cols_to_normalize= ['temperature_2m','precipitation_probability','visibility']
scaler = MinMaxScaler()
df[cols_to_normalize] = scaler.fit_transform(df[cols_to_normalize])


#split the Dec data for testing and drop unwanted columns
train_data= df[df['month_rnd'] < 12].drop(columns= ['month_rnd','bike_wait_time'])
test_data= df[df['month_rnd'] == 12].drop(columns= ['month_rnd','bike_wait_time'])


X_bike= train_data.drop(columns= ['wait_time'])
Y_bike = train_data[['wait_time']]

# Split train data into 80:20
  #X_train, X_test, y_train, y_test = train_test_split(X_bike, Y_bike, test_size=0.2)

# December data for evaluation
test_x= test_data.drop(columns= ['wait_time'])
test_y= test_data[['wait_time']]


# Initialize the RandomForestClassifier
model= RandomForestClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=200, random_state=100)

# Train the classifier
model.fit(X_bike, Y_bike)

'''
# Hyperparameter Tuning
param_grid = {'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
              }

grid = GridSearchCV(estimator=model, param_grid= param_grid, scoring='accuracy', verbose = 10, n_jobs = -1)
grid.fit(X_bike, Y_bike)

print("Best Parameter:", grid.best_params_)
print("Model After Tuning:",grid.best_estimator_)
print("Best Score", grid.best_score_)

model= grid
'''


# Predict and evaluate the model on test data
print('-------------------Testing on November Data (same data used for training-------------------')
y_pred = model.predict(X_bike)
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(Y_bike, y_pred)))

# Predict and evaluate the model on December data
pred_y = model.predict(test_x)

# 1. Test on the entire dataset
print('-------------------Testing on the entire December dataset-------------------')
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(test_y, pred_y)))
print(classification_report(test_y, pred_y))

# 2. Test on the popular times dataset
test_data['pred']= pred_y

## Filters the data for popular times
filter_regular_class= (test_data['cu_class_status_encoded']==1)
filter_mrng_to_eve= (test_data['hour_rnd'] >= 9) & (test_data['hour_rnd'] <= 20)

filted_data= test_data[filter_regular_class & filter_mrng_to_eve]
actual= filted_data['wait_time']
predicted= filted_data['pred']

print("-------------------Testing on December data's Popular Time-------------------")
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(actual, predicted)))

# 3. Check the accuracy of the December's popular time split by station IDs
print("Accuracy Split By Stations")
id=[]
name=[]
accuracy=[]

for i in filted_data['station_id_encoded'].unique():
  data= filted_data[filted_data["station_id_encoded"]==i]

  actual= data['wait_time']
  predicted= data['pred']
  acc= accuracy_score(actual, predicted)

  id.append(station_dict[i])
  name.append(stations_name_mapping[station_dict[i]])
  accuracy.append(acc)

accuracy_by_stations= pd.DataFrame({'Station_ID':id, 'Station_Name':name, 'Accuracy': accuracy})
accuracy_by_stations.sort_values(by='Accuracy', inplace=True, ascending=False)
display(accuracy_by_stations)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['all_bikes_avl_flag'] = (df['station_capacity'] == df['bikes_available']).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['all_docks_avl_flag'] = (df['station_capacity'] == df['docks_available']).astype(int)
  model.fit(X_bike, Y_bike)


-------------------Testing on November Data (same data used for training)-------------------

Accuracy: 0.69

-------------------Testing on the entire December dataset-------------------

Accuracy: 0.66



  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        High       0.00      0.00      0.00      1248
         Low       0.00      0.00      0.00      3539
   Very High       0.61      0.83      0.71      5390
    Very Low       0.69      0.87      0.77     10504

    accuracy                           0.66     20681
   macro avg       0.32      0.43      0.37     20681
weighted avg       0.51      0.66      0.57     20681

-------------------Testing on December data's Popular Time-------------------

Accuracy: 0.75

Accuracy Split By Stations


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Station_ID,Station_Name,Accuracy
8,bcycle_boulder_3894,Center for Community @ Regent Drive,0.89715
5,bcycle_boulder_2771,Williams Village,0.846248
3,bcycle_boulder_2132,CU Recreation Center,0.822002
4,bcycle_boulder_1855,Folsom & Colorado,0.819231
9,bcycle_boulder_4091,Timber Ridge @ Adams Circle,0.768551
6,bcycle_boulder_2767,18th & Colorado,0.760705
0,bcycle_boulder_2760,13th & College,0.683107
1,bcycle_boulder_2756,Broadway & Baseline,0.680247
2,bcycle_boulder_2144,Broadway & Euclid,0.652005
7,bcycle_boulder_4657,Farrand Field,0.530347


In [None]:
pickle.dump(model, open('/content/drive/MyDrive/CSCI 5502 Data Mining/01 Project/Model Pickle Files/Best_RF_Model.pkl', 'wb'))