In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
train = pd.read_csv('../Bike Sharing Demand/bike_data/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


In [8]:
X = train.drop(['datetime', 'count'], axis=1)
y = train['count']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((8164, 10), (8164,), (2722, 10), (2722,))

In [9]:
# knn
model = KNeighborsRegressor()
model.fit(X_train, y_train)

print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

0.9996457433334117
0.9994762796278718


In [10]:
# random forest
model = RandomForestRegressor()
model.fit(X_train, y_train)

print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

0.9999542021538383
0.9997511867421761


In [11]:
# gradient boosting
model = GradientBoostingRegressor()
model.fit(X_train, y_train)

print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

0.9993439892520589
0.9992270403955342


In [15]:
depth_numbers = list(range(1, 15))

depth_list = []
train_score = []
test_score = []

for d in depth_numbers:
    model = RandomForestRegressor(
        max_depth=d,
        n_jobs=4,
        random_state=0
    )
    model.fit(X_train, y_train)

    train_score.append(model.score(X_train, y_train))
    test_score.append(model.score(X_test, y_test))

data = {
    'train': train_score,
    'test': test_score
}

df = pd.DataFrame(data, index=depth_numbers)
df

Unnamed: 0,train,test
1,0.675615,0.68033
2,0.889853,0.885853
3,0.959775,0.958748
4,0.983191,0.982682
5,0.993299,0.992198
6,0.997664,0.997064
7,0.999062,0.998694
8,0.999665,0.999378
9,0.999841,0.99962
10,0.99991,0.999709


In [16]:
max_test = df.loc[df['test'] == df['test'].max()]
max_test

Unnamed: 0,train,test
12,0.999945,0.99975


In [18]:
model = RandomForestRegressor(max_depth=12, random_state=0)
model.fit(X_train, y_train)

print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

0.999944755505401
0.9997499211957005
