### Imports & Load Data

In [None]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the competition's train and test CSVs into DataFrames
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/bike-sharing-demand/train.csv',
                     '/kaggle/input/bike-sharing-demand/test.csv')
)

### EDA & Visuals

In [None]:
# Preview the first five training rows
train_df.head(5)

In [None]:
# Column names, dtypes and non-null counts of the training set
train_df.info()

In [None]:
# Preview the first five test rows
test_df.head(5)

In [None]:
# Column names, dtypes and non-null counts of the test set
test_df.info()

In [None]:
# Missing-value count per column (training set)
train_df.isna().sum()

In [None]:
# Missing-value count per column (test set)
test_df.isna().sum()

- There are `no NaNs` in either dataset.

In [None]:
# Number of fully duplicated rows in the training set (first occurrence is not counted)
train_df.duplicated(keep='first').sum()

In [None]:
# Number of fully duplicated rows in the test set (first occurrence is not counted)
test_df.duplicated(keep='first').sum()

- There are `no duplicates` in either dataset.

In [None]:
# Summary statistics of the training set's numeric columns
train_df.describe()

In [None]:
# Box plots of the raw casual, registered and count columns, side by side
plt.figure(figsize=(8, 6))
plt.boxplot([train_df[target_col] for target_col in ('casual', 'registered', 'count')]);

In [None]:
# Same box plots after a log1p transform of each target column
plt.boxplot([np.log1p(train_df[target_col]) for target_col in ('casual', 'registered', 'count')]);

- Applying `log1p` (i.e. log(1 + x)) to the `casual` and `registered` columns should help modeling, as it leaves almost no outliers.

In [None]:
# Extracting year, month, day, hour and weekday name from the datetime feature

# The format string has to describe the whole timestamp: pandas >= 2.0 parses
# strictly and raises "unconverted data remains" for a partial format such as
# '%Y-%m-%d %H'.
# NOTE(review): assumes the CSV timestamps look like '2011-01-01 00:00:00' - confirm.
for frame in (train_df, test_df):
    frame['datetime'] = pd.to_datetime(frame['datetime'], format='%Y-%m-%d %H:%M:%S')

    timestamps = frame['datetime'].dt
    frame['year'] = timestamps.year
    frame['month'] = timestamps.month
    frame['day'] = timestamps.day
    frame['hour'] = timestamps.hour
    # Weekday as an English name ('Monday', ...); later cells compare against these names
    frame['day-of-week'] = timestamps.day_name()

In [None]:
# feature correlation
# Correlate numeric columns only: since pandas 2.0, DataFrame.corr() raises on
# text columns (e.g. 'day-of-week') instead of silently dropping them.
plt.figure(figsize=(14, 12))
sns.heatmap(train_df.select_dtypes(include='number').corr());

In [None]:
# Pairwise correlation table of the numeric columns; text columns are excluded
# explicitly because DataFrame.corr() raises on them since pandas 2.0.
train_df.select_dtypes(include='number').corr()

- There is `high collinearity` between:
    - `temp` and `atemp`, so we can drop one of them
    - `season` and `month`, so we can drop one of them

In [None]:
# Average casual vs. registered rentals for each day of the week
plt.figure(figsize=(8, 6))
weekday_means = train_df.groupby('day-of-week')[['casual', 'registered']].mean()
weekday_means['casual'].plot(kind='barh', color='crimson', width=0.3, position=0)
weekday_means['registered'].plot(kind='barh', color='cornflowerblue', width=0.3, position=1)
plt.title('Mean of Bikes Rented over Week Days');

- We can notice that the `weekend (Saturday and Sunday)` shows a significant difference in mean rentals, for both casual and registered riders.

In [None]:
# Average casual vs. registered rentals for each calendar month
plt.figure(figsize=(8, 6))
for rider_col, bar_color, bar_slot in (('casual', 'crimson', 0),
                                       ('registered', 'cornflowerblue', 1)):
    train_df.groupby('month')[rider_col].mean().plot(
        kind='barh', color=bar_color, width=0.3, position=bar_slot)
plt.title('Mean of Bikes Rented over Months');

- Mean rentals `decrease significantly in January and February` and `increase sharply from May to October`.

In [None]:
# Average casual vs. registered rentals for each year
yearly_means = train_df.groupby('year')[['casual', 'registered']].mean()
yearly_means['casual'].plot(kind='barh', color='crimson', width=0.3, position=0)
yearly_means['registered'].plot(kind='barh', color='cornflowerblue', width=0.3, position=1)
plt.title('Mean of Bikes Rented over Years');

- Mean rentals in `2012 are higher than in 2011`.

In [None]:
# Average rentals per (year, month) pair, drawn as two line series
plt.figure(figsize=(8, 6))
for rider_col, line_color in (('casual', 'crimson'), ('registered', 'cornflowerblue')):
    train_df.groupby(['year', 'month'])[rider_col].mean().plot(color=line_color)
plt.title('Mean of Bikes Rented over Times');

- Mean rentals vary over the course of each year, so the `year` and `month` features should be useful to the model.

In [None]:
# Average casual vs. registered rentals for each hour of the day
plt.figure(figsize=(10, 8))
hourly_means = train_df.groupby('hour')[['casual', 'registered']].mean()
hourly_means['casual'].plot(kind='barh', color='crimson', width=0.3, position=0)
hourly_means['registered'].plot(kind='barh', color='cornflowerblue', width=0.3, position=1)
plt.title('Mean of Bikes Rented over Hours');

- Mean rentals `decrease significantly from midnight until dawn`.
- Registered rentals `increase sharply at 8 AM and at 5-6 PM (hours 17 and 18)`.

In [None]:
# Average casual vs. registered rentals for each season code
for rider_col, bar_color, bar_slot in (('casual', 'crimson', 0),
                                       ('registered', 'cornflowerblue', 1)):
    train_df.groupby('season')[rider_col].mean().plot(
        kind='barh', color=bar_color, width=0.3, position=bar_slot)
plt.title('Mean of Bikes Rented over Seasons');

- Mean rentals `increase` in `Summer and Fall` compared with Winter and Spring.

In [None]:
# Check mean of bike rented over workingday
# Casual and registered means are drawn as paired horizontal bars per workingday flag.
train_df.groupby('workingday')['casual'].mean().plot(kind='barh', color='crimson', width=0.3,  position=0);
train_df.groupby('workingday')['registered'].mean().plot(kind='barh', color='cornflowerblue', width=0.3, position=1);
# The title previously read "Seasons" (copied from the season plot); it now names the grouped feature.
plt.title('Mean of Bikes Rented over Working Day');

- `Casual` rentals increase on `non-working days`, whereas `Registered` rentals increase on `working days`.



In [None]:
# Average casual vs. registered rentals for each weather-condition code
weather_means = train_df.groupby('weather')[['casual', 'registered']].mean()
weather_means['casual'].plot(kind='barh', color='crimson', width=0.3, position=0)
weather_means['registered'].plot(kind='barh', color='cornflowerblue', width=0.3, position=1)
plt.title('Mean of Bikes Rented over Weather Condition');

- Mean rentals are highest in `Clear, Few clouds, Partly cloudy` weather.

In [None]:
# Average casual vs. registered rentals for each distinct temperature value
plt.figure(figsize=(12, 10))
for rider_col, bar_color, bar_slot in (('casual', 'crimson', 0),
                                       ('registered', 'cornflowerblue', 1)):
    train_df.groupby('temp')[rider_col].mean().plot(
        kind='barh', color=bar_color, width=0.3, position=bar_slot)
plt.title('Mean of Bikes Rented over Temperature');

- Mean rentals `increase` as the `temperature increases`.

In [None]:
# Average casual vs. registered rentals for each distinct windspeed value
plt.figure(figsize=(8, 10))
wind_means = train_df.groupby('windspeed')[['casual', 'registered']].mean()
wind_means['casual'].plot(kind='barh', color='crimson', width=0.3, position=0)
wind_means['registered'].plot(kind='barh', color='cornflowerblue', width=0.3, position=1)
plt.title('Mean of Bikes Rented over Windspeed');

- This feature is unlikely to help modeling, as almost all wind speeds have the same mean rentals.

In [None]:
# Check mean of bike rented over humidity range
# Bucket humidity into 10-point bands as a standalone Series, so no temporary
# column has to be added to (and then dropped from) train_df.
humidity_band = (train_df['humidity'] / 10).astype(int).rename('humidity_range')
band_means = train_df.groupby(humidity_band)[['casual', 'registered']].mean()

plt.figure(figsize=(10, 8))
band_means['casual'].plot(kind='barh', color='crimson', width=0.3, position=0)
band_means['registered'].plot(kind='barh', color='cornflowerblue', width=0.3, position=1)
plt.title('Mean of Bikes Rented over Humidity Range');

- Mean rentals `decrease` as `humidity increases`, except below 10%.

### Feature Engineering & Extra EDA

In [None]:
# Flag rows whose weekday name is Saturday or Sunday (1 = weekend, 0 = otherwise)
train_df['week_end'] = train_df['day-of-week'].isin(['Saturday', 'Sunday']).astype(int)
test_df['week_end'] = test_df['day-of-week'].isin(['Saturday', 'Sunday']).astype(int)

In [None]:
# Average casual vs. registered rentals, weekend vs. non-weekend
for rider_col, bar_color, bar_slot in (('casual', 'crimson', 0),
                                       ('registered', 'cornflowerblue', 1)):
    train_df.groupby('week_end')[rider_col].mean().plot(
        kind='barh', color=bar_color, width=0.3, position=bar_slot)
plt.title('Mean of Bikes Rented over Weekend');

- This feature should be useful to the model.

In [None]:
# Flag the commuting peaks: hours 8, 17 and 18 (1 = rush hour, 0 = otherwise)
train_df['rush_hour'] = train_df['hour'].isin([8, 17, 18]).astype(int)
test_df['rush_hour'] = test_df['hour'].isin([8, 17, 18]).astype(int)

In [None]:
# Average casual vs. registered rentals, rush hours vs. the rest of the day
rush_means = train_df.groupby('rush_hour')[['casual', 'registered']].mean()
rush_means['casual'].plot(kind='barh', color='crimson', width=0.3, position=0)
rush_means['registered'].plot(kind='barh', color='cornflowerblue', width=0.3, position=1)
plt.title('Mean of Bikes Rented over Rush Hours');

- This feature should help a lot in predicting the `registered` rentals, but `not` the `casual` ones.

In [None]:
# Flag the night hours 0-5 (1 = sleepy hour, 0 = otherwise)
train_df['sleepy_hour'] = train_df['hour'].isin([0, 1, 2, 3, 4, 5]).astype(int)
test_df['sleepy_hour'] = test_df['hour'].isin([0, 1, 2, 3, 4, 5]).astype(int)

In [None]:
# Average casual vs. registered rentals, night hours vs. the rest of the day
for rider_col, bar_color, bar_slot in (('casual', 'crimson', 0),
                                       ('registered', 'cornflowerblue', 1)):
    train_df.groupby('sleepy_hour')[rider_col].mean().plot(
        kind='barh', color=bar_color, width=0.3, position=bar_slot)
plt.title('Mean of Bikes Rented over Sleepy Hours');

- This feature should help model prediction a lot.

In [None]:
# Flag humidity readings below 10 or above 70 (1 = improper, 0 = otherwise)
train_df['improper_humidity'] = (
    (train_df['humidity'] < 10) | (train_df['humidity'] > 70)
).astype(int)
test_df['improper_humidity'] = (
    (test_df['humidity'] < 10) | (test_df['humidity'] > 70)
).astype(int)

In [None]:
# Average casual vs. registered rentals, improper vs. normal humidity
humidity_flag_means = train_df.groupby('improper_humidity')[['casual', 'registered']].mean()
humidity_flag_means['casual'].plot(kind='barh', color='crimson', width=0.3, position=0)
humidity_flag_means['registered'].plot(kind='barh', color='cornflowerblue', width=0.3, position=1)
plt.title('Mean of Bikes Rented over Improper Humidity');

- This feature should help model prediction.

### Pre-Processing

In [None]:
# Remove 'atemp' and 'season' from both frames (the collinear features noted in the EDA)
for frame in (train_df, test_df):
    frame.drop(columns=['atemp', 'season'], inplace=True)

In [None]:
# Move the 'datetime' column into the index of both frames
train_df, test_df = (frame.set_index('datetime') for frame in (train_df, test_df))

In [None]:
# Label Encoding Day feature

# Learn the label -> integer mapping on the training frame only and reuse it for
# the test frame. Factorizing each frame separately numbers labels by order of
# first appearance, so the same weekday could get different codes in train and
# test whenever the two frames start on different days.
for col in ['day-of-week']:
    train_codes, train_labels = pd.factorize(train_df[col])
    train_df[col] = train_codes
    # get_indexer returns -1 for any label that never occurred in the training frame
    test_df[col] = train_labels.get_indexer(test_df[col])

### Modeling

In [None]:
def model_outcome(x, y, model):
    """Fit ``model`` on an 80/20 split of ``x``/``y`` and print train/validation scores.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix. It must contain a ``month`` column, which is used to
        stratify the split.
    y : pandas.Series
        Target aligned row-for-row with ``x``. The notebook passes
        ``log1p``-transformed counts, in which case the root-mean-squared error
        printed here is the RMSLE of the raw counts.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.

    Returns
    -------
    None
        R2 and RMSLE for the training and validation parts are printed.
    """
    # Split train and validation data; stratify on the month column of the
    # features actually passed in rather than on the global train_df.
    x_train, x_valid, y_train, y_valid = train_test_split(
        x, y, test_size=0.2, stratify=x['month'], random_state=0)

    # Fit the model on the training part only
    model.fit(x_train, y_train)

    # Predict on both parts
    train_pred = model.predict(x_train)
    valid_pred = model.predict(x_valid)

    # Report errors
    print('Training R2 score: {}'.format(r2_score(y_train, train_pred)))
    print('Validation R2 score: {}'.format(r2_score(y_valid, valid_pred)))

    print('Training RMSLE: {}'.format(np.sqrt(np.square(y_train - train_pred).mean())))
    # This line used to be labelled "Training RMSLE" although it reports the validation error
    print('Validation RMSLE: {}'.format(np.sqrt(np.square(y_valid - valid_pred).mean())))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [None]:
# Features: everything except the three rental-count columns
x = train_df.drop(['casual', 'registered', 'count'], axis=1)
# Targets are modelled on the log1p scale (predictions are mapped back with expm1)
y_casual = np.log1p(train_df['casual'])
# The registered target was previously built from the 'casual' column by mistake
y_registered = np.log1p(train_df['registered'])

In [None]:
# Linear Regression: evaluate one model per rental type
from sklearn.linear_model import LinearRegression
lr_model = LinearRegression()

for banner, target in (('Casual Rents Prediction Model', y_casual),
                       ('\nRegistered Rents Prediction Model', y_registered)):
    print(banner)
    model_outcome(x, target, lr_model)

In [None]:
# Support Vector Regression (sigmoid kernel): evaluate one model per rental type
from sklearn.svm import SVR
svr_model = SVR(kernel='sigmoid')

for banner, target in (('Casual Rents Prediction Model', y_casual),
                       ('\nRegistered Rents Prediction Model', y_registered)):
    print(banner)
    model_outcome(x, target, svr_model)

In [None]:
# K-Nearest Neighbours (k = 5): evaluate one model per rental type
from sklearn.neighbors import KNeighborsRegressor
knn_model = KNeighborsRegressor(n_neighbors=5)

for banner, target in (('Casual Rents Prediction Model', y_casual),
                       ('\nRegistered Rents Prediction Model', y_registered)):
    print(banner)
    model_outcome(x, target, knn_model)

In [None]:
# Decision Tree Regressor (depth capped at 8): evaluate one model per rental type
from sklearn.tree import DecisionTreeRegressor
dt_model = DecisionTreeRegressor(max_depth=8)

for banner, target in (('Casual Rents Prediction Model', y_casual),
                       ('\nRegistered Rents Prediction Model', y_registered)):
    print(banner)
    model_outcome(x, target, dt_model)

In [None]:
# Random Forest Regressor (40 trees, depth 10): evaluate one model per rental type
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor(n_estimators=40, max_depth=10, random_state=0)

for banner, target in (('Casual Rents Prediction Model', y_casual),
                       ('\nRegistered Rents Prediction Model', y_registered)):
    print(banner)
    model_outcome(x, target, rf_model)

In [None]:
# Extra Trees Regressor (45 trees, depth 11): evaluate one model per rental type
from sklearn.ensemble import ExtraTreesRegressor
et_model = ExtraTreesRegressor(n_estimators=45, max_depth=11, random_state=0)

for banner, target in (('Casual Rents Prediction Model', y_casual),
                       ('\nRegistered Rents Prediction Model', y_registered)):
    print(banner)
    model_outcome(x, target, et_model)

In [None]:
# Gradient Boosting Regressor
from sklearn.ensemble import GradientBoostingRegressor

# 'loss' is deliberately left at its default (least squares): the former alias
# 'ls' was removed in scikit-learn 1.2, and its replacement 'squared_error'
# does not exist before 1.0, so omitting the key works on every version.
params = {'n_estimators': 45, 'max_depth': 9, 'random_state': 0,
          'min_samples_leaf' : 10, 'learning_rate': 0.1, 'subsample': 0.7}
gbm_model = GradientBoostingRegressor(**params)

print('Casual Rents Prediction Model')
model_outcome(x, y_casual, gbm_model)
print('\nRegistered Rents Prediction Model')
model_outcome(x, y_registered, gbm_model)

### Result

In [None]:
# Refit the gradient-boosting model on the full training data for each target,
# predicting the test set straight after each fit (the same estimator is reused).
gbm_model.fit(x, y_casual)
test_casual_predict = gbm_model.predict(test_df)

gbm_model.fit(x, y_registered)
test_registered_predict = gbm_model.predict(test_df)

In [None]:
# Undo the log1p transform on both prediction arrays, then add them up to get the total count
test_casual_predict, test_registered_predict = (
    np.expm1(log_pred) for log_pred in (test_casual_predict, test_registered_predict)
)

test_count_predict = test_casual_predict + test_registered_predict

In [None]:
# Write the submission file: one row per test timestamp with the integer-cast total count
output = pd.DataFrame({"datetime": test_df.index})
output["count"] = test_count_predict.astype(int)
output.to_csv('sampleSubmission.csv', index=False)