In [None]:
#import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler,StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso, ElasticNet
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_squared_log_error, mean_absolute_error, mean_squared_log_error

In [None]:
# Read the three competition CSVs (sample submission, training set, test set).
submission, train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/bike-sharing-demand/sampleSubmission.csv',
        '../input/bike-sharing-demand/train.csv',
        '../input/bike-sharing-demand/test.csv',
    )
)

In [None]:
# Report (rows, columns) of the raw train and test frames.
for label, frame in (('Train Shape: ', train_data), ('Test Shape: ', test_data)):
    print(label, frame.shape)

In [None]:
# Peek at five randomly chosen training rows.
train_data.sample(n=5)

In [None]:
# Target is the `count` column; features are the first nine columns of the training frame.
Y = train_data.loc[:, 'count']
X = train_data.iloc[:, :9]

for label, shape in (
    ('Train X Shape: ', X.shape),
    ('Train Y Shape: ', Y.shape),
    ('Test Shape: ', test_data.shape),
):
    print(label, shape)

In [None]:
# Count missing values per column of the training frame.
train_data.isnull().sum()

# Exploratory analysis

In [None]:
# Distribution of the raw target with a KDE overlay.
sns.displot(data=Y, kde=True)

In [None]:
# Distribution of the natural log of the target with a KDE overlay.
sns.displot(data=np.log(Y), kde=True)

In [None]:
# Histogram of the season codes, using four bins.
sns.histplot(data=X['season'], bins=4)

In [None]:
# Distribution of the `temp` feature with a KDE overlay.
sns.displot(data=X['temp'], kde=True)

In [None]:
# Distribution of the `atemp` feature with a KDE overlay.
sns.displot(data=X['atemp'], kde=True)

In [None]:
# Distribution of the `windspeed` feature with a KDE overlay.
sns.displot(data=X['windspeed'], kde=True)

In [None]:
# Distribution of the `humidity` feature with a KDE overlay.
sns.displot(data=X['humidity'], kde=True)

# Preprocessing & Feature Engineering with Pipeline

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import calendar
from datetime import datetime

class ProcessDateTime(BaseEstimator, TransformerMixin):
    """Stateless transformer that expands the ``datetime`` column into calendar features.

    ``transform`` adds four columns derived from ``datetime`` and then drops
    the original column:

    * ``month``   - English month name (e.g. ``'January'``)
    * ``weekday`` - English day name (e.g. ``'Monday'``)
    * ``hour``    - hour of day as an integer
    * ``minute``  - minute of the hour as an integer

    The ``datetime`` values are parsed with the ``"%Y-%m-%d %H:%M:%S"`` format.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``datetime`` replaced by calendar features.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing a ``datetime`` column.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with ``month``, ``weekday``, ``hour`` and ``minute``
            added and ``datetime`` removed. ``X`` itself is not modified.
        """
        print('Transforming datetime...')

        x_copy = X.copy()
        # Parse the column once (vectorised) instead of calling strptime four
        # times per row.
        parsed = pd.to_datetime(x_copy['datetime'], format='%Y-%m-%d %H:%M:%S')

        # The month name must come from the month component; the previous code
        # indexed calendar.month_name with .weekday(), which produced a weekday
        # index (0-6) mislabelled as a month.
        x_copy['month'] = parsed.dt.month_name()
        x_copy['weekday'] = parsed.dt.day_name()
        x_copy['hour'] = parsed.dt.hour
        x_copy['minute'] = parsed.dt.minute

        return x_copy.drop(columns=['datetime'])

In [None]:
# Single-step pipeline used to preview the datetime expansion on the training features.
pipeline = Pipeline(steps=[('datetime', ProcessDateTime())])

pipeline.fit_transform(X, y=None)

In [None]:
class ProcessSeasonWeather(BaseEstimator, TransformerMixin):
    """Stateless transformer that replaces the integer ``season`` and ``weather``
    codes with descriptive string labels.

    Codes that are not present in the lookup tables become ``NaN`` (the
    behaviour of ``Series.map`` with a dict).
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``season`` and ``weather`` mapped to labels."""
        print('Transforming season and weather...')

        season_labels = {1: 'Spring', 2: 'Summer', 3: 'Fall', 4: 'Winter'}
        weather_labels = {
            1: "Clear+FewClouds+PartlyCloudy,PartlyCloudy",
            2: "Mist+Cloudy,Mist+BrokenClouds,Mist+FewClouds,Mist",
            3: "LightSnow,LightRain+Thunderstorm+ScatteredClouds,LightRain+ScatteredClouds",
            4: "HeavyRain+IcePallets+Thunderstorm+Mist,Snow+Fog",
        }

        labelled = X.copy()
        for column, lookup in (('season', season_labels), ('weather', weather_labels)):
            labelled[column] = labelled[column].map(lookup)
        return labelled

In [None]:
# Two-step pipeline: datetime expansion followed by season/weather labelling.
pipeline = Pipeline(steps=[
    ('datetime', ProcessDateTime()),
    ('seasonweather', ProcessSeasonWeather()),
])

In [None]:
# Fit the current pipeline on the training features and show the transformed result.
pipeline.fit_transform(X, y=None)

In [None]:
class DummyEncoding(BaseEstimator, TransformerMixin):
    """One-hot encode non-numeric columns with ``pandas.get_dummies``.

    The dummy columns produced on the data passed to ``fit`` are remembered in
    ``columns_``. ``transform`` then aligns its output to exactly those columns,
    so that train and test frames always have the same width and column order
    even when a category is missing from (or new in) the frame being
    transformed:

    * a training category absent from the transformed frame yields an all-zero column;
    * a category never seen in ``fit`` is dropped.

    If ``transform`` is called on an instance that was never fitted, it falls
    back to a plain ``get_dummies`` with no alignment.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Record the dummy-encoded column layout of ``X``; returns ``self``."""
        self.columns_ = pd.get_dummies(X).columns
        return self

    def transform(self, X, y=None):
        """Return a dummy-encoded copy of ``X`` aligned to the columns seen in ``fit``."""
        print('Dummy encoding...')
        encoded = pd.get_dummies(X.copy())

        fitted_columns = getattr(self, 'columns_', None)
        if fitted_columns is not None:
            # Without this alignment a category present in only one of
            # train/test changes the column count and breaks later steps.
            encoded = encoded.reindex(columns=fitted_columns, fill_value=0)
        return encoded

    
class RemoveFeature(BaseEstimator, TransformerMixin):
    """Drop a fixed list of columns from a DataFrame.

    Parameters
    ----------
    features : list of str, optional
        Column names to remove. Names that are not present in the frame are
        silently ignored. ``None`` (the default) removes nothing.
    """

    def __init__(self, features=None):
        # Store the argument untouched under its own name: BaseEstimator's
        # get_params/clone/repr look up ``self.features``. A ``None`` default
        # also avoids sharing one mutable list between instances.
        self.features = features

    @property
    def _features(self):
        """Column names to drop, as a fresh list (kept for backward compatibility)."""
        return [] if self.features is None else list(self.features)

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` without the configured columns."""
        print('Removing features...')
        # errors='ignore' mirrors the original "drop only if present" behaviour.
        return X.drop(columns=self._features, errors='ignore')

In [None]:
# Four-step pipeline: datetime expansion, labelling, dummy encoding, then dropping `windspeed`.
pipeline = Pipeline(steps=[
    ('datetime', ProcessDateTime()),
    ('seasonweather', ProcessSeasonWeather()),
    ('dummyencode', DummyEncoding()),
    ('removefeature', RemoveFeature(features=['windspeed'])),
])

In [None]:
# Fit the current pipeline on the training features and show the encoded result.
pipeline.fit_transform(X, y=None)


In [None]:
# Same preprocessing steps, now followed by standardisation.
pipeline = Pipeline(steps=[
    ('datetime', ProcessDateTime()),
    ('seasonweather', ProcessSeasonWeather()),
    ('dummyencode', DummyEncoding()),
    ('removefeature', RemoveFeature(features=['windspeed'])),
    ('scaler', StandardScaler()),
])

In [None]:
# Fit the current pipeline on the training features and show the scaled output.
pipeline.fit_transform(X, y=None)

In [None]:
# Final preprocessing pipeline (min-max scaled). It is fitted on the training
# features only and then applied to both the training and the test frames.
# NOTE: `X` is rebound from the raw feature frame to the transformed output here.
pipeline = Pipeline(steps=[
    ('datetime', ProcessDateTime()),
    ('seasonweather', ProcessSeasonWeather()),
    ('dummyencode', DummyEncoding()),
    ('removefeature', RemoveFeature(features=['windspeed'])),
    ('scaler', MinMaxScaler()),
])

X = pipeline.fit(X).transform(X)
X_test = pipeline.transform(test_data)

In [None]:
# Shapes of the transformed train and test matrices, one per line.
print(X.shape, X_test.shape, sep='\n')

In [None]:
# Wrap the transformed training matrix in a DataFrame for display.
pd.DataFrame(data=X)

In [None]:
#Modeling
# Candidate linear models, all with default hyper-parameters.
lr = LinearRegression()
# SGD shuffles the data randomly between epochs; a fixed seed makes its
# cross-validation score reproducible across kernel restarts.
sgd = SGDRegressor(random_state=27)
rr = Ridge()
ls = Lasso()
en = ElasticNet()

In [None]:
import sklearn

# List the scoring names accepted by `scoring=`. `sklearn.metrics.SCORERS` was
# removed in scikit-learn 1.3; `get_scorer_names()` is its replacement, so use
# it when available and fall back to the old dict on older releases.
if hasattr(sklearn.metrics, 'get_scorer_names'):
    scorer_names = sklearn.metrics.get_scorer_names()
else:
    scorer_names = list(sklearn.metrics.SCORERS.keys())
scorer_names

In [None]:
#crossvalidation
cv = RepeatedKFold(n_splits=5, n_repeats=1, random_state=27)

grid_ridge_lasso = {
    'alpha': np.arange(0, 1, 0.05)
}

grid_elastic = {
    'alpha': np.arange(0, 1, 0.05),
    'l1_ratio': np.arange(0, 1, 0.05)
}

# The models are trained on the log of the target (the small offset guards
# against log(0)), so predictions are already on the log scale.
y_log = np.log(Y+0.0001)

# Score with plain MSE on that log-scale target: this is (approximately) the
# mean squared log error of the original counts. Using
# 'neg_mean_squared_log_error' here would take the log a second time, and it
# raises as soon as a model predicts a negative log-count.
log_scoring = 'neg_mean_squared_error'

lr_score = cross_val_score(lr, X, y_log, cv=cv, scoring=log_scoring)
sgd_score = cross_val_score(sgd, X, y_log, cv=cv, scoring=log_scoring)

# NOTE: despite the `*_score` names, the three objects below are the fitted
# GridSearchCV instances (`fit` returns the search object itself).
rr_search = GridSearchCV(rr, grid_ridge_lasso, cv=cv, scoring=log_scoring)
rr_score = rr_search.fit(X, y_log)

ls_search = GridSearchCV(ls, grid_ridge_lasso, cv=cv, scoring=log_scoring)
ls_score = ls_search.fit(X, y_log)

en_search = GridSearchCV(en, grid_elastic, cv=cv, scoring=log_scoring)
en_score = en_search.fit(X, y_log)

In [None]:
# Mean CV score for the two un-tuned models, then the best grid-search score
# for ridge, lasso and elastic net (one value per line).
print(
    np.mean(lr_score),
    np.mean(sgd_score),
    sep='\n',
)

print(
    rr_score.best_score_,
    ls_score.best_score_,
    en_score.best_score_,
    sep='\n',
)

In [None]:
# Predict with the refitted best ridge model and undo the log transform.
# (GridSearchCV.predict delegates to best_estimator_.)
np.exp(rr_score.predict(X_test))

In [None]:
# Back-transform the best ridge model's log-scale predictions and truncate them to integers.
predictions = np.exp(rr_score.predict(X_test)).astype(int)

In [None]:
# Display the integer predictions for the test set.
predictions

In [None]:
# Build the submission from the model's predictions. Previously the frame
# created here was discarded and the untouched sample submission was written
# to disk instead.
submission = pd.DataFrame({
    'datetime': test_data.datetime,
    'count': predictions
})
submission.to_csv('/kaggle/working/submission.csv', index=False)