In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from scipy.stats import skew
from sklearn.impute import KNNImputer
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA

In [None]:
figure_size = [20,20]
def get_aca(size=None):
    """Create a new figure and return its current axes.

    Parameters
    ----------
    size : sequence of two numbers, optional
        Value forwarded to ``plt.figure(figsize=...)``. When omitted, a copy
        of the module-level ``figure_size`` is used.

    Returns
    -------
    The axes obtained from ``fig.gca()`` on the freshly created figure.
    """
    if size is None:
        size = list(figure_size)
    fig = plt.figure(figsize=size)
    return fig.gca()

# The bundled 'seaborn' style sheet was deprecated in Matplotlib 3.6 in favour
# of 'seaborn-v0_8' and is absent from later releases; fall back to the old
# name only when the new one is not available.
_seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(_seaborn_style)

In [None]:
# Read both splits and stack them row-wise so that every feature-engineering
# step below is applied to train and test rows alike.
train_df = pd.read_csv("/kaggle/input/bike-sharing-demand/train.csv")
test_df = pd.read_csv("/kaggle/input/bike-sharing-demand/test.csv")
concat_list = [train_df, test_df]
bike_share_df = pd.concat(
    objs=concat_list,
    axis=0,
    ignore_index=True,  # fresh 0..n-1 index over the stacked rows
)

In [None]:
# Print column dtypes and non-null counts of the combined train+test frame.
bike_share_df.info()

In [None]:
# Show the combined frame as the cell output.
bike_share_df

In [None]:
# Summary statistics for the numeric columns of the combined frame.
bike_share_df.describe()

In [None]:
# Split the timestamp into calendar features: hour, day, month, year,
# weekday (Monday=0) and ISO week number, then drop the raw column.
#
# NOTE(review): no explicit `format` is passed. A partial format such as
# "%Y-%m-%d %H" can be rejected by strict parsers if the strings also carry
# minutes/seconds — confirm against the CSV and pin the full format if desired.
timestamps = pd.to_datetime(bike_share_df["datetime"])

# Vectorised `.dt` accessors replace the per-row strftime -> int round trip.
# Explicit int64 casts keep the column dtypes independent of the pandas version.
bike_share_df = bike_share_df.assign(
    hour=timestamps.dt.hour.astype("int64"),
    day=timestamps.dt.day.astype("int64"),
    month=timestamps.dt.month.astype("int64"),
    year=timestamps.dt.year.astype("int64"),
    weekday=timestamps.dt.weekday.astype("int64"),
    yearWeek=timestamps.dt.isocalendar().week.astype("int64"),
).drop(columns="datetime")

In [None]:
# Bar chart of missing-value counts per column.
na_counts = bike_share_df.isna().sum()
na_counts.plot.bar()

In [None]:
# Same check through the `isnull` spelling.
null_counts = bike_share_df.isnull().sum()
null_counts.plot.bar()
"""no null or na values"""

In [None]:
# Tally rows flagged as duplicates of an earlier row; the original note on
# this cell states that the data contains no duplication.
duplicate_flags = bike_share_df.duplicated()
duplicate_flags.value_counts()

In [None]:
# Partition the columns into feature groups.
categorical_features = ["season","holiday","workingday","weather","hour","yearWeek"]

# Keep DataFrame column order: a set difference yields an order that can vary
# between kernel runs, which shuffles every plot/table built from this list.
numerical_features = [col for col in bike_share_df.columns if col not in categorical_features]

# Numerical columns with at most 10 distinct (non-null) values count as discrete.
unique_counts = bike_share_df[numerical_features].nunique()
discrete_features = [col for col in numerical_features if unique_counts[col] <= 10]
continuous_features = [col for col in numerical_features if unique_counts[col] > 10]

for group_name, group in (
    ("categorical", categorical_features),
    ("numerical", numerical_features),
    ("discrete", discrete_features),
    ("continuous", continuous_features),
):
    print(f"The {group_name} feature", group, " ,their number:", len(group))

## Data Exploration

In [None]:
# Lower-triangle correlation heatmap of the numerical features; the upper
# triangle (including the diagonal) is masked out.
corr_matrix = bike_share_df[numerical_features].corr()
heatmap_mask = np.triu(corr_matrix)
sns.heatmap(
    data=corr_matrix,
    mask=heatmap_mask,
    annot=True,
    cmap="coolwarm",
    ax=get_aca(),
)

In [None]:
# Pairwise scatter plots of the numerical features (lower corner only).
numerical_view = bike_share_df[numerical_features]
sns.pairplot(data=numerical_view, corner=True)
"""skewed target"""

In [None]:
# Histogram of every column, drawn on a fresh large figure.
hist_axes = get_aca()
bike_share_df.hist(ax=hist_axes, bins=100);

In [None]:
# Casual rentals per hour, one line per `holiday` value.
sns.lineplot(
    x="hour",
    y="casual",
    hue="holiday",
    data=bike_share_df,
)

In [None]:
# Registered rentals per hour, one line per `holiday` value.
sns.lineplot(
    x="hour",
    y="registered",
    hue="holiday",
    data=bike_share_df,
)

In [None]:
# Casual rentals per hour, one line per `workingday` value.
sns.lineplot(
    x="hour",
    y="casual",
    hue="workingday",
    data=bike_share_df,
)

In [None]:
# Registered rentals per hour, one line per `workingday` value.
sns.lineplot(
    x="hour",
    y="registered",
    hue="workingday",
    data=bike_share_df,
)

In [None]:
# Overlay hourly registered (red) and casual (blue) rentals on the same axes.
hourly_series = (("registered", "r"), ("casual", "b"))
for rider_column, line_color in hourly_series:
    sns.lineplot(data=bike_share_df, x="hour", y=rider_column, color=line_color)

In [None]:
# One box plot of `count` per categorical feature, laid out on a 10x2 grid.
fig = plt.figure(figsize=figure_size)
for position, feature in enumerate(categorical_features, start=1):
    panel = fig.add_subplot(10, 2, position)
    sns.boxplot(x=feature, y="count", data=bike_share_df, ax=panel)

In [None]:
# One box plot of `count` per discrete feature, laid out on a 10x2 grid.
fig = plt.figure(figsize=figure_size)
for position, feature in enumerate(discrete_features, start=1):
    panel = fig.add_subplot(10, 2, position)
    sns.boxplot(x=feature, y="count", data=bike_share_df, ax=panel)

* The bike-share count is lowest in spring.
* The number of bike shares increased in 2019.
* The count values increase in the summer months.
* The count values are lower during holidays.
* Demand peaks at hours 8 and 18.

In [None]:
# Boolean flag for a zero-vs-non-zero classification model:
# True wherever `count` differs from 0.
bike_share_df["nonZeroClass"] = bike_share_df["count"].ne(0)

Fixing the skewness in the data using log1p

In [None]:
# Kernel density estimate of the raw target.
sns.displot(
    x="count",
    kind="kde",
    fill=True,
    data=bike_share_df,
)

In [None]:
# Empirical CDF of the raw target, plus its sample skewness.
sns.ecdfplot(data=bike_share_df,x="count")
# `count` is NaN for the rows that came from the test split, and scipy's
# `skew` propagates NaN by default, so the statistic is computed on the
# observed values only.
print("the skewness in target",skew(bike_share_df["count"].dropna()))

In [None]:
# Log-transform the target: log_count = log(1 + count).
raw_counts = bike_share_df["count"].to_numpy()
bike_share_df["log_count"] = np.log1p(raw_counts)

In [None]:
# Kernel density estimate of the log-transformed target.
sns.displot(
    x="log_count",
    kind="kde",
    fill=True,
    data=bike_share_df,
)

In [None]:
# Empirical CDF of the log-transformed target, plus its sample skewness.
sns.ecdfplot(data=bike_share_df,x="log_count")
# `log_count` is NaN wherever `count` is NaN (the test rows); drop those so
# scipy's `skew` does not propagate NaN into the printed statistic.
print("the skewness in target",skew(bike_share_df["log_count"].dropna()))

In [None]:
# Impute zero windspeed and humidity readings with a KNN imputer.
#
# Only predictor columns are fed to the imputer. Target-derived columns are
# excluded so that (a) neighbours are not chosen using the targets and
# (b) the NaN targets of the test rows are not silently filled in. Only the two
# imputed columns are written back, so all other columns keep their dtypes.
zero_as_missing = ["windspeed", "humidity"]
non_feature_columns = ["casual", "registered", "count", "log_count", "nonZeroClass"]
feature_columns = [col for col in bike_share_df.columns if col not in non_feature_columns]

# Work on a float copy so that zeros can be replaced by NaN markers.
imputer_input = bike_share_df[feature_columns].astype("float64")
imputer_input[zero_as_missing] = imputer_input[zero_as_missing].replace({0: np.nan})

imputer = KNNImputer(n_neighbors=10)
imputed_features = pd.DataFrame(
    imputer.fit_transform(imputer_input),
    columns=feature_columns,
    index=bike_share_df.index,
)
bike_share_df[zero_as_missing] = imputed_features[zero_as_missing]

In [None]:
# Re-check dtypes and non-null counts after the imputation step.
bike_share_df.info()

In [None]:
# Columns removed before modelling. "log_count" is intentionally not listed:
# it is the regression target and is split off from the features later.
drop_list = ["casual", "registered", "count", "nonZeroClass"]

In [None]:
# Cast the categorical features to pandas `category` dtype, leaving `holiday`
# and `workingday` as they are. The list is rebuilt rather than mutated with
# `.remove()`, so re-running this cell does not raise ValueError.
kept_numeric = ("holiday", "workingday")
categorical_features = [col for col in categorical_features if col not in kept_numeric]
bike_share_df = bike_share_df.astype({col: "category" for col in categorical_features})

In [None]:
# Cast the low-cardinality numerical features to `category` dtype as well.
discrete_dtypes = {feature: "category" for feature in discrete_features}
bike_share_df = bike_share_df.astype(discrete_dtypes)

In [None]:
# Confirm the dtypes after the category casts.
bike_share_df.info()

In [None]:
# bike_share_df = pd.get_dummies(bike_share_df)

In [None]:
# Rows with a known `count` form the training set; rows where it is missing
# form the test set. Columns in `drop_list` are removed from both.
has_target = bike_share_df["count"].notna()
train_new_df = bike_share_df.loc[has_target].drop(columns=drop_list)
test_new_df = bike_share_df.loc[~has_target].drop(columns=drop_list)

In [None]:
# train_new_df = train_new_df.filter(selection_list,axis=1)
# test_new_df = test_new_df.filter(selection_list,axis=1)

In [None]:
# Separate the predictors from the log-transformed target.
Y_train = train_new_df["log_count"]
X_train = train_new_df.drop(columns=["log_count"])

In [None]:
# Test predictors: same frame without the target column.
X_test = test_new_df.drop(
    columns="log_count",
)

In [None]:
# Preview the first rows of the training predictors.
X_train.head()

### General model

In [None]:
# Model pipelines: each estimator is preceded by a RobustScaler.
# Both estimators are seeded so that repeated Restart-and-Run-All executions
# give the same fitted models and scores.
hgbr = HistGradientBoostingRegressor(random_state=42)
hgbr_pipline = Pipeline([("rodbust_scaler",RobustScaler()),("hgbr",hgbr)])

rf = RandomForestRegressor(random_state=42)
rf_pipline = Pipeline([("rodbust_scaler",RobustScaler()),("rf",rf)])

In [None]:
class Mypipeline(Pipeline):
    """Pipeline that forwards ``coef_`` and ``feature_importances_`` to its final step.

    This exposes the last estimator's importance attributes on the pipeline
    object itself, so code that reads them from the estimator it was given
    can be handed the whole pipeline.
    """

    @property
    def coef_(self):
        """Coefficients of the final estimator."""
        last_estimator = self._final_estimator
        return last_estimator.coef_

    @property
    def feature_importances_(self):
        """Feature importances of the final estimator."""
        last_estimator = self._final_estimator
        return last_estimator.feature_importances_


pipeline = Mypipeline(steps=[("rodbust_scaler", RobustScaler()), ("rf", rf)])

In [None]:
# Feature selection: recursive feature elimination with cross-validation,
# removing one feature per step, driven by the random-forest pipeline.
rfecv = RFECV(
    estimator=pipeline,
    step=1,
    scoring="neg_mean_squared_log_error",
    n_jobs=-1,
    verbose=2,
)
feature_selection = rfecv.fit(X_train, Y_train)

# One row per input column: its name, whether it was kept, and its rank.
selected_feature_df = pd.DataFrame(
    {
        "features": X_train.columns,
        "supported": feature_selection.support_,
        "ranking": feature_selection.ranking_,
    }
)

In [None]:
# Manual override: mark the feature in row position 7 as not supported.
# Column position 1 is "supported" in the table built above.
# NOTE(review): the row is addressed by position, so it depends on the column
# order of X_train — confirm which feature this is meant to exclude.
override_row, supported_position = 7, 1
selected_feature_df.iloc[override_row, supported_position] = False

In [None]:
# Show the feature-selection table (name, supported flag, ranking).
selected_feature_df

In [None]:
# Keep only the training columns flagged as supported.
supported_rows = selected_feature_df["supported"] == True
supported_names = selected_feature_df[supported_rows]["features"]
X_train = X_train.filter(items=supported_names, axis=1)

In [None]:
# List the columns that survived feature selection.
X_train.columns

In [None]:
# 30-fold shuffled cross-validation of the gradient-boosting pipeline.
# The shuffle is seeded so the fold assignment, and therefore the reported
# scores, are reproducible across kernel restarts.
k_fold = KFold(n_splits=30, shuffle=True, random_state=42)

# Y_train is log1p(count), so every metric here is measured on the log scale;
# the root-mean-squared error on this scale corresponds to RMSLE on raw counts.
scores = cross_validate(
    estimator=hgbr_pipline,
    X=X_train,
    y=Y_train,
    scoring=["r2", "neg_root_mean_squared_error", "neg_mean_squared_log_error"],
    cv=k_fold,
    n_jobs=-1,
    verbose=2,
)

In [None]:
def print_scores(scores):
    """Print the mean of each cross-validation metric.

    Parameters
    ----------
    scores : mapping
        Must contain the keys ``fit_time``, ``score_time``, ``test_r2``,
        ``test_neg_root_mean_squared_error`` and
        ``test_neg_mean_squared_log_error``, each holding per-fold values.

    Returns
    -------
    None. The report is written to standard output.
    """
    # Negated metrics are flipped back to positive before averaging; the
    # squared log error is square-rooted per fold, then averaged.
    report = (
        ("Time to fit", np.mean(scores["fit_time"])),
        ("Score time", np.mean(scores["score_time"])),
        ("R2 score", np.mean(scores["test_r2"])),
        ("RMSE score", np.mean(-1 * scores["test_neg_root_mean_squared_error"])),
        ("RMSLE score", np.mean(np.sqrt(-1 * scores["test_neg_mean_squared_log_error"]))),
    )
    pieces = ["Model Evaluation:"]
    for label, value in report:
        pieces.append("\n\t" + label + ":")
        pieces.append(value)
    print(*pieces)

In [None]:
# Report the averaged cross-validation metrics.
print_scores(scores)

In [None]:
# Apply the same column selection to the test predictors.
supported_rows = selected_feature_df["supported"] == True
supported_names = selected_feature_df[supported_rows]["features"]
X_test = X_test.filter(items=supported_names, axis=1)

In [None]:
# Fit on the full training set, predict the test set and write the submission.
hgbr_pipline.fit(X_train,Y_train)
y_test_predict = hgbr_pipline.predict(X_test)

# Undo the log1p transform. A negative log-scale prediction would map to a
# negative count, so clip at zero; round to the nearest integer rather than
# truncating, which would bias every prediction downwards.
predicted_counts = np.clip(np.expm1(y_test_predict), 0, None)

bike_share_prediction = pd.DataFrame(
    {
        "datetime": test_df["datetime"].to_numpy(),
        "count": np.rint(predicted_counts).astype(int),
    }
)
bike_share_prediction.to_csv("/kaggle/working/bike_share_prediction.csv",index=False)