## Team Members 
1. Abdelrahman Sayed
2. Sherif ElHabyan
3. Ahmed Mamdouh

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_log_error 
from sklearn.model_selection import GridSearchCV
from sklearn.experimental import enable_hist_gradient_boosting

from sklearn.ensemble import HistGradientBoostingRegressor



In [None]:


# Load the training and test splits of the bike-sharing-demand dataset.
train_path = '../input/bike-sharing-demand/train.csv'
test_path = '../input/bike-sharing-demand/test.csv'

df = pd.read_csv(train_path)
test = pd.read_csv(test_path)


## EDA For Training Data

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training columns.
df.describe()

In [None]:
# Box plots of the continuous columns to eyeball their spread and outliers.
columns = ['atemp','humidity','windspeed','casual','registered','count']

# One figure, one panel per column. The original created the figure twice and
# called plt.show() inside the loop, so the panels were never rendered together
# as a single grid; here every box plot is drawn on its own explicit axis and
# the figure is shown once at the end.
n_cols = 3
n_rows = -(-len(columns) // n_cols)  # ceiling division
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(16, 5 * n_rows), squeeze=False)

for ax, column in zip(axes.ravel(), columns):
    sns.boxplot(y=df[column], ax=ax)
    ax.set_title(column)

# Hide any panels left over when the grid is larger than the column list.
for ax in axes.ravel()[len(columns):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()
    


In [None]:
# Column names, dtypes and non-null counts of the training frame.
df.info()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Pairwise correlations between the numeric columns, then each column's
# correlation with the target, strongest first.
# The raw `datetime` column is still text at this point, and recent pandas
# versions raise on non-numeric columns instead of silently skipping them,
# so restrict the frame to numeric dtypes explicitly.
corr_matrix = df.select_dtypes(include='number').corr()
corr_matrix["count"].sort_values(ascending=False)

In [None]:
# Correlation heatmap on a fixed [-1, 1] colour scale centred at zero.
palette = sns.diverging_palette(120, 340, n=200)
ax = sns.heatmap(corr_matrix, vmin=-1, vmax=1, center=0, cmap=palette, square=True)

## Encode DateTime

In [None]:
# Parse the raw timestamp and expand it into calendar parts.
# Train and test receive exactly the same derived columns.
for frame in (df, test):
    stamps = frame['datetime'].astype('datetime64[ns]')
    frame['Date_time'] = stamps
    frame['Year'] = stamps.dt.year
    frame['Month'] = stamps.dt.month
    frame['Day'] = stamps.dt.day
    frame['Hour'] = stamps.dt.hour

In [None]:
# Per-column histograms (50 bins each) of the training frame.
df.hist(bins=50, figsize=(20,15))


## Conclusion from above visualization 

1) `holiday` is dominated by zeros (almost every row is 0).
2) `holiday` has only a weak correlation with `count`.

So we drop this column.

In [None]:
# Total rentals per hour of the day.
df.groupby('Hour')['count'].sum().plot.bar()


In [None]:
# Total rentals per day of the month.
df.groupby('Day')['count'].sum().plot.bar()


In [None]:
# Total rentals per season.
df.groupby('season')['count'].sum().plot.bar()


In [None]:
# Total rentals for each distinct temperature reading.
rentals_by_temp = df.groupby('temp')['count'].sum()
rentals_by_temp.plot.bar(figsize=(15, 10))

# Re-check dtypes now that the datetime-derived columns exist.
df.info()

In [None]:
# Show the distribution of values in every column.
for column in df.columns:
    print(column)

    # The original printed `value_counts().sum` without calling it, i.e. the
    # repr of a bound method rather than a result. Print the value counts
    # themselves, which is the information useful for the conclusions drawn here.
    # NOTE(review): if only the non-null total was wanted, use `.value_counts().sum()`.
    print(df[column].value_counts())
    print("***********************************************")

In [None]:
# Plan: fill the zero windspeed readings; drop atemp together with holiday.

# Plot atemp against temp to see how closely the two track each other.
temp_values = df['temp']
atemp_values = df['atemp']
plt.plot(temp_values, atemp_values)
plt.show()

In [None]:
# Distribution of the windspeed readings.
windspeed = df['windspeed']
windspeed.hist()

## Feature Engineering

In [None]:
# --- Column clean-up ---------------------------------------------------------
# errors='ignore' keeps this cell re-runnable once `holiday` is already gone.
df = df.drop(columns=['holiday'], errors='ignore')
test = test.drop(columns=['holiday'], errors='ignore')

# Replace zero wind speed readings with each frame's own median.
df['windspeed'] = df['windspeed'].replace(0, df['windspeed'].median())
test['windspeed'] = test['windspeed'].replace(0, test['windspeed'].median())


# --- Calendar features -------------------------------------------------------
def add_calendar_features(frame):
    """Add weekday / night / weekend flags and cyclical time encodings.

    Expects the `Date_time`, `Hour` and `Month` columns created by the
    datetime-encoding cell. The columns are added to `frame` in place and the
    same frame is returned.
    """
    # Monday=0 ... Sunday=6, the same convention as calendar.weekday().
    # Vectorised, and avoids the row-by-row chained assignment
    # `frame['weekday'][ind] = ...`, which triggers SettingWithCopyWarning and
    # does not update the frame at all under pandas copy-on-write.
    frame['weekday'] = frame['Date_time'].dt.weekday

    # Night is any hour before 06:00 or after 20:00.
    frame['is_night'] = ((frame['Hour'] < 6) | (frame['Hour'] > 20)).astype(int)

    # Saturday (5) and Sunday (6).
    frame['weekend'] = frame['weekday'].isin([5, 6]).astype(int)

    # Sine/cosine pairs place each periodic value on a circle, so the end of a
    # period sits next to its start (e.g. hour 23 next to hour 0).
    frame['day_cos'] = np.cos(frame['Hour'] * (2 * np.pi / 24))
    frame['day_sin'] = np.sin(frame['Hour'] * (2 * np.pi / 24))
    frame['month_cos'] = np.cos(frame['Month'] * (2 * np.pi / 12))
    frame['month_sin'] = np.sin(frame['Month'] * (2 * np.pi / 12))
    frame['weekday_sin'] = np.sin(frame['weekday'] * (2 * np.pi / 7))
    frame['weekday_cos'] = np.cos(frame['weekday'] * (2 * np.pi / 7))
    return frame


df = add_calendar_features(df)
test = add_calendar_features(test)


# --- Outlier treatment (training frame only) ---------------------------------
# Values above the per-column threshold are overwritten with 1.5 * IQR of that
# column (IQR taken before the overwrite).
# NOTE(review): the thresholds look like the 75th percentiles from
# df.describe() - confirm. Overwriting everything above them with 1.5 * IQR
# (rather than capping at Q3 + 1.5 * IQR), and treating `windspeed` in train
# but not in test, is kept exactly as before; confirm it is intended.
outlier_thresholds = {
    'windspeed': 16.997900,
    'casual': 49.000000,
    'registered': 222.000000,
}

for column, threshold in outlier_thresholds.items():
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    replacement = 1.5 * (q3 - q1)

    # Assign the result back to the column instead of calling
    # `df[column].fillna(..., inplace=True)`: in-place edits on a column
    # selection do not reach `df` under copy-on-write. The float cast makes the
    # integer columns able to hold a fractional replacement value.
    values = df[column].astype(float)
    df[column] = values.mask(values > threshold, replacement)

## Model

In [None]:
# Hold out 20% of the rows for validation (shuffled, fixed seed).
# TODO: try adding `stratify` to the split.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

# Separate the target column from the predictors in each split.
X_train, y_train = train_df.drop(columns=['count']), train_df['count']
X_val, y_val = val_df.drop(columns=['count']), val_df['count']

print(list(train_df.columns))

In [None]:
# Columns fed to the model; train, validation and test all use the same list
# in the same order.
feature_columns = [
    'weekend', 'is_night', 'weekday', 'temp', 'season', 'workingday',
    'weather', 'humidity', 'windspeed', 'Year', 'Month', 'Day', 'Hour',
]

X_train = X_train[feature_columns]
X_val = X_val[feature_columns]
X_test = test[feature_columns]

In [None]:
RegModel = XGBRegressor(learning_rate=0.07, max_depth=10, alpha=6, n_estimators=350)

# Train on log(count + 1): squared error on this target lines up with the
# RMSLE metric reported below.
y_train = np.log1p(train_df['count'])

# Fit the model on the log-transformed target.
RegModel.fit(X_train, y_train)


def to_counts(log_predictions):
    """Map model outputs on the log1p scale back to rental counts.

    expm1 is the exact inverse of log1p; the previous plain `exp` left every
    prediction one rental too high. Results are floored at zero because
    mean_squared_log_error rejects negative values.
    """
    return np.clip(np.expm1(log_predictions), 0, None)


# Training error, scored against the raw counts (the original compared
# count + 1 with prediction + 1).
train_predict = to_counts(RegModel.predict(X_train))
Error1 = np.sqrt(mean_squared_log_error(train_df['count'], train_predict))
print('RMSLE Training:' + str(Error1))

# Validation error on the held-out rows.
predict = to_counts(RegModel.predict(X_val))
Error = np.sqrt(mean_squared_log_error(y_val, predict))
print('RMSLE Validation:' + str(Error))

In [None]:
# The model predicts log(count + 1), so invert with expm1. Plain `exp` made
# every prediction one rental too high. Floor at zero so no negative count is
# ever submitted.
y_test_predicted = RegModel.predict(X_test)
y_test_predicted = np.clip(np.expm1(y_test_predicted), 0, None)

# Write the submission file: the original timestamp string plus the predicted
# count truncated to an integer.
test['count'] = y_test_predicted.astype(int)
test[['datetime', 'count']].to_csv('/kaggle/working/submission.csv', index=False)
