# Bike sharing
ITI Alex G1

* Nouhrhan Elaraby
* Lujain Kotb
* Ibrahim Yaseen
* Mohamed Abd Elrazik
* Sondos Omar

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import KFold
import gc
import os
import warnings
# Silence every warning for the rest of the session to keep the cell output
# readable. NOTE(review): this also hides deprecation warnings from
# pandas/seaborn/lightgbm, so failures after a library upgrade arrive unannounced.
warnings.filterwarnings(action = 'ignore')

In [None]:
# Load the training data.
# Show every column when a frame is displayed instead of truncating with "...".
pd.options.display.max_columns = None

train_csv_path = '../input/bike-sharing-demand/train.csv'
df_train = pd.read_csv(train_csv_path)
# Last expression of the cell: displays (rows, columns) of the training frame.
df_train.shape

In [None]:
# Display summary statistics of the training frame.
df_train.describe()

In [None]:
# Count the missing values in each column of the training frame.
df_train.isnull().sum()

In [None]:
# Box plot of the features left after removing the timestamp, the targets and
# the categorical columns, to eyeball their spread and outliers.
# The frame is reshaped to long form: one row per (feature, value) pair.
melt = pd.melt(
    df_train,
    value_vars=df_train.drop(
        columns=['datetime', 'casual', 'registered', 'count',
                 'holiday', 'workingday', 'weather', 'season'],
    ).columns,
    # var_name must be a scalar; a one-element list is rejected by newer
    # pandas releases when the columns are not a MultiIndex.
    var_name='feature',
    value_name='value',
)

plt.figure(figsize = (10,8))
sns.set(font_scale=1.4)
sns.boxplot(data = melt, y = 'feature',x='value', palette="Blues_r").set(ylabel = 'feature',title = 'boxplot');

In [None]:
def time_transform(df):
    """Parse the ``datetime`` column and add calendar features.

    The frame is modified in place: ``datetime`` is converted with
    ``pd.to_datetime`` and the columns ``month``, ``hour`` and ``weekday``
    (pandas ``dayofweek``, Monday == 0) are added. The same frame object is
    returned so the call can be used in an assignment.
    """
    timestamps = pd.to_datetime(df['datetime'])
    df['datetime'] = timestamps

    # (new column name, accessor attribute on ``Series.dt``), in insertion order.
    derived_columns = (
        ('month', 'month'),
        ('hour', 'hour'),
        ('weekday', 'dayofweek'),
    )
    for column_name, accessor in derived_columns:
        df[column_name] = getattr(timestamps.dt, accessor)

    return df

# Add the month / hour / weekday columns to the training frame
# (time_transform also converts 'datetime' to a real datetime column).
df_train = time_transform(df_train)

In [None]:
# Compare average hourly rentals on working days with days off.

fig, ax = plt.subplots(2,1,figsize =(12,10))
sns.set(font_scale=1.2)

# Take the x positions from the groupby index itself. Using
# df_train['hour'].unique() yields hours in order of first appearance over the
# whole frame, which is not guaranteed to match the sorted groupby result of a
# subset, neither in order nor in length.
hourly_days_off = df_train.loc[df_train['workingday'] == 0].groupby('hour')['count'].mean()
hourly_working = df_train.loc[df_train['workingday'] == 1].groupby('hour')['count'].mean()

sns.barplot(x = hourly_days_off.index,
            y = hourly_days_off.values,
            ax = ax[0], palette= 'PuBu').set(title = 'Average bikes hourly rental on weekends and holidays',
                                             xlabel = 'hours', ylabel = 'average number of rental ')
sns.barplot(x = hourly_working.index,
            y = hourly_working.values,
            ax = ax[1], palette='RdPu').set(title='Average bikes hourly rental on working days',
                                           xlabel = 'hours',ylabel = 'average number of rental');


In [None]:
# Monthly rentals split by user type (casual vs registered), one panel per
# value of 'workingday'.
# Long form: one row per (month, workingday, user type) with its rental count.
user_table = pd.melt(
    df_train,
    id_vars=['month', 'workingday'],
    # Pass the column names; handing melt a DataFrame only worked because
    # iterating a DataFrame happens to yield its column labels.
    value_vars=['casual', 'registered'],
    var_name='user',
    value_name='count',
)

# sns.factorplot was renamed to catplot and later removed from seaborn.
# factorplot drew point plots by default, so request kind='point' explicitly.
plot = sns.catplot(x = 'month',y = 'count',data = user_table,col = 'workingday',
                   hue = 'user',kind = 'point',height = 5);

In [None]:
# Correlation heatmap of the numeric columns.
plt.figure(figsize=(14,8))
sns.set(font_scale=0.9)
# Restrict to numeric columns explicitly: the frame may contain the parsed
# 'datetime' column, which recent pandas versions no longer drop silently
# when computing correlations.
numeric_columns = df_train.select_dtypes(include=[np.number])
sns.heatmap(data = numeric_columns.corr(), annot=True, linewidths=0.5);

In [None]:
# Average rental count for each weather category.
fig, ax = plt.subplots(figsize = (6,4))

# Use the groupby index for the x positions so each bar is paired with the
# weather code it was computed for; df_train['weather'].unique() returns the
# codes in order of first appearance, which need not match the sorted groupby.
weather_means = df_train.groupby('weather')['count'].mean()
sns.barplot(x = weather_means.index, y = weather_means.values, ax = ax)

# Label after plotting so the labels are not overwritten; 'weather' belongs on
# the x axis, the y axis shows the mean count.
ax.set(title = "Average bikes rent depends on a weather",
       xlabel = 'weather', ylabel = 'average number of rental');

In [None]:
# Model inputs: everything except the timestamp, the target and its two
# components (casual + registered), and the 'holiday' flag.
features = df_train.drop(['datetime', 'casual', 'registered', 'count', 'holiday'], axis=1)
# Model target: log(1 + count); predictions are mapped back with expm1 later.
target = np.log1p(df_train.loc[:, 'count'])

In [None]:
# Train one LightGBM regressor per K-fold split and keep all fitted boosters
# in `models` so their predictions can be averaged afterwards.

# Columns LightGBM should treat as categorical.
categorical_features = ['workingday','weather', 'season']

params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 1280,
    "learning_rate": 0.05,
    "feature_fraction": 0.85,
    "reg_lambda": 2,
    "metric": "rmse",
}

# The `verbose_eval` / `early_stopping_rounds` keyword arguments were removed
# from lgb.train in LightGBM 4; the callback API works on old and new
# releases. `log_evaluation` was called `print_evaluation` before 3.3.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

kf = KFold(n_splits=2)
models = []
for train_index,test_index in kf.split(features):

    # KFold.split yields positional indices, so select rows with .iloc;
    # .loc would only be correct while the frames keep a default RangeIndex.
    train_features = features.iloc[train_index]
    train_target = target.iloc[train_index]
    test_features = features.iloc[test_index]
    test_target = target.iloc[test_index]

    d_training = lgb.Dataset(train_features, label=train_target,
                             categorical_feature=categorical_features,
                             free_raw_data=False)

    d_test = lgb.Dataset(test_features, label=test_target,
                         categorical_feature=categorical_features, free_raw_data=False)

    # Stop when the held-out fold has not improved for 50 rounds and log the
    # metrics every 25 rounds. Fresh callbacks are created for every fold.
    model = lgb.train(params, train_set=d_training, num_boost_round=1000,
                      valid_sets=[d_training,d_test],
                      callbacks=[lgb.early_stopping(stopping_rounds=50),
                                 log_evaluation(25)])
    models.append(model)
    # Drop the per-fold frames and datasets before the next fold is built.
    del train_features, train_target, test_features, test_target, d_training, d_test
    gc.collect()

In [None]:
# Draw one feature-importance chart per fitted fold model.
for fitted_model in models:
    lgb.plot_importance(fitted_model)
    plt.show()

In [None]:
# Load the test split and give it the same calendar features as the training data.
df_test = time_transform(pd.read_csv('../input/bike-sharing-demand/test.csv'))
# Model input: drop the columns that were also excluded from the training features.
test = df_test.drop(['datetime', 'holiday'], axis=1)

In [None]:
# Average the fold models' predictions on the test set.
# Start from a zero array instead of an empty list: comparing a NumPy array
# with `[]` (as the `results == []` check did from the second model on) is an
# ill-defined element-wise comparison and raises on recent NumPy versions.
results = np.zeros(len(test))
for model in models:
    # Models were trained on log1p(count); expm1 maps predictions back to counts.
    # Each model contributes an equal share of the final prediction.
    results += np.expm1(model.predict(test, num_iteration=model.best_iteration)) / len(models)

In [None]:
# Build the submission table (timestamp + predicted count) and write it to disk
# without the index column.
result_df = df_test[['datetime']].assign(**{'count': results})
result_df.to_csv("submission.csv", index=False)