In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn 
import warnings
import math
warnings.filterwarnings("ignore")

In [None]:
# Load the competition train/test CSVs. The *_orig frames are kept as loaded
# (a later cell reads the raw test timestamps from df_test_orig); df and
# df_test are the working copies that the following cells modify.
df_orig, df_test_orig = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/bike-sharing-demand/train.csv',
        '../input/bike-sharing-demand/test.csv',
    )
)
df, df_test = df_orig.copy(), df_test_orig.copy()
df.head()

In [None]:
# 'casual' and 'registered' are not used as features anywhere below; remove them.
df = df.drop(columns=['casual', 'registered'])

In [None]:
# Number of missing values in each training column.
df.isna().sum()

In [None]:
# Column dtypes right after loading (before the datetime column is split up).
df.dtypes

In [None]:
# Split the datetime string into hour-of-day and month.
# The character offsets (5:7 for month, 11:13 for hour) assume the
# 'YYYY-MM-DD HH:MM:SS' layout, exactly as the original per-character indexing did.
# Vectorised string slicing replaces the row-by-row iloc loop, which was slow
# and relied on 'time' sitting at hard-coded column position 10.
# The next cell expects: hour in df['time'], month left behind in df['datetime'].
timestamps = df['datetime'].astype(str)
df['time'] = timestamps.str[11:13].astype(int)
df['datetime'] = timestamps.str[5:7].astype(int)

In [None]:
# At this point 'datetime' holds the month number; expose it as 'month',
# discard 'datetime', and make sure both derived columns are integer-typed.
df = (
    df.assign(month=df['datetime'])
      .drop(columns=['datetime'])
      .astype({'time': 'int', 'month': 'int'})
)
df.head()

In [None]:
# Re-check dtypes now that 'time' and 'month' replace the datetime string.
df.dtypes

### ANALYSING NUMERICAL DATA

In [None]:
# Demand against temperature; small markers keep the dense cloud readable.
fig, ax = plt.subplots()
ax.scatter(df['temp'], df['count'], color='c', s=2)
ax.set_title('Temp vs Demand')
ax.set_xlabel('Temp')
ax.set_ylabel('Demand')

In [None]:
# Demand against the feels-like temperature.
fig, ax = plt.subplots()
ax.scatter(df['atemp'], df['count'], color='b', s=2)
ax.set_title('Feels-like Temp vs Demand')
ax.set_xlabel('Feels-like Temp')
ax.set_ylabel('Demand')

In [None]:
# Demand against humidity.
fig, ax = plt.subplots()
ax.scatter(df['humidity'], df['count'], color='y', s=2)
ax.set_title('Humidity vs Demand')
ax.set_xlabel('Humidity')
ax.set_ylabel('Demand')

In [None]:
# Demand against wind speed.
fig, ax = plt.subplots()
ax.scatter(df['windspeed'], df['count'], color='m', s=2)
ax.set_title('Windspeed vs Demand')
ax.set_xlabel('Windspeed')
ax.set_ylabel('Demand')

### ANALYSING CATEGORICAL DATA

In [None]:
# Mean demand for each hour of the day.
# `colors` is reused by the bar charts in the following cells.
colors = ['c', 'teal', 'm', 'orange', 'y']
cat_avg = df.groupby('time')['count'].mean()
# Take the x values from the grouped index so every bar sits on its own
# category: unique() returns first-appearance order, whereas groupby returns
# sorted keys, and the two only line up by coincidence.
cat = cat_avg.index
plt.bar(cat, cat_avg, color=colors)
plt.title('Demand at different Time of the day')
plt.xlabel('Time')
plt.ylabel('Demand')
plt.show()

In [None]:
# Mean demand per season.
# 1-spring 2-summer 3-fall 4-winter
cat_avg = df.groupby('season')['count'].mean()
# x values come from the grouped (sorted) index so bars and labels cannot get
# out of step, which pairing unique() with groupby output does not guarantee.
cat = cat_avg.index
plt.bar(cat, cat_avg, color=colors)
plt.title('Demand in different Seasons')
plt.xlabel('Season')
plt.ylabel('Demand')
plt.show()

In [None]:
# Mean demand on holidays vs. non-holidays.
cat_avg = df.groupby('holiday')['count'].mean()
# x values come from the grouped (sorted) index so bars and labels cannot get
# out of step, which pairing unique() with groupby output does not guarantee.
cat = cat_avg.index
plt.bar(cat, cat_avg, color=colors)
plt.title('Demand vs Holiday/Non-holiday')
plt.xlabel('Holiday')
plt.ylabel('Demand')
plt.show()
# Author's conclusion: no significant information here, so 'holiday' can be dropped.

In [None]:
# Mean demand on working days vs. non-working days.
cat_avg = df.groupby('workingday')['count'].mean()
# x values come from the grouped (sorted) index so bars and labels cannot get
# out of step, which pairing unique() with groupby output does not guarantee.
cat = cat_avg.index
plt.bar(cat, cat_avg, color=colors)
plt.title('Demand vs Workingday/Non-workingday')
plt.xlabel('Workingday')
plt.ylabel('Demand')
plt.show()
# Author's conclusion: 'workingday' can be dropped as well.

In [None]:
# Mean demand per weather condition.
# 1-clear 2-mist 3-light rain 4-heavy rain
cat_avg = df.groupby('weather')['count'].mean()
# x values come from the grouped (sorted) index so bars and labels cannot get
# out of step, which pairing unique() with groupby output does not guarantee.
cat = cat_avg.index
plt.bar(cat, cat_avg, color=colors)
plt.title('Demand in different Weather conditions')
plt.xlabel('Weather')
plt.ylabel('Demand')
plt.show()

In [None]:
# Mean demand per calendar month.
cat_avg = df.groupby('month')['count'].mean()
# x values come from the grouped (sorted) index so bars and labels cannot get
# out of step, which pairing unique() with groupby output does not guarantee.
cat = cat_avg.index
plt.bar(cat, cat_avg, color=colors)
plt.title('Demand in different month')
plt.xlabel('Month')
plt.ylabel('Demand')
plt.show()

In [None]:
# Distribution of the raw hourly demand.
df['count'].hist(bins=20)
# The histogram looks roughly log-normal, so the target is log-transformed before training the models.

In [None]:
# Replace the target with its natural log; the models below are fitted on this
# column, so their predictions are log-demand, not demand.
# NOTE(review): np.log yields -inf for a zero count — confirm the training data contains no zeros.
df['count'] = np.log(df['count'])
df['count'].hist(bins=20)
# Now the distribution is closer to a (skewed) normal distribution.

### CORRELATION OF DIFF FEATURES

In [None]:
# Pairwise correlations between all remaining columns, drawn as an annotated heatmap.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(10, 10))
sn.heatmap(corr_matrix, annot=True, linewidths=0.15, ax=ax)
plt.show()
# Author's reading of the heatmap, which drives the column drop in the next cell:
# - temp and atemp are highly correlated, so atemp is dropped.
# - holiday and workingday correlate weakly with count, so both are dropped (as decided earlier).
# - weather and windspeed are correlated; windspeed is dropped because its correlation with count is the lower of the two.
# - time correlates strongly with count, which suggests count is a time series; check it for autocorrelation.
# - month and season are highly correlated, so month is dropped.

In [None]:
# Remove the columns ruled out by the analysis above.
redundant_columns = ['holiday', 'workingday', 'atemp', 'windspeed', 'month']
df = df.drop(columns=redundant_columns)
df.head()

### CHECKING FOR AUTOCORRELATION IN COUNT FEATURE

In [None]:
# #Since count seems to be a time-series type of data.. it can have auto-correlation. 
# #We can see that there is high auto-correlation upto 5 lags. Lets make shifts but till 3 lags only
# temp = pd.to_numeric(df['count'], downcast='float')
# plt.acorr(temp, maxlags=12)

In [None]:
# t_1 = df['count'].shift(+1).to_frame()
# t_1.columns = ['t-1']

# t_2 = df['count'].shift(+2).to_frame()
# t_2.columns = ['t-2']

# t_3 = df['count'].shift(+3).to_frame()
# t_3.columns = ['t-3']

# df_lag = pd.concat([df, t_1, t_2, t_3], axis=1)
# df_lag.head()

In [None]:
# df_lag = df_lag.dropna()
# df_lag.head()

### GETTING DUMMY VARIABLES

In [None]:
# dtypes before encoding: season, weather and time are still plain numeric columns here.
df.dtypes

In [None]:
# Mark the discrete columns as categorical so that get_dummies encodes them.
df = df.astype({'season': 'category', 'weather': 'category', 'time': 'category'})

In [None]:
# One-hot encode the category-typed columns; drop_first=True leaves out the
# first level of each encoded column.
df_final = pd.get_dummies(df, drop_first=True)
df_final.head()

### SPLITTING DATA

In [None]:
# Features are every encoded column except the target; the target is kept as a
# one-column frame so that Y.values is 2-D.
X = df_final.drop(columns=['count'])
Y = df_final.loc[:, ['count']]

In [None]:
# A random split (the usual train_test_split) is not appropriate here: the data
# is time dependent, i.e. demand is auto-correlated, and sampling rows at random
# would break that ordering. Instead take one continuous chunk of rows:
# the first 80% for training, the remaining 20% for validation.

l = int(0.8 * X.shape[0])
X_train, X_cv = X.values[:l], X.values[l:]
Y_train, Y_cv = Y.values[:l], Y.values[l:]

### TRAINING THE MODEL

In [None]:
# Baseline model: scikit-learn's LinearRegression with default settings.
from sklearn.linear_model import LinearRegression
mlr = LinearRegression()

In [None]:
# Fit on the training chunk and predict the validation chunk.
# fit() returns the estimator itself, so the two calls can be chained.
Y_pred_cv = mlr.fit(X_train, Y_train).predict(X_cv)

In [None]:
# R^2 on the training chunk and on the validation chunk.
r2_train, r2_cv = mlr.score(X_train, Y_train), mlr.score(X_cv, Y_cv)
print(r2_train, r2_cv)

In [None]:
# Validation RMSE. The target is log-demand, so this error is in log units.
from sklearn.metrics import mean_squared_error
import math
mse_cv = mean_squared_error(Y_cv, Y_pred_cv)
rmse = math.sqrt(mse_cv)
rmse

### TESTING THE MODEL

In [None]:
# First rows of the working copy of the test set.
df_test.head()

In [None]:
# Number of missing values in each test column.
df_test.isna().sum()

In [None]:
# Reduce the test datetime string to its hour of day (24h).
# Offsets 11:13 assume the 'YYYY-MM-DD HH:MM:SS' layout, exactly as the
# original per-character indexing did. Vectorised slicing replaces the
# row-by-row iloc loop, which was slow and tied to column position 0.
# The next cell copies this column into 'time'.
df_test['datetime'] = df_test['datetime'].astype(str).str[11:13].astype(int)
df_test.head()

In [None]:
# 'datetime' now holds the hour; expose it as integer 'time' and discard 'datetime'.
df_test = (
    df_test.assign(time=df_test['datetime'])
           .drop(columns=['datetime'])
           .astype({'time': 'int'})
)

In [None]:
# Drop the same feature columns that were removed from the training frame
# (the test frame never had a 'month' column).
df_test = df_test.drop(columns=['holiday', 'workingday', 'atemp', 'windspeed'])

In [None]:
# dtypes of the reduced test frame before encoding.
df_test.dtypes

In [None]:
# Mark the same discrete columns as categorical on the test frame.
df_test = df_test.astype({'season': 'category', 'weather': 'category', 'time': 'category'})

In [None]:
# One-hot encode the test frame the same way as the training frame.
# NOTE(review): the resulting columns depend on which category levels occur in
# the test data — confirm they match the training design matrix.
df_test = pd.get_dummies(df_test, drop_first=True)
df_test.head()

In [None]:
# Predict log-demand for the test rows.
# The model was fitted on a bare array, so it matches features purely by
# position. Reindexing to X.columns guarantees the test matrix has the training
# columns in the training order: a dummy column absent from the test frame is
# added as all zeros, and any extra column is ignored.
X_test_aligned = df_test.reindex(columns=X.columns, fill_value=0).astype(float)
Y_test = mlr.predict(X_test_aligned.values)

In [None]:
# Raw model output for the test rows (log-demand, one column).
Y_test

In [None]:
# Turn the model output into integer bike counts.
# The model was trained on log(count), so its predictions must be
# exponentiated before rounding; rounding the raw output would submit
# log-demand as if it were demand.
# np.ceil works on the whole array at once and keeps the (n, 1) shape that the
# next cell indexes with [:, 0].
Y_final = np.ceil(np.exp(Y_test)).astype('int')
Y_final

In [None]:
# Flatten the single prediction column into a Series (default integer index).
tmp = pd.Series(Y_final[:, 0])
tmp

In [None]:
# Submission frame: the original test timestamps plus the predicted counts.
# .copy() makes this an independent frame, so adding a column is not a write
# into a slice of df_test_orig (the pattern behind SettingWithCopyWarning).
Y_submission = df_test_orig[['datetime']].copy()
# Assign by position; a length mismatch then raises instead of silently
# producing NaN through index alignment.
Y_submission['count'] = tmp.values
Y_submission

In [None]:
# Write the baseline submission without the index column.
Y_submission.to_csv('answer.csv', index=False)

In [None]:
# Attach the first model's predictions (tmp) to the test frame as a stand-in
# 'count' column — presumably because the test CSV carries no observed demand.
# The lag cells below shift this column.
df_test['count'] = tmp
df_test.head()

### CHECKING FOR AUTOCORRELATION IN COUNT FEATURE

In [None]:
# count behaves like a time series, so it may be auto-correlated.
# Author's reading of the plot: strong autocorrelation up to about 5 lags; only 3 lags are used as features below.
temp = pd.to_numeric(df['count'], downcast='float')
plt.acorr(temp, maxlags=12)

In [None]:
# Lagged copies of the (log) target: the value 1, 2 and 3 rows earlier.
# They are appended to df, where season/weather/time are category-typed
# columns rather than one-hot columns.
t_1, t_2, t_3 = (
    df['count'].shift(periods=steps).to_frame(name=label)
    for steps, label in ((1, 't-1'), (2, 't-2'), (3, 't-3'))
)

df_lag = pd.concat([df, t_1, t_2, t_3], axis=1)
df_lag.head()

In [None]:
# Shifting leaves NaN in the first rows (no earlier value to copy); drop those rows.
df_lag = df_lag.dropna()
df_lag.head()

In [None]:
# Same three lags for the test frame, taken from its stand-in 'count' column
# (the first model's predictions attached earlier).
t_1, t_2, t_3 = (
    df_test['count'].shift(periods=steps).to_frame(name=label)
    for steps, label in ((1, 't-1'), (2, 't-2'), (3, 't-3'))
)

df_lag_test = pd.concat([df_test, t_1, t_2, t_3], axis=1)
df_lag_test.head()

In [None]:
# Drop the leading test rows whose lag columns are NaN.
df_lag_test = df_lag_test.dropna()
df_lag_test.head()

In [None]:
# Features and target for the lag model; the target stays a one-column frame.
X_lag = df_lag.drop(columns=['count'])
Y_lag = df_lag.loc[:, ['count']]

In [None]:
# Chronological 80/20 split of the lag data: first 80% of rows for training,
# the rest for validation.
l = int(0.8 * X_lag.shape[0])
X_train_lag, X_cv_lag = X_lag.values[:l], X_lag.values[l:]
Y_train_lag, Y_cv_lag = Y_lag.values[:l], Y_lag.values[l:]

In [None]:
# Fit a second linear model on the lag features and predict the validation chunk.
# fit() returns the estimator itself, so construction and fitting are chained.
mlr_lag = LinearRegression().fit(X_train_lag, Y_train_lag)
Y_pred_lag = mlr_lag.predict(X_cv_lag)

In [None]:
# R^2 of the lag model on its training chunk and on its validation chunk.
r2_train_lag, r2_cv_lag = (
    mlr_lag.score(X_train_lag, Y_train_lag),
    mlr_lag.score(X_cv_lag, Y_cv_lag),
)
print(r2_train_lag, r2_cv_lag)

In [None]:
# Validation RMSE of the lag model, in log units like the target.
# Arguments are passed as (y_true, y_pred); the squared error is symmetric,
# so the value is the same as with the arguments swapped.
mse_lag = mean_squared_error(Y_cv_lag, Y_pred_lag)
rmse_lag = math.sqrt(mse_lag)
rmse_lag

In [None]:
# Feature frame passed to the lag model in the cells below.
# NOTE(review): despite its name this is built from df_lag, the TRAINING lag
# frame, not from df_lag_test. df_lag_test cannot be used as-is: it carries
# one-hot columns, whereas mlr_lag was fitted on df_lag's raw
# season/weather/time columns. Confirm what was intended here.
X_test_lag = df_lag.drop(['count'], axis=1)
X_test_lag.head()

In [None]:
# Build the lag-model submission from the actual test rows.
#
# mlr_lag was fitted on X_lag's columns: raw season / weather / time values,
# temp, humidity and three lagged log-counts. Rebuild exactly that layout from
# the raw test CSV, so the predictions belong to the test timestamps they are
# written next to, rather than to rows of the training frame.
test_hours = df_test_orig['datetime'].astype(str).str[11:13].astype(int)

# No observed demand is used for the test rows; the first model's log-space
# predictions (Y_test) stand in for it, which matches the scale of the
# training lags. The leading rows have no earlier value, so their lags are
# back-filled to keep one prediction per test row.
lag_source = pd.Series(np.ravel(Y_test), index=df_test_orig.index)
test_lags = pd.concat(
    [lag_source.shift(steps).rename(label)
     for steps, label in ((1, 't-1'), (2, 't-2'), (3, 't-3'))],
    axis=1,
).bfill()

test_lag_features = pd.concat(
    [
        df_test_orig[['season', 'weather', 'temp', 'humidity']],
        test_hours.rename('time'),
        test_lags,
    ],
    axis=1,
)
# Same columns, same order as the training matrix (the model matches by position).
test_lag_features = test_lag_features[list(X_lag.columns)].astype(float)

# Predictions are log-demand; exponentiate before rounding up to whole bikes.
Y_ans_lag = mlr_lag.predict(test_lag_features.values)
Y_ans_lag = np.ceil(np.exp(Y_ans_lag)).astype('int')

tmp = pd.Series(Y_ans_lag[:, 0])

# .copy() gives an independent frame; assign by position so a length mismatch
# raises instead of silently yielding NaN.
Y_submission_lag = df_test_orig[['datetime']].copy()
Y_submission_lag['count'] = tmp.values
Y_submission_lag

In [None]:
# R^2 of the lag model on X_test_lag against Y_lag.
# NOTE(review): X_test_lag is derived from df_lag, so this scores the labelled
# training rows (including the ones the model was fitted on); it is not a
# held-out test score.
r2 = mlr_lag.score(X_test_lag, Y_lag)
r2

In [None]:
# Write the lag-model submission without the index column.
Y_submission_lag.to_csv('answer_lag.csv', index=False)