In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in files)
    for path in full_paths:
        print(path)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [None]:
%matplotlib inline

In [None]:
df=pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv')
test=pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.nunique()    
# Distinct-value count per column. season, holiday, workingday and weather have
# only a handful of levels each, so they are treated as categorical features below.

In [None]:
df['count'].value_counts()  # need to see its distribution

In [None]:
df.head()

In [None]:
df.isna().sum()  # There are no null values

In [None]:
df.describe(include = 'all')

In [None]:
df.columns

In [None]:
#season            4
#holiday           2
#workingday        2
#weather           4

# Summarise each low-cardinality feature: how many levels it has, what the
# levels are, and how many rows fall into each level.
for col in ['season', 'holiday', 'workingday', 'weather']:
    n_levels = df[col].nunique()
    levels = list(df[col].unique())
    level_counts = df[col].value_counts()
    print('**************************************')
    print('\n')
    print(f'Feature {col} has {n_levels} unique variables \n\n The list of unique variables is {levels} \n\n and their count is \n\n {level_counts} ')

In [None]:
cat_cols = ['season', 'holiday', 'workingday', 'weather']
df[cat_cols] = df[cat_cols].astype('category')

In [None]:
df.dtypes

In [None]:
df['datetime'] = pd.to_datetime(df['datetime'])
df.dtypes

In [None]:
# Break the parsed timestamp into calendar components, one column each.
df['datetime_year'] = df['datetime'].dt.year
df['datetime_date'] = df['datetime'].dt.day  # day of the month, despite the "_date" suffix
df['datetime_month']=df['datetime'].dt.month
df['datetime_hour'] = df['datetime'].dt.hour
df['datetime_weekday']=df['datetime'].dt.weekday  # pandas numbers Monday as 0 ... Sunday as 6


In [None]:
df.dtypes

In [None]:
df.head()

In [None]:
df.describe(include='all')

In [None]:
df.drop('datetime', axis = 1, inplace = True) # raw timestamp is redundant once its components are separate columns; in-place, so re-running this cell raises KeyError

In [None]:
df.describe(include = 'all')

In [None]:
df.head()

# EDA

## BiVariate Analysis

In [None]:
#Correlation for the numerical attributes

#temp	atemp	humidity	windspeed	casual	registered	count

df_corr = df[['temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count']] 
corr = df_corr.corr(method ='pearson') 
corr


In [None]:
# Plot the Pearson correlation matrix computed above as a heatmap.
ax = sns.heatmap(
    corr, 
    vmin=-1, vmax=1, center=0,  # pin the colour scale to the full correlation range, centred on zero
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
# Tilt the x tick labels so the column names do not overlap.
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);  # trailing semicolon suppresses the returned list's repr in the cell output

In [None]:
df.dtypes

In [None]:
ax = sns.boxplot(x="datetime_year", y="count", data=df)


In [None]:
ax = sns.boxplot(x="datetime_date", y="count", data=df)

In [None]:
ax = sns.boxplot(x="datetime_month", y="count", data=df)

In [None]:
ax = sns.boxplot(x="datetime_hour", y="count", data=df)

In [None]:
ax = sns.boxplot(x="datetime_weekday", y="count", data=df)

# Feature Engineering

In [None]:
#Drop date and weekday; Keep year , month and datetime_hour

In [None]:
cat_cols = ['datetime_year', 'datetime_month','datetime_hour' ]
df[cat_cols] = df[cat_cols].astype('category')

In [None]:
df.dtypes

In [None]:
# Discard the columns that are not fed to the models.
# NOTE(review): casual and registered are presumably sub-totals of the target
# (count) and would leak it if kept — confirm against the dataset description.
df.drop(['season','windspeed','registered', 'casual', 'datetime_date', 'datetime_weekday', ], axis = 1, inplace = True )

In [None]:
df.drop('atemp', axis = 1, inplace = True) # atemp and temp highly correlated; count and temp has higher correlation so drop atemp

In [None]:
df.dtypes

In [None]:
df.head()

In [None]:
df.describe(include='all')

In [None]:
df.shape

In [None]:
###Done with train data

## Test data

In [None]:
test.shape

In [None]:
final_pred = test['datetime']  # keep the raw timestamp strings for the submission file, before the column is parsed and dropped below

In [None]:
final_pred.head()

In [None]:
test.isna().sum() # no null values - good

In [None]:
test.dtypes

In [None]:
test['datetime'] = pd.to_datetime(test['datetime'])

In [None]:
# Mirror the training-set feature extraction: one column per calendar component.
test_dt = test['datetime'].dt
test['datetime_year'] = test_dt.year
test['datetime_date'] = test_dt.day
test['datetime_month'] = test_dt.month
test['datetime_hour'] = test_dt.hour
test['datetime_weekday'] = test_dt.weekday

In [None]:
test.dtypes

In [None]:
cat_cols = ['holiday','workingday','weather','datetime_year', 'datetime_month', 'datetime_hour']
test[cat_cols] = test[cat_cols].astype('category')

In [None]:
test.dtypes

In [None]:
test.drop(['datetime','season','atemp', 'windspeed','datetime_date', 'datetime_weekday'], axis = 1, inplace = True)

In [None]:
test.dtypes

In [None]:
test.head().T

In [None]:
test.shape

# Model Building

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score 
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import AdaBoostClassifier
from sklearn_pandas import DataFrameMapper
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from xgboost import XGBRegressor


from xgboost import XGBClassifier

# Splitting train and validation data

In [None]:
# The count column is the regression target; every remaining column is a feature.
y=df["count"]
X=df.drop('count', axis=1)
# Hold out 20% of the rows for validation; the fixed random_state keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20,random_state=340)

In [None]:
# Sanity-check the split sizes: features and targets for train, then validation.
for split_part in (X_train, X_val, y_train, y_val):
    print(split_part.shape)

## Imputation

In [None]:
df.dtypes

In [None]:
#num_attr = list(df.select_dtypes(['int64', 'float64']).columns)
num_attr=['temp','humidity']
print(num_attr)
cat_attr = ['holiday', 'workingday', 'weather', 'datetime_year', 'datetime_month', 'datetime_hour']
print(cat_attr)

In [None]:
# Median-impute the numeric features. The imputer is fitted on the training
# split only and then applied to validation and test, so no statistics from
# held-out rows are used. Earlier cells note there are no nulls, so this is a safeguard.
imputer = SimpleImputer(strategy='median')

imputer = imputer.fit(X_train[num_attr])

X_train[num_attr] = imputer.transform(X_train[num_attr])
X_val[num_attr] = imputer.transform(X_val[num_attr])
test[num_attr] = imputer.transform(test[num_attr])

In [None]:
# Mode-impute the categorical features, again fitting on the training split only.
# This rebinds the name `imputer`, replacing the numeric imputer from the previous cell.
# NOTE(review): the transformed values are written back from the imputer's array
# output, so these columns presumably no longer carry the pandas 'category' dtype — confirm.
imputer = SimpleImputer(strategy='most_frequent')

imputer = imputer.fit(X_train[cat_attr])

X_train[cat_attr] = imputer.transform(X_train[cat_attr])
X_val[cat_attr] = imputer.transform(X_val[cat_attr])
test[cat_attr] = imputer.transform(test[cat_attr])

In [None]:
# Per-column preprocessing: standardise each numeric feature and one-hot encode
# each categorical feature. df_out=True asks the mapper for a DataFrame result.
# handle_unknown='ignore' encodes a level that was absent from the training
# split (e.g. a rare level that only occurs in validation/test rows) as all
# zeros instead of raising ValueError at transform time.
mapper = DataFrameMapper(
  [([continuous_col], StandardScaler()) for continuous_col in num_attr] +
  [([categorical_col], OneHotEncoder(handle_unknown='ignore')) for categorical_col in cat_attr]
, df_out=True)

In [None]:
mapper.fit(X_train)

X_train_final = mapper.transform(X_train)
X_val_final = mapper.transform(X_val)
test_final = mapper.transform(test)

In [None]:
X_train_final.head()

In [None]:
X_train_final.shape

In [None]:
X_train_final.columns

## Model Building

In [None]:
def plot(y_pred_val, y_val, title):
    """Overlay the density curves of predicted and actual validation targets.

    Parameters
    ----------
    y_pred_val : array-like
        Model predictions for the validation rows (drawn in red).
    y_val : array-like
        True target values for the same rows (drawn in blue).
    title : str
        Title placed above the figure.
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    # sns.distplot is deprecated; kdeplot draws the same KDE-only curve that
    # distplot(hist=False) produced.
    sns.kdeplot(y_pred_val, color='r', label='predicted validation data', ax=ax)
    sns.kdeplot(y_val, color='b', label='actual validation data', ax=ax)
    ax.set_title(title)
    ax.legend()  # make sure the two curve labels are actually rendered
    plt.show()

# Linear Regression

In [None]:
# Ordinary least squares baseline, fitted on the mapper output and evaluated on the validation split.
lr = LinearRegression()
# NOTE(review): X_train_final was already standardised / one-hot encoded by the
# mapper, so this extra StandardScaler step also rescales the one-hot columns —
# confirm that this is intended.
Input1 = [('scale', StandardScaler()), ('model', lr)]
pipe1 = Pipeline(Input1)
pipe1.fit(X_train_final, y_train)
y_pred_val_lr = pipe1.predict(X_val_final)

In [None]:
# r2_score expects (y_true, y_pred); the metric is not symmetric, so the
# ground truth has to be passed first.
lr_score = r2_score(y_val, y_pred_val_lr)
lr_residuals = y_pred_val_lr - y_val
print('The r^2 score for Linear Regression is {}'.format(lr_score))
print('The mean absolute error is {}'.format(np.mean(np.absolute(lr_residuals))))
# Square each residual before averaging; squaring the mean residual instead
# lets positive and negative errors cancel out.
print('Mean squared error : {}'.format(np.mean(lr_residuals ** 2)))

In [None]:
plot(y_pred_val_lr, y_val, 'Linear Regression')

# Lasso Regression

In [None]:
lasso_model = Lasso(alpha=0.4)
lasso_model.fit(X_train_final, y_train)
y_pred_val_lasso_reg = lasso_model.predict(X_val_final)

In [None]:
# r2_score expects (y_true, y_pred); the metric is not symmetric, so the
# ground truth has to be passed first.
lasso_score = r2_score(y_val, y_pred_val_lasso_reg)
lasso_residuals = y_pred_val_lasso_reg - y_val
print('The r^2 score for Lasso Regression is {}'.format(lasso_score))
print('The mean absolute error is {}'.format(np.mean(np.absolute(lasso_residuals))))
# Square each residual before averaging; squaring the mean residual instead
# lets positive and negative errors cancel out.
print('Mean squared error : {}'.format(np.mean(lasso_residuals ** 2)))

In [None]:
plot(y_pred_val_lasso_reg, y_val,'Lasso Regression')

# Ridge Regression

In [None]:
# 10-fold cross-validated grid search over the Ridge regularisation strength.
parameter = [{'alpha': [0.001, 0.1, 0.4, 10, 100, 1000, 10000, 100000]}]
rr = Ridge()
grid1 = GridSearchCV(rr, parameter, cv=10)
grid1.fit(X_train_final, y_train)
grid1.best_estimator_  # not the last expression in the cell, so this value is not displayed
scores = grid1.cv_results_
scores['mean_test_score']  # mean cross-validation score for each candidate alpha


In [None]:
# Use the regularisation strength selected by the grid search (grid1) rather
# than a hard-coded alpha, so the fitted model stays in sync with the tuning results.
ridge_model = Ridge(alpha=grid1.best_params_['alpha'])
ridge_model.fit(X_train_final, y_train)
y_pred_val_ridge_reg = ridge_model.predict(X_val_final)

In [None]:
# r2_score expects (y_true, y_pred); the metric is not symmetric, so the
# ground truth has to be passed first.
ridge_score = r2_score(y_val, y_pred_val_ridge_reg)
ridge_residuals = y_pred_val_ridge_reg - y_val
print('The r^2 score for Ridge Regression is {}'.format(ridge_score))
print('The mean absolute error is {}'.format(np.mean(np.absolute(ridge_residuals))))
# Square each residual before averaging; squaring the mean residual instead
# lets positive and negative errors cancel out.
print('Mean squared error : {}'.format(np.mean(ridge_residuals ** 2)))

In [None]:
plot(y_pred_val_ridge_reg, y_val, 'Ridge Regression')

# Decision Tree

In [None]:
bike_tree = DecisionTreeRegressor(random_state=1)
bike_tree.fit(X_train_final, y_train)
y_pred_val_decisiontree_reg = bike_tree.predict(X_val_final)


In [None]:
# r2_score expects (y_true, y_pred); the metric is not symmetric, so the
# ground truth has to be passed first.
dtr_score = r2_score(y_val, y_pred_val_decisiontree_reg)
dtr_residuals = y_pred_val_decisiontree_reg - y_val
# The model is a DecisionTreeRegressor, so the message says regressor, not classifier.
print('The r^2 score for Decision Tree Regressor is {}'.format(dtr_score))
print('The mean absolute error is {}'.format(np.mean(np.absolute(dtr_residuals))))
# Square each residual before averaging; squaring the mean residual instead
# lets positive and negative errors cancel out.
print('Mean squared error : {}'.format(np.mean(dtr_residuals ** 2)))

In [None]:
plot(y_pred_val_decisiontree_reg, y_val,'Decision Tree Regressor')

# Random Forest Regression (TODO: not implemented yet)

# XGB regressor

In [None]:

XGB_model = XGBRegressor(random_state=2)
XGB_model.fit(X_train_final, y_train)
y_pred_val_xgb_reg = XGB_model.predict(X_val_final)


In [None]:

# r2_score expects (y_true, y_pred); the metric is not symmetric, so the
# ground truth has to be passed first.
xgb_score = r2_score(y_val, y_pred_val_xgb_reg)
xgb_residuals = y_pred_val_xgb_reg - y_val
print('The r^2 score for XGBRegressor is {}'.format(xgb_score))
print('The mean absolute error is {}'.format(np.mean(np.absolute(xgb_residuals))))
# Square each residual before averaging; squaring the mean residual instead
# lets positive and negative errors cancel out.
print('Mean squared error : {}'.format(np.mean(xgb_residuals ** 2)))

In [None]:
plot(y_pred_val_xgb_reg, y_val, 'XGBRegressor')

# Predictions output

In [None]:
# Score the preprocessed test set with the fitted decision tree regressor (bike_tree).
y_pred_test = bike_tree.predict(test_final)
y_pred_test


In [None]:
data = {'datetime':final_pred, 'count':y_pred_test}
predictions = pd.DataFrame(data)
predictions.head()


In [None]:
predictions['count'].value_counts()

In [None]:
print(predictions)
# Write the submission file (datetime, count) without the DataFrame index column.
to_export=pd.DataFrame(predictions)
to_export.to_csv('submission.csv', index=False)