In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the raw bike-share dataset from the Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/bikeshare-data/bike_share.csv")

In [None]:
# Size of the dataset as a (rows, columns) tuple.
df.shape

In [None]:
# Preview the first 25 rows of the dataset.
df.head(25)

In [None]:
# Summarise the dataset: column names, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Count the missing (NaN) values in each column.
df.isna().sum()

In [None]:
# Count the rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# drop_duplicates() returns a new DataFrame and leaves `df` untouched, so the
# result must be assigned back for the de-duplication to take effect.
df = df.drop_duplicates(keep='first')

In [None]:
# Data preprocessing
# 1. Data cleaning
# List the column headings of the dataset.
df.columns

In [None]:
# Pick out the non-numeric features first; these are the ones that would need encoding.
non_numeric = df.select_dtypes(exclude=np.number)
df_category_columns = non_numeric.columns
df_category_columns

In [None]:
# Collect the names of the columns that hold numeric data, to check whether
# every feature is already numeric.
df_number_columns = df.select_dtypes(include = np.number).columns
df_number_columns

In [None]:
import seaborn as sns

In [None]:
# Annotated correlation heatmap of the numeric columns.
# Restricting to numeric dtypes keeps this working on pandas >= 2.0, where
# DataFrame.corr() raises on non-numeric columns instead of silently skipping them.
sns.heatmap(df.select_dtypes(include=np.number).corr(), annot=True)

In [None]:
# Data exploration: graphical overview of the pairwise relationships between columns.
sns.pairplot(df)

In [None]:
# Check for multicollinearity using the variance inflation factor (VIF).
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Independent variables: every numeric column except the target "count".
X = df.select_dtypes(include=np.number).drop(columns=["count"])

# NOTE(review): no intercept column is added to the design matrix here, so these
# look like uncentered VIFs -- confirm that is intended.
design = X.values

# One row per feature: its name and its VIF against the remaining features.
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [
        variance_inflation_factor(design, col_idx)
        for col_idx in range(design.shape[1])
    ],
})

print(vif_data)

In [None]:
# Drop the features judged to have high multicollinearity (temp and atemp).
# drop(..., errors="ignore") keeps this cell idempotent: `del df[...]` raises
# KeyError when the cell is re-run after the columns have already been removed.
df = df.drop(columns=["temp", "atemp"], errors="ignore")

In [None]:
# Re-check the (rows, columns) size after the column removal above.
df.shape

In [None]:
# Save the preprocessed data to a new CSV file in the working directory,
# without writing the row index as a column.
df.to_csv("bike_register_pred_preprocess.csv",index = False)

In [None]:
# Read the preprocessed data back from the CSV file into its own DataFrame.
df_preprocessed = pd.read_csv("bike_register_pred_preprocess.csv")

In [None]:
# Preview the first 20 rows of the preprocessed data.
df_preprocessed.head(20)

In [None]:
#now we create a train/test split to build and validate our model
import sklearn

from sklearn.model_selection import train_test_split

In [None]:
# Target (dependent variable): the "count" column.
y = df_preprocessed["count"]

# Inputs (independent variables): every remaining column.
# NOTE(review): if the data still contains `casual` and `registered` columns that
# sum to `count`, they leak the target into the inputs -- confirm against the schema.
X = df_preprocessed.drop("count", axis=1)

In [None]:
# Hold out 33% of the rows as the test set; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Show the shape of each split, in the order X_train, y_train, X_test, y_test.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

In [None]:
# Create a linear regression model with scikit-learn's default settings.
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [None]:
# Fit the model on the training split.
model.fit(X_train,y_train)

In [None]:
# Fitted coefficients, one per input feature.
model.coef_

In [None]:
# Fitted intercept term.
model.intercept_

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values; must be broadcastable against ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100`` over the entries whose
        true value is non-zero. NaN when no such entry exists (including
        empty input).

    Notes
    -----
    The percentage error is undefined where the true value is zero, and a
    single zero would otherwise turn the whole average into ``inf``/``nan``.
    Those entries are therefore left out of the mean.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Align shapes up front so the zero-target mask applies to both arrays.
    y_true, y_pred = np.broadcast_arrays(y_true, y_pred)

    nonzero = y_true != 0
    if not nonzero.any():
        return np.nan

    abs_pct_error = np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])
    return np.mean(abs_pct_error) * 100

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score

In [None]:
# Error metrics on the training data
pred_train = model.predict(X_train)

train_mae = mean_absolute_error(y_train, pred_train)
print("Mean Absolute Error of train data = ", train_mae)

train_rmse = np.sqrt(mean_squared_error(y_train, pred_train))
print("RMSE of train data = ", train_rmse)

# 10-fold cross-validation on the training split. The scorer returns negated RMSE
# values, so absolute values are taken before averaging across folds.
score = cross_val_score(model, X_train, y_train, scoring="neg_root_mean_squared_error", cv=10)
print("Cross validation Score  = ", np.mean(np.abs(score)))

train_mape = mean_absolute_percentage_error(y_train, pred_train)
print("Mean Absolute Percentage Error of train data = ", train_mape)

In [None]:
# Error metrics on the held-out test data
pred_test = model.predict(X_test)
# A count cannot be negative, so clip negative predictions to zero. Taking abs()
# instead would mirror e.g. -50 into +50, which is further from the nearest valid
# value than 0 is.
pred_test = np.clip(pred_test, 0, None)
print("Mean Absolute Error of test data = ",mean_absolute_error(y_test,pred_test))
print("RMSE of test data = ",np.sqrt(mean_squared_error(y_test,pred_test)))
# NOTE: cross_val_score clones and refits the estimator on folds of the data it is
# given, so this figure describes models trained on test-set folds -- it is not an
# evaluation of the model fitted on the training split. The RMSE printed above is
# the held-out estimate for that model.
score = cross_val_score(model, X_test, y_test, cv = 10, scoring = "neg_root_mean_squared_error" )
print("Cross validation Score (refit on test folds) = ",np.mean(np.abs(score)))
print("Mean Absolute Percentage Error of test data = ", mean_absolute_percentage_error(y_test,pred_test))