In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

#modelling and metrics libraries
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
#read the dataset
data = pd.read_csv("/kaggle/input/bikeshare-data/bike_share.csv")
data_orig = data.copy()

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data.head()

In [None]:
# Drop the 'casual' and 'registered' columns from the working frame.
# The result is derived from the untouched copy `data_orig` rather than by mutating
# `data` in place, so re-running this cell is idempotent instead of raising
# KeyError on columns that were already removed.
data = data_orig.drop(columns=["casual","registered"])
data.head()

In [None]:
#check for any null values
data.isna().sum()

In [None]:
#check for duplicate observations
data.duplicated().value_counts()

In [None]:
data[data.duplicated()]

In [None]:
#remove duplicate observations
data.drop_duplicates(inplace=True)
data.duplicated().value_counts()

In [None]:
data.nunique()

In [None]:
# Find columns that can be converted to dimensions: print the value counts of
# every column with at most 20 distinct values.
# A plain loop is used (instead of a list comprehension run for its side effects)
# so the cell does not build and echo a throwaway list of None values.
for col in data.columns:
    if data[col].nunique() <= 20:
        print(data[col].value_counts(), "\n")

In [None]:
def print_scatter(y_act,y_pred):
    """Scatter predicted values against actual values using pyplot.

    ``y_act`` goes on the x-axis and ``y_pred`` on the y-axis. The plot is drawn
    on whatever pyplot treats as the current axes; nothing is returned.
    """
    plt.scatter(y_act, y_pred)
    # Axis captions are fixed strings; setting them after plotting gives the same figure.
    # NOTE(review): the captions say "expenses" while this notebook models the
    # 'count' column -- confirm the intended wording.
    plt.ylabel("Predicted expenses")
    plt.xlabel("Actual expenses")

def print_rpt(y_act,y_pred,lbl=""):
    """Print an error report for a set of predictions and scatter-plot them.

    Prints RMSE, MAE, MAPE (in percent) and R2, each line prefixed with ``lbl``,
    framed by a row of '*' and a row of '-', then calls ``print_scatter``.

    Parameters
    ----------
    y_act : array-like
        Actual target values.
    y_pred : array-like
        Predicted target values; must hold the same number of elements as ``y_act``.
    lbl : str, optional
        Prefix printed in front of every metric line (e.g. 'Train').

    Notes
    -----
    MAPE divides by the actual values, so it is not finite where ``y_act`` is 0.
    """
    # Flatten both inputs before the element-wise MAPE arithmetic. Without this,
    # a 1-D target (n,) paired with an (n, 1) prediction array would broadcast to
    # an (n, n) matrix and silently produce a wrong MAPE. For inputs that already
    # share a shape the result is unchanged.
    actual = np.asarray(y_act).ravel()
    predicted = np.asarray(y_pred).ravel()
    mape = np.mean(np.abs(actual - predicted) / actual) * 100

    print('*'*20)
    print(lbl,"RMSE:",np.sqrt(mean_squared_error(y_act,y_pred)))
    print(lbl,"MAE :",mean_absolute_error(y_act,y_pred))
    print(lbl,"MAPE:",mape)
    print(lbl,"R2  :",r2_score(y_act,y_pred))
    print('-'*20)
    print_scatter(y_act,y_pred)
    
def predict_compute(X,y,test_size=0.3,random_state=8):
    """Split the data, fit a LinearRegression on the training part and report metrics.

    The first rows of ``X`` and ``y`` are displayed, the data is split with
    ``train_test_split``, a ``LinearRegression`` is fitted on the training split,
    its coefficients are printed, and ``print_rpt`` is called for the training
    and the test predictions.

    Parameters
    ----------
    X : feature table supporting ``.head()`` (e.g. a pandas DataFrame).
    y : target supporting ``.head()``.
    test_size : forwarded to ``train_test_split``; defaults to 0.3, the value
        that was previously hard-coded.
    random_state : forwarded to ``train_test_split``; defaults to 8, the value
        that was previously hard-coded.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_train_pred, y_test, y_test_pred)``
    """
    # `display` is not imported in this file; presumably the IPython built-in
    # available inside a notebook kernel.
    display(X.head(),y.head())
    X_train,X_test,y_train,y_test = train_test_split(
        X,y,test_size=test_size,random_state=random_state)
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

    model = LinearRegression()
    model.fit(X_train,y_train)
    print('Slope:',model.coef_,"\nIntercept",model.intercept_)

    # Report on the data the model was fitted on ...
    y_train_pred = model.predict(X_train)
    print_rpt(y_train,y_train_pred,'Train')

    # ... and on the held-out split.
    y_test_pred = model.predict(X_test)
    print_rpt(y_test,y_test_pred,'Test ')

    return X_train,X_test,y_train,y_train_pred,y_test,y_test_pred

In [None]:
#Script to find the outliers
def find_outlier(cols=None):
    """Print IQR-rule outliers for columns of the notebook-level ``data`` frame.

    For each column with more than two distinct values, values below
    ``Q1 - 1.5*IQR`` or above ``Q3 + 1.5*IQR`` are counted and printed together
    with their share of all rows.

    Parameters
    ----------
    cols : iterable of column names, optional
        Columns to inspect. When omitted, every numeric column of ``data``
        except the last one is used (presumably the last numeric column is the
        target -- confirm).
    """
    if cols is None:
        # Resolve the default at call time. A default written in the signature is
        # evaluated once, when the function is defined, and would keep referring to
        # the columns `data` had at that moment even after `data` is changed.
        cols = data.select_dtypes(include=np.number).columns[:-1]

    tot_obs = data.shape[0]
    for col_name in cols:
        # Columns with at most two distinct values are skipped.
        if data[col_name].nunique() <= 2:
            continue

        print("*"*10)
        print(col_name)
        q1 = data[col_name].quantile(0.25)
        q3 = data[col_name].quantile(0.75)
        iqr = q3 - q1

        low = q1-1.5*iqr
        high = q3+1.5*iqr
        out = data.loc[(data[col_name] < low) | (data[col_name] > high), col_name]
        cnt_out = out.count()
        per = round((cnt_out/tot_obs)*100,2)
        print("Outliers = ",cnt_out,"out of",tot_obs,"observations ->",per,"%\n",out)

In [None]:
# calculating VIF for each feature
def calc_VIF(X):
    """Print the variance inflation factor (VIF) of every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; its ``.values`` array is passed to statsmodels unchanged.

    Returns
    -------
    None
        The feature/VIF table is printed, not returned.
    """
    # Imported here so the function does not depend on the later notebook cell
    # that imports statsmodels having been executed first.
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    # NOTE(review): no constant/intercept column is added to X before computing
    # the VIFs -- confirm this is intended.
    design = X.values  # convert once instead of once per column
    vif_data = pd.DataFrame({
        "feature": X.columns,
        "VIF": [variance_inflation_factor(design, i)
                for i in range(design.shape[1])],
    })

    print(vif_data)

In [None]:
find_outlier()

In [None]:
data.describe().T

In [None]:
data.corr()

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of `data`.
corr_matrix = data.corr()
fig, ax = plt.subplots(figsize=(12,5))
sns.heatmap(corr_matrix, annot=True, ax=ax)
plt.show()

In [None]:
# Box plots of every column of `data` except 'count'.
features_only = data.drop(columns=["count"])
fig, ax = plt.subplots(figsize=(12,5))
features_only.boxplot(ax=ax)
plt.show()

In [None]:
sns.pairplot(data)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# VIF table for all features, then again with 'atemp' left out.
target_col = "count"

X = data.drop(columns=[target_col])
calc_VIF(X)

Z = data.drop(columns=["atemp", target_col])
calc_VIF(Z)

In [None]:
# Fit a linear regression on the complete data set (no hold-out split) and
# print in-sample error metrics.
y = data[['count']]
X = data.drop(columns=['count'])

model = LinearRegression()
model.fit(X,y)
print('Slope:',model.coef_,"\nIntercept",model.intercept_)
y_pred = model.predict(X)

# Element-wise absolute percentage error, averaged below for the MAPE line.
y_true_arr = np.array(y)
abs_pct_err = abs(y_true_arr - np.array(y_pred)) / y_true_arr
rmse = np.sqrt(mean_squared_error(y,y_pred))

print("RMSE:",rmse)
print("MAE :",mean_absolute_error(y,y_pred))
print("MAPE:",np.mean(abs_pct_err)*100)
print("R2  :",r2_score(y,y_pred))


In [None]:
plt.xlabel("Actual expenses")
plt.ylabel("Predicted expenses")
plt.scatter(y,y_pred)

In [None]:
error = y-y_pred
sns.displot(error)

In [None]:
plt.scatter( y_pred, error)

In [None]:
X = data.drop(columns='count')
y = data[['count']]

In [None]:
#cross validation
kf = KFold(n_splits=10)

In [None]:
data.shape, X.shape, y.shape, type(X), type(y)

In [None]:
X

In [None]:

# Manual cross-validation over the folds produced by the splitter `kf`:
# fit a fresh LinearRegression per fold, print train/test RMSE and R2, and
# summarise the test scores (mean, std, mean +/- std) afterwards.
test_result = []
test_result_r2 = []
i = 0
for i, (train_index, test_index) in enumerate(kf.split(X), start=1):
    train_X, test_X = X.iloc[train_index], X.iloc[test_index]
    train_y, test_y = y.iloc[train_index], y.iloc[test_index]

    model = LinearRegression()
    model.fit(train_X, train_y)
    train_pred = model.predict(train_X)
    test_pred = model.predict(test_X)

    train_rmse = np.sqrt(mean_squared_error(train_y, train_pred))
    train_r2 = r2_score(train_y,train_pred)
    test_rmse = np.sqrt(mean_squared_error(test_y, test_pred))
    test_r2 = r2_score(test_y,test_pred)

    print(i,"Train_RMSE =",train_rmse,"R2 =",train_r2,
          "Test RMSE  =",test_rmse, "R2 =",test_r2)

    test_result.append(test_rmse)
    test_result_r2.append(test_r2)

# Summary of the per-fold test RMSE values.
rmse_mean, rmse_std = np.mean(test_result), np.std(test_result)
print("\nRMSE Mean:",rmse_mean,"STD:",rmse_std,
      "Range:",rmse_mean-rmse_std,"-",rmse_mean+rmse_std)

# Summary of the per-fold test R2 values.
r2_mean, r2_std = np.mean(test_result_r2), np.std(test_result_r2)
print("R2   Mean:",r2_mean,"STD:",r2_std,
      "Range:",r2_mean-r2_std,"-",r2_mean+r2_std)

In [None]:
# Cross-validated RMSE via cross_val_score (cv=10). The scorer is
# 'neg_root_mean_squared_error', so absolute values are taken before averaging.
model = LinearRegression()
cv_scores = cross_val_score(model,X,y,scoring='neg_root_mean_squared_error',cv=10)
np.abs(cv_scores).mean()

In [None]:
X_train,X_test,y_train,y_train_pred,y_test,y_test_pred = predict_compute(X,y)

In [None]:
# Compare the RMSE on the training split with the RMSE on the test split.
# `act` holds the training RMSE and `pred` the test RMSE (names kept as they were);
# the printed labels now say Train/Test instead of the misleading Actual/Predicted.
act = np.sqrt(mean_squared_error(y_train, y_train_pred))
pred = np.sqrt(mean_squared_error(y_test, y_test_pred))
# "Diff" is the absolute gap between the two, as a percentage of the training RMSE.
print("RMSE Train:",act,"Test:",pred,"Diff:", (np.abs(act-pred)/act)*100)

# R2 on the training split, with n = number of training rows and p = number of features.
r2 = r2_score(y_train,y_train_pred)
n,p = X_train.shape[0], X_train.shape[1]
print("R2 :",r2," X-train shape =",n,p)

# Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)
adjr2 = 1-(((1-r2)*(n-1))/(n-p-1))
print("Adj-R2:",adjr2)