#### '''This script pulls in data, then builds and tests a predictive model.'''


#### __author__ = 'Sreetam Kumar Dev'
#### __email__ = 'sreetamkumardev@gmail.com'


In [None]:
# Importing libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error 
%matplotlib inline

# Module metadata: author name and contact e-mail address.
__author__ = "sreetam dev"
__email__  = "sreetamkumardev@gmail.com"

In [None]:
def loading_file_df(file):
    '''Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame holding the parsed contents of ``file``.
    '''
    return pd.read_csv(file)

def clean_data(df):
    '''Return ``df`` with fully duplicated rows removed.

    ``DataFrame.drop_duplicates`` is not an in-place operation by default:
    it returns a new frame. The result must therefore be returned to the
    caller, otherwise the duplicates silently remain in the data.

    Parameters
    ----------
    df : pandas.DataFrame to de-duplicate. It is not modified.

    Returns
    -------
    pandas.DataFrame containing the first occurrence of every distinct row,
    keeping the original index labels of the retained rows.
    '''
    return df.drop_duplicates()

def string_datetime(feature, df):
    '''Convert one column of ``df`` to pandas datetimes, in place.

    Parameters
    ----------
    feature : label of the column to convert (e.g. ``"dteday"``).
    df      : pandas.DataFrame that owns the column; it is mutated.

    Returns
    -------
    The same ``df`` object, with ``df[feature]`` replaced by the result of
    ``pd.to_datetime``.
    '''
    parsed_column = pd.to_datetime(df[feature])
    df[feature] = parsed_column
    return df

def handling_skewness(feature, df):
    '''Apply a ``log(x + 1)`` transform to one column of ``df``, in place.

    Call once per skewed feature; n skewed features need n calls.

    Parameters
    ----------
    feature : label of the column to transform.
    df      : pandas.DataFrame that owns the column; it is mutated.

    Returns
    -------
    The same ``df`` object with ``df[feature]`` overwritten by the
    transformed values.
    '''
    # The +1 shift keeps zero entries finite (log(1) == 0).
    shifted = df[feature] + 1
    df[feature] = np.log(shifted)
    return df
    
def scaling_values(nr_list_features, df):
    '''Min-max normalise the given numerical columns of ``df``, in place.

    Parameters
    ----------
    nr_list_features : list (or index) of the numerical column labels
                       that have to be scaled.
    df               : pandas.DataFrame that owns the columns; it is mutated.

    Returns
    -------
    The same ``df`` object with the selected columns replaced by the
    output of ``MinMaxScaler().fit_transform``.
    '''
    # A fresh scaler is fitted on, and applied to, the selected columns only.
    normalised = MinMaxScaler().fit_transform(df[nr_list_features])
    df[nr_list_features] = normalised
    return df

def checking_statistics(x_features, y_tar, df):
    '''Fit an ordinary-least-squares model and return its summary.

    The target column is read from the ``df`` argument rather than from a
    notebook-level global, so the function works on whichever frame the
    caller passes in.

    Parameters
    ----------
    x_features : pandas.DataFrame of explanatory variables; a constant
                 (intercept) column is added before fitting.
    y_tar      : label of the target column inside ``df``.
    df         : pandas.DataFrame that contains the target column.

    Returns
    -------
    The statsmodels summary object produced by ``OLS(...).fit().summary()``.
    '''
    # statsmodels does not add an intercept on its own, hence add_constant.
    design_matrix = sm.add_constant(x_features)
    target = df[y_tar]
    fitted_model = sm.OLS(target, design_matrix).fit()
    return fitted_model.summary()

def checking_multicollinearity(list_feat_to_drop, df):
    '''Compute the variance inflation factor (VIF) of every kept column.

    Parameters
    ----------
    list_feat_to_drop : column labels to exclude before computing VIFs.
    df                : pandas.DataFrame holding the candidate features.

    Returns
    -------
    (vif_rank, df_vif) where ``df_vif`` is ``df`` without the dropped
    columns plus a ``const`` column of ones, and ``vif_rank`` is a
    pandas.Series of VIF values indexed by the columns of ``df_vif``.
    '''
    df_vif = df.drop(list_feat_to_drop, axis=1).assign(const=1)
    design_values = df_vif.values

    # One VIF per column position, in column order.
    vif_values = []
    for column_position in range(df_vif.shape[1]):
        vif_values.append(variance_inflation_factor(design_values, column_position))

    vif_rank = pd.Series(vif_values, index=df_vif.columns)
    return vif_rank, df_vif

def get_features(drop_list_features, y_tar, df):
    '''Split ``df`` into a feature frame X and a target frame y.

    Parameters
    ----------
    drop_list_features : column labels to leave out of X.
    y_tar              : label of the target column.
    df                 : source pandas.DataFrame; it is not modified.

    Returns
    -------
    (X, y) where X is ``df`` without the dropped columns and y is a
    single-column DataFrame (not a Series) holding the target.
    '''
    feature_frame = df.drop(drop_list_features, axis=1)
    # Double brackets keep the target two-dimensional (a DataFrame).
    target_frame = df[[y_tar]]
    return feature_frame, target_frame

def model_train(X, y):
    '''Fit a linear regression on an 80/20 split and predict the held-out part.

    Parameters
    ----------
    X : feature data accepted by ``train_test_split`` / ``LinearRegression``.
    y : target data aligned with ``X``.

    Returns
    -------
    (y_test, intercept, coefficient, predictions): the held-out targets,
    the fitted intercept (B0), the fitted coefficients (B1..Bn) and the
    model's predictions for the held-out features.
    '''
    # Fixed random_state makes the split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )

    # fit() returns the estimator itself, so construction and fitting chain.
    regressor = LinearRegression().fit(X_train, y_train)

    return y_test, regressor.intercept_, regressor.coef_, regressor.predict(X_test)

def model_val(y_test, predictions):
    '''Score predictions against the held-out targets.

    Parameters
    ----------
    y_test      : true target values.
    predictions : model output for the same rows.

    Returns
    -------
    A one-row pandas.DataFrame with the columns ``Model``, ``R Squared``,
    ``RMSE`` and ``MAE``. The three metrics are stored as strings
    formatted to three decimal places.
    '''
    # RMSE is the square root of the mean squared error.
    root_mse = np.sqrt(mean_squared_error(y_test, predictions))

    scores = {
        'Model': ['Multiple'],
        'R Squared': [f"{r2_score(y_test, predictions):.3f}"],
        'RMSE': [f"{root_mse:.3f}"],
        'MAE': [f"{mean_absolute_error(y_test, predictions):.3f}"],
    }
    return pd.DataFrame(scores)







In [None]:
# Driver cell: runs the helper functions in sequence on the bike-sharing
# data. Each step rebinds df_bike_sharing_day, so the order of the steps matters.

'''1.stores the required file as a dataframe'''
# "day.csv" is a relative path, so it is resolved against the working directory.
df_bike_sharing_day = loading_file_df("day.csv")


'''2.removes duplicate instances'''
# Passes the frame through clean_data, which is intended to drop duplicate rows.
df_bike_sharing_day = clean_data(df_bike_sharing_day)


'''3.converting feature to datetime'''
df_bike_sharing_day = string_datetime("dteday",df_bike_sharing_day)

'''4. handling skewness'''
# Replaces 'windspeed' with log(windspeed + 1).
df_bike_sharing_day = handling_skewness('windspeed',df_bike_sharing_day)

'''5.Scaling numerical features'''
# describe(include=[np.number]) selects every numeric column.
# NOTE(review): this presumably includes 'instant', 'casual', 'registered' and
# the target 'cnt', so those get min-max scaled as well -- confirm this is intended.
nr_list_features = df_bike_sharing_day.describe(include=[np.number]).columns
df_bike_sharing_day = scaling_values(nr_list_features,df_bike_sharing_day)

'''6.Statistical Summary'''
# OLS summary of 'cnt' against every column except the ones dropped here.
x_features = df_bike_sharing_day.drop(['instant','dteday','casual','registered','cnt'],axis = 1)
lm_sum = checking_statistics(x_features,'cnt',df_bike_sharing_day)

'''7.inspecting multicollinearity'''
# Same exclusion list as step 6; VIFs are computed on the remaining columns.
list_feat_drop = ['instant','dteday','casual','registered','cnt']
vif_rank, df_bike_sharing_day_vif  = checking_multicollinearity(list_feat_drop,df_bike_sharing_day)


'''8.Fetching feature and target dataframes as X and y'''

# Drops 'season', 'atemp', 'hum' and 'windspeed' in addition to the step 6/7 list.
# NOTE(review): presumably chosen from the OLS summary / VIF results above -- confirm.
drop_list_features = ['season','atemp','instant','dteday','casual','registered','cnt','hum','windspeed']
X_feat ,y_tar = get_features(drop_list_features,'cnt',df_bike_sharing_day)
    
'''9.Model training and prediction for count of bikes'''
# model_train does an 80/20 split (random_state=42) and fits a LinearRegression.
y_test, intercept, coefficient, predictions = model_train(X_feat ,y_tar)


'''10.Making predictions'''
# Despite the label, this step scores the predictions (R squared, RMSE, MAE).
result = model_val(y_test, predictions)
