<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Libraries" data-toc-modified-id="Libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Libraries</a></span></li><li><span><a href="#Load-Datasets" data-toc-modified-id="Load-Datasets-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load Datasets</a></span></li></ul></div>

## Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import skew, boxcox_normmax
from scipy.special import boxcox1p




from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import cross_validate, cross_val_score, \
        train_test_split, learning_curve, validation_curve
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, ShuffleSplit
from statsmodels.api import add_constant

## Load Datasets

In [None]:
# Hourly bike-sharing records, fetched from the public GitHub copy of the dataset.
hours_csv_url = "https://raw.githubusercontent.com/ashomah/Bike-Sharing-in-Washington/master/Bike-Sharing-Dataset/hour.csv"
hours_df = pd.read_csv(hours_csv_url)
hours_df.info()

In [None]:
# Daily bike-sharing records, fetched from the public GitHub copy of the dataset.
days_csv_url = "https://raw.githubusercontent.com/ashomah/Bike-Sharing-in-Washington/master/Bike-Sharing-Dataset/day.csv"
days_df = pd.read_csv(days_csv_url)
days_df.info()

In [None]:
########################### HOURS DATASET ###########################
# Map the terse source column names onto readable ones.
hours_column_names = {
    'instant': 'id',
    'dteday': 'date',
    'weathersit': 'weather_condition',
    'hum': 'humidity',
    'mnth': 'month',
    'cnt': 'total_bikes',
    'hr': 'hour',
    'yr': 'year',
    'temp': 'actual_temp',
    'atemp': 'feeling_temp',
}
hours_df.rename(columns=hours_column_names, inplace=True)

# Parse the date strings (YYYY-MM-DD) into timestamps.
hours_df['date'] = pd.to_datetime(hours_df['date'], format='%Y-%m-%d')

# Coded fields are stored as pandas categoricals.
hours_categoricals = ['season', 'holiday', 'weekday', 'workingday',
                      'weather_condition', 'month', 'year', 'hour']
for name in hours_categoricals:
    hours_df[name] = hours_df[name].astype('category')
    
########################### DAYS DATASET ###########################
# Renaming columns names to more readable names
days_df.rename(columns={'instant':'id',
                        'dteday':'date',
                        'weathersit':'weather_condition',
                        'hum':'humidity',
                        'mnth':'month',
                        'cnt':'total_bikes',
                        'yr':'year',
                        'temp':'actual_temp',
                        'atemp':'feeling_temp'},inplace=True)
###########################
# Setting proper data types
###########################
# date time conversion
days_df.date = pd.to_datetime(days_df.date, format='%Y-%m-%d')
# categorical variables
# These conversions must target days_df: converting hours_df here (as the
# cell previously did) left every coded column of days_df as a plain integer,
# so the daily frame had no categorical columns at all.
for column in ['season', 'holiday', 'weekday', 'workingday', 'weather_condition','month', 'year']:
    days_df[column] = days_df[column].astype('category')
    
#####################################################################
# Text preview (first rows, then summary statistics) of each frame,
# every section introduced by its banner line.
print('################################# HOURS DATASET #############################################',
      hours_df.head(), sep='\n')
print('################################# HOURS DATASET #############################################',
      hours_df.describe(), sep='\n')
print('################################# DAYS DATASET #############################################',
      days_df.head(), sep='\n')
print('################################# DAYS DATASET #############################################',
      days_df.describe(), sep='\n')



In [None]:
# Distribution plot of the daily humidity readings.
sns.distplot(
    days_df['humidity'],
    bins=100,
    color='g',
    hist_kws={'alpha': 0.4},
);


In [None]:
# Keep only the float64/int64 columns, then draw one histogram per column.
days_df_num = days_df.select_dtypes(include=['float64', 'int64'])
days_df_num.hist(
    bins=50,
    figsize=(10, 14),
    xlabelsize=3,
    ylabelsize=3,
    color='y',
);

In [None]:
# Correlation of every numeric column with the target. The target's own
# self-correlation is removed by label, so the result no longer depends on
# total_bikes happening to be the last column.
days_num_corr = days_df_num.corr()['total_bikes'].drop('total_bikes')
# Keep only the columns whose absolute correlation with the target exceeds 0.5.
golden_features_list = days_num_corr[abs(days_num_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Total Bikes:\n{}".format(len(golden_features_list), golden_features_list))

In [None]:
# Scatter the numeric columns against the target, five columns per figure.
for start in range(0, len(days_df_num.columns), 5):
    sns.pairplot(
        data=days_df_num,
        x_vars=days_df_num.columns[start:start + 5],
        y_vars=['total_bikes'],
    )

In [None]:
# Pairwise correlations among the numeric columns, with the target left out.
corr = days_df_num.drop(columns='total_bikes').corr()
plt.figure(figsize=(12, 10))

# Only cells with correlation >= 0.5 or <= -0.4 are kept; the rest become NaN
# and are left blank in the heatmap.
sns.heatmap(
    corr.where((corr >= 0.5) | (corr <= -0.4)),
    cmap='viridis',
    vmin=-1.0,
    vmax=1.0,
    linewidths=0.1,
    annot=True,
    annot_kws={"size": 8},
    square=True,
);

In [None]:
# Distribution plot of the hourly humidity readings.
sns.distplot(
    hours_df['humidity'],
    bins=100,
    color='g',
    hist_kws={'alpha': 0.4},
);

In [None]:
# Keep only the float64/int64 columns, then draw one histogram per column.
hours_df_num = hours_df.select_dtypes(include=['float64', 'int64'])
hours_df_num.hist(
    bins=50,
    figsize=(10, 14),
    xlabelsize=3,
    ylabelsize=3,
    color='y',
);

In [None]:
# Correlation of every numeric column with the target. The target's own
# self-correlation is removed by label, so the result no longer depends on
# total_bikes happening to be the last column.
hours_num_corr = hours_df_num.corr()['total_bikes'].drop('total_bikes')
# Keep only the columns whose absolute correlation with the target exceeds 0.5.
golden_features_list_1 = hours_num_corr[abs(hours_num_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Total Bikes:\n{}".format(len(golden_features_list_1), golden_features_list_1))

In [None]:
# Scatter the numeric columns against the target, five columns per figure.
for start in range(0, len(hours_df_num.columns), 5):
    sns.pairplot(
        data=hours_df_num,
        x_vars=hours_df_num.columns[start:start + 5],
        y_vars=['total_bikes'],
    )

In [None]:
# Pairwise correlations among the numeric columns, with the target left out.
corr_1 = hours_df_num.drop(columns='total_bikes').corr()
plt.figure(figsize=(12, 10))

# Only cells with correlation >= 0.5 or <= -0.4 are kept; the rest become NaN
# and are left blank in the heatmap.
sns.heatmap(
    corr_1.where((corr_1 >= 0.5) | (corr_1 <= -0.4)),
    cmap='viridis',
    vmin=-1.0,
    vmax=1.0,
    linewidths=0.1,
    annot=True,
    annot_kws={"size": 8},
    square=True,
);

In [None]:
# Sub-frame holding only the categorical columns of the hourly data.
hours_df_cat = hours_df.select_dtypes(include='category')
hours_df_cat.head(n=5)

In [None]:
# Hourly rentals broken down by weather condition.
plt.figure(figsize=(11, 6))
ax = sns.boxplot(data=hours_df, x='weather_condition', y='total_bikes')
plt.setp(ax.artists, alpha=1, linewidth=0.5, edgecolor="k")
ax.set_xlabel('Weather Condition')
ax.set_ylabel('Total Bikes')


In [None]:
# Hourly rentals on working days versus non-working days.
plt.figure(figsize=(10, 6))
ax = sns.boxplot(data=hours_df, x='workingday', y='total_bikes')
plt.setp(ax.artists, alpha=1, linewidth=0.5, edgecolor="k")
ax.set_xlabel('Working Day')
ax.set_ylabel('Total Bikes')

In [None]:
# Hourly rentals broken down by day of the week.
plt.figure(figsize=(10, 6))
ax = sns.boxplot(data=hours_df, x='weekday', y='total_bikes')
# Keyword order is kept as written: `color` is applied after `edgecolor`.
plt.setp(ax.artists, alpha=1, linewidth=0.5, edgecolor="k", color='y')
ax.set_xlabel('Week Day')
ax.set_ylabel('Total Bikes')

In [None]:
# Hourly rentals on holidays versus regular days.
plt.figure(figsize=(10, 6))
ax = sns.boxplot(data=hours_df, x='holiday', y='total_bikes')
plt.setp(ax.artists, alpha=1, linewidth=0.5, edgecolor="k")
ax.set_xlabel('Holidays')
ax.set_ylabel('Total Bikes')

In [None]:
# One count plot per categorical column, laid out on a two-column grid.
n_cat_columns = len(hours_df_cat.columns)
# Ceiling division: round() rounds halves to even (round(2.5) == 2), which
# would allocate too few rows for some odd column counts.
n_grid_rows = max(1, -(-n_cat_columns // 2))
fig, axes = plt.subplots(n_grid_rows, 2, figsize=(16,10))

for i, ax in enumerate(fig.axes):
    if i < n_cat_columns:
        sns.countplot(x=hours_df_cat.columns[i], alpha=0.7, data=hours_df, ax=ax, color = 'y')
        # Set the label rotation after plotting, so the tick labels produced by
        # countplot are the ones being styled (not those of an empty axes).
        ax.tick_params(axis='x', labelrotation=0)
        ax.set(ylabel=' ')
    else:
        # Spare grid cell (odd number of columns): hide the empty axes.
        ax.set_visible(False)

fig.tight_layout()

In [None]:
# Missing-value check for the hourly frame. Only the per-column counts on the
# last line are displayed; the boolean on the first line is evaluated and discarded.
hours_df.isna().values.any()
hours_df.isna().sum()


In [None]:
# Missing-value check for the daily frame. Only the per-column counts on the
# last line are displayed; the boolean on the first line is evaluated and discarded.
days_df.isna().values.any()
days_df.isna().sum()

In [None]:
# First five rows of the daily frame.
days_df.head(n=5)

In [None]:
def feature_skewness(df):
    """Measure the skewness of every numeric column of ``df``.

    A column counts as numeric when its dtype is one of int16/32/64 or
    float16/32/64.

    Returns
    -------
    tuple
        ``(feature_skew, numeric_features)`` where ``feature_skew`` is a Series
        of per-column skewness sorted from highest to lowest, and
        ``numeric_features`` is the list of numeric column names in the order
        they appear in ``df``.
    """
    numeric_dtypes = ('int16', 'int32', 'int64',
                      'float16', 'float32', 'float64')
    numeric_features = [name for name in df.columns
                        if df[name].dtype in numeric_dtypes]

    per_column_skew = df[numeric_features].apply(skew)
    feature_skew = per_column_skew.sort_values(ascending=False)
    return feature_skew, numeric_features

In [None]:
def fix_skewness(df, threshold=0.5):
    """Box-Cox transform the positively skewed numeric columns of ``df`` in place.

    Skewness is measured with ``feature_skewness``. Every numeric column whose
    skewness is greater than ``threshold`` is replaced by
    ``boxcox1p(column, lmbda)``, with ``lmbda`` estimated by
    ``boxcox_normmax(column + 1)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform. It is modified in place, so calling this twice on
        the same frame transforms the already-transformed columns again.
    threshold : float, default 0.5
        Columns with skewness strictly above this value are transformed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the transformation.
    """
    feature_skew, _ = feature_skewness(df)
    high_skew = feature_skew[feature_skew > threshold]

    for column in high_skew.index:
        # The +1 shift mirrors boxcox1p, which transforms 1 + x.
        df[column] = boxcox1p(df[column], boxcox_normmax(df[column] + 1))

    return df

In [None]:
# fix_skewness overwrites the skewed columns of hours_df in place and returns
# the same frame, which is what this cell displays. Re-running the cell
# transforms the already-transformed columns again.
fix_skewness(hours_df)


In [None]:
# fix_skewness overwrites the skewed columns of days_df in place and returns
# the same frame, which is what this cell displays. Re-running the cell
# transforms the already-transformed columns again.
fix_skewness(days_df)

In [None]:
def date_features(df):
    """Return the column labels of ``df`` whose dtype is ``np.datetime64``."""
    datetime_part = df.select_dtypes(include=[np.datetime64])
    return datetime_part.columns

def numerical_features(df):
    """Return the column labels of the numeric columns of ``df``."""
    numeric_part = df._get_numeric_data()
    return numeric_part.columns

def categorical_features(df):
    """Return the names of the columns that are neither numeric nor datetime.

    The names are returned as a list in the order the columns appear in
    ``df``. (Building the list from a set difference gave an arbitrary order,
    so the layout of the one-hot encoded frame could differ between runs.)
    """
    excluded = set(numerical_features(df)) | set(date_features(df))
    return [column for column in df.columns if column not in excluded]

def onehot_encode(df):
    """Return a new frame: numeric columns of ``df`` plus dummy columns.

    Each column reported by ``categorical_features`` is expanded with
    ``pd.get_dummies`` (prefixed by the column name) and appended to the right
    of a copy of the numeric columns. Columns that are neither numeric nor
    categorical (e.g. datetimes) are not carried over. ``df`` is not modified.
    """
    pieces = [df.get(numerical_features(df)).copy()]
    pieces.extend(
        pd.get_dummies(df[name], prefix=name)
        for name in categorical_features(df)
    )
    return pd.concat(pieces, axis=1)

In [None]:
# onehot_encode builds and returns a new frame; the result is only displayed
# here, not assigned, so hours_df itself is left unchanged.
# NOTE(review): later cells keep using hours_df — confirm whether the encoded
# frame was meant to be stored and used for modelling.
onehot_encode(hours_df)

In [None]:
# onehot_encode builds and returns a new frame; the result is only displayed
# here, not assigned, so days_df itself is left unchanged.
# NOTE(review): confirm whether the encoded frame was meant to be stored.
onehot_encode(days_df)

In [None]:
def remove_outliers(df, alpha=1e-3):
    """Return ``df`` without the rows an OLS outlier test flags as outliers.

    ``total_bikes`` is regressed on every other column except ``date``; a row
    is treated as an outlier when its Bonferroni-corrected p-value
    (``bonf(p)`` from ``outlier_test``) is below ``alpha``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``total_bikes`` and ``date``; all remaining columns must
        be convertible to float.
    alpha : float, default 1e-3
        Rows with ``bonf(p) < alpha`` are removed.

    Returns
    -------
    pandas.DataFrame
        A new frame with the flagged rows removed. ``df`` itself is not
        modified, so the caller must keep the return value.
    """
    # x and y keep df's own index so they stay aligned for any index, not only
    # a default RangeIndex.
    x = df.drop(columns=['total_bikes', 'date'])
    y = df['total_bikes']
    fit = sm.OLS(endog=y.astype(float), exog=x.astype(float)).fit()

    # Convert to a plain array so rows are filtered by position.
    bonf_p = np.asarray(fit.outlier_test()['bonf(p)'])
    is_outlier = bonf_p < alpha

    # The original discarded the result of df.drop(...), so nothing was removed.
    return df.loc[~is_outlier].copy()

In [None]:
# Bind the returned frame back to hours_df so the following cells work on
# whatever remove_outliers produced, and show the resulting shape instead of
# dumping the whole frame.
hours_df = remove_outliers(hours_df)
hours_df.shape


In [None]:

# Features: every column except the date and the target itself. Leaving
# total_bikes inside X let the model read the answer straight from its inputs.
# NOTE(review): `casual` and `registered` presumably add up to the target in
# this dataset — confirm, and consider dropping them from X as well.
X = hours_df.drop(columns=['date', 'total_bikes'])
y = hours_df[['total_bikes']]

# 80/20 train/test split, then a further 80/20 train/validation split.
# X_val / y_val are not used further down in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=25)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=12)

# Ordinary least squares fit, evaluated on the held-out test split.
lm = linear_model.LinearRegression()
lm.fit(X_train, y_train)
y_pred = lm.predict(X_test)

print('Intercept:', lm.intercept_)
print('Coefficients:', lm.coef_)
print('Mean squared error (MSE): {:.2f}'.format(mean_squared_error(y_test, y_pred)))
print('Variance score (R2): {:.2f}'.format(r2_score(y_test, y_pred)))