# Tanguy Pellerin

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from numpy.linalg import inv

In [3]:
# Load the raw bike-rental table and remove rows with missing values.
# `dropna()` returns a new frame rather than modifying in place, so its result
# has to be kept; the index is rebuilt so it stays contiguous after any drop.
df = (
    pd.read_csv("Bike Rental UCI dataset.csv")
    .dropna()
    .reset_index(drop=True)
)
# Show only a preview instead of dumping the whole frame.
df.head()

FileNotFoundError: File b'./Bike-Sharing-Dataset/Bike Rental UCI dataset.csv' does not exist

In [None]:
# Summary statistics of the loaded frame's columns, as a quick sanity check.
df.describe()

# Feature Engineering

In [None]:
# Regression target.
y = df["demand"]

# Design matrix: remove the target, replace `dayOfWeek` with its integer
# category code (stored as `day`), and drop the calendar columns.
X = (
    df.drop("demand", axis=1)
    .assign(day=lambda frame: frame["dayOfWeek"].astype("category").cat.codes)
    .drop(columns=["dayOfWeek", "days", "yr", "mnth", "hr"])
)


# Gradient descent

In [None]:
def cost(theta,X,y):
    """Return the halved mean squared error J(theta) = 1/(2m) * sum((X.theta - y)^2).

    Parameters
    ----------
    theta : coefficient vector, one entry per column of ``X``.
    X : object exposing ``.dot(theta)`` (NumPy array or DataFrame), one row per sample.
    y : observed targets; ``m = len(y)`` is the number of samples.

    Returns
    -------
    The scalar cost for the given coefficients.
    """
    m = len(y)

    residuals = X.dot(theta) - y
    # The divisor must be 2*m. Written as `1/2*m`, operator precedence gives
    # (1/2)*m, which multiplies the cost by m instead of dividing by it.
    return np.sum(np.square(residuals)) / (2 * m)


def gradient(X,y,theta,learning_rate,iterations):
    """Run batch gradient descent for a fixed number of steps.

    Each step moves ``theta`` against the gradient of the squared-error cost,
    scaled by ``learning_rate / len(y)``, then records the updated
    coefficients and the value of ``cost`` for them.

    Returns
    -------
    (theta, cost_history, theta_history) where ``cost_history`` has one entry
    per step and ``theta_history`` has one row per step.
    """
    n_samples = len(y)
    n_features = X.shape[1]

    cost_history = np.zeros(iterations)
    theta_history = np.zeros((iterations, n_features))

    for step in range(iterations):
        # Residual of the current fit on the full data set.
        residual = np.dot(X, theta) - y

        scale = (1/n_samples)*learning_rate
        theta = theta - scale*(X.T.dot(residual))

        # Log the state reached after this update.
        theta_history[step, :] = theta.T
        cost_history[step] = cost(theta, X, y)

    return theta, cost_history, theta_history

In [None]:
# Settings for the gradient-descent run.
learning_rate = 0.08
iteration = 2000

# Start from an all-zero integer coefficient vector, one entry per feature column.
theta = np.zeros(X.shape[1], dtype=int)
theta, cost_history, theta_history = gradient(
    X, y, theta, learning_rate=learning_rate, iterations=iteration
)

In [None]:
# Coefficients reached by gradient descent.
print(theta)
# r2_score takes (y_true, y_pred). R² is not symmetric, so passing the
# predictions first reports a different (wrong) value.
print("\n\nR2 : ",metrics.r2_score(y, X.dot(theta)))

In [None]:
# Learning curve: the cost recorded after every gradient-descent step.
fig, ax = plt.subplots(figsize=(15,10))
ax.set_xlabel("iteration")
ax.set_ylabel("J(Theta)")
ax.plot(range(iteration),cost_history,'b.')

# Normal equation

In [None]:
def normal_equation(X, y):
    """Solve least squares in closed form: theta = pinv(X^T X) X^T y.

    The Moore-Penrose pseudo-inverse is used in place of a plain inverse so
    that a singular X^T X (e.g. collinear feature columns) yields the
    minimum-norm solution instead of raising ``LinAlgError``.

    Parameters
    ----------
    X : array-like of shape (m, n), one row per sample.
    y : array-like of length m with the observed targets.

    Returns
    -------
    numpy.ndarray holding the n fitted coefficients.
    """
    X_arr = np.asarray(X, dtype=float)
    y_arr = np.asarray(y, dtype=float)
    gram = X_arr.T @ X_arr
    # Form X^T y first so no n-by-m intermediate matrix is built.
    return np.linalg.pinv(gram) @ (X_arr.T @ y_arr)

# Closed-form coefficients, for comparison with the iterative fit.
theta = normal_equation(X,y)
print(theta)
# r2_score takes (y_true, y_pred); the observed targets must come first
# because R² is not symmetric in its arguments.
print("\n\nR2 : ",metrics.r2_score(y, X.dot(theta)))

# Scikit learn

## Train test split

In [None]:
# Build the feature frame as a new object. Assigning `Xdf = df` only aliases
# the frame, so column edits made through `Xdf` would also change `df` and
# make the cell depend on how many times it has been run.
Xdf = (
    df.assign(day=df["dayOfWeek"].astype("category").cat.codes)
    .drop(columns=["dayOfWeek", "demand"])
)
ydf = df["demand"]

# Fix the seed so the 80/20 split, and every score computed from it, is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    Xdf, ydf, train_size=0.8, random_state=42
)

## Linear Regression

In [None]:
# Linear-regression model trained on the training split.
# `fit` returns the estimator itself, so it can be chained onto construction.
model = LinearRegression().fit(X_train, y_train)
model

In [None]:
# Score the fitted model on the held-out rows.
y_pred = model.predict(X_test)

rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', rmse)

score_pct = model.score(X_test, y_test)*100
print("Score : ", score_pct)

## Random Forest

In [None]:
# A random forest is stochastic (bootstrap samples and feature sub-sampling).
# Pin the seed so the fitted trees, and the scores reported from them, are
# the same on every run.
model = RandomForestRegressor(n_estimators=400, max_depth=8, random_state=42)
model.fit(X_train,y_train)

In [None]:
# Score the fitted forest on the held-out rows.
y_pred = model.predict(X_test)

rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', rmse)

score_pct = model.score(X_test,y_test)*100
print("Score", score_pct)