- Import dependencies

In [2]:
import warnings
warnings.simplefilter('ignore')

# %matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

- Load in data from DataCleanup ipynb

In [3]:
data = pd.read_csv('cleaned_bike_data.csv')
data = data.dropna()
data.head()

Unnamed: 0.1,Unnamed: 0,age,duration,end_lat,end_long,gender,hour,start_lat,start_long,weekend,twenties
0,0,34.0,303,40.733812,-73.980544,0,0,40.740964,-73.986022,0,0
1,1,22.0,700,40.763094,-73.97835,0,0,40.739126,-73.979738,0,1
2,2,49.0,443,40.744449,-73.983035,0,0,40.729515,-73.990753,0,0
3,3,33.0,297,40.71924,-73.95242,0,0,40.710451,-73.960876,0,0
4,4,32.0,421,40.786995,-73.941648,0,0,40.799139,-73.938915,0,0


- Define X and y data from imported csv
- Remember to reshape y array

In [4]:
x_data = data.drop(['Unnamed: 0','age','twenties'],axis=1)
y_data = data['age'].values.reshape(-1, 1)
print(x_data.shape, y_data.shape)

(1550161, 9) (1550161, 1)


- Add dummy columns for each hour of the day

In [5]:
data_binary_encoded = pd.get_dummies(x_data, columns=["hour"])
data_binary_encoded.head()

Unnamed: 0,duration,end_lat,end_long,gender,start_lat,start_long,weekend,twenties,hour_0,hour_1,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,303,40.733812,-73.980544,0,40.740964,-73.986022,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,700,40.763094,-73.97835,0,40.739126,-73.979738,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,443,40.744449,-73.983035,0,40.729515,-73.990753,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,297,40.71924,-73.95242,0,40.710451,-73.960876,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,421,40.786995,-73.941648,0,40.799139,-73.938915,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


- Divide data into training and test samples

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_binary_encoded, y_data, random_state=42)

- Fit a scaler for X and y using training data

In [7]:
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

- Apply scaler to training and test samples

In [8]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

- Fit Linear Regression to X and Y training data

In [9]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train_scaled, y_train_scaled)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

- Use mean squared error to determine accuracy of model on test data

In [10]:
from sklearn.metrics import mean_squared_error

predictions = model.predict(X_test_scaled)
MSE = mean_squared_error(y_test_scaled, predictions)
r2 = model.score(X_test_scaled, y_test_scaled)

print(f"MSE: {MSE}, R2: {r2}")

MSE: 0.6155206086988146, R2: 0.3825900160849648


- Compare training results with test results.
- R^2 could be a lot better, so let's go back to the drawing board.

In [11]:
model.score(X_train_scaled, y_train_scaled)

0.38605956257427121

- Let's try to cluster around the 4 coordinate values provided, and see if we can use those values to our advantage.
- These pairs of beginning and ending coordinates are referred to as "lanes"

In [12]:
km_test = x_data[['start_lat','start_long','end_lat','end_long']]
km_test.head()

Unnamed: 0,start_lat,start_long,end_lat,end_long
0,40.740964,-73.986022,40.733812,-73.980544
1,40.739126,-73.979738,40.763094,-73.97835
2,40.729515,-73.990753,40.744449,-73.983035
3,40.710451,-73.960876,40.71924,-73.95242
4,40.799139,-73.938915,40.786995,-73.941648


- We used the code below to determine the optimal number of clusters. We landed at 10.

In [None]:
# from sklearn.cluster import KMeans

# r2s = []
# k_data = x_data.drop(['start_lat','start_long','end_lat','end_long'], axis=1)

# for n in range(3,15):
#     kmeans = KMeans(n_clusters=n)
#     kmeans.fit(km_test)
#     predicted_clusters = kmeans.predict(km_test)
    
#     k_data['trip_cluster'] = predicted_clusters
#     k_data_encoded = pd.get_dummies(k_data, columns=['trip_cluster','hour'])
    
#     X_train, X_test, y_train, y_test = train_test_split(k_data_encoded, y_data, random_state=42)
    
#     X_scaler = StandardScaler().fit(X_train)
#     y_scaler = StandardScaler().fit(y_train)
    
#     X_train_scaled = X_scaler.transform(X_train)
#     X_test_scaled = X_scaler.transform(X_test)
#     y_train_scaled = y_scaler.transform(y_train)
#     y_test_scaled = y_scaler.transform(y_test)
    
#     model = LinearRegression()
#     model.fit(X_train_scaled, y_train_scaled)
    
#     predictions = model.predict(X_test_scaled)
#     MSE = mean_squared_error(y_test_scaled, predictions)
#     r2 = model.score(X_test_scaled, y_test_scaled)
    
#     r2s.append(r2)
#     print(f"{n} clusters yielded r2 = {r2}, MSE = {MSE}")
    
    

- We'll use KMeans from SciKitLearn to break the data into 10 clusters. These are visualized in Tableau at https://public.tableau.com/profile/stefan.sampaleanu#!/vizhome/CitiBike_MachineLearningSupplement/Sheet1

In [14]:
from sklearn.cluster import KMeans

k_data = x_data.drop(['start_lat','start_long','end_lat','end_long'], axis=1)

kmeans = KMeans(n_clusters=10)
kmeans.fit(km_test)
predicted_clusters = kmeans.predict(km_test)

k_data['trip_cluster'] = predicted_clusters
k_data_encoded = pd.get_dummies(k, columns=['trip_cluster','hour'])
k_data_encoded.head()



Unnamed: 0,duration,gender,weekend,twenties,trip_cluster_0,trip_cluster_1,trip_cluster_2,trip_cluster_3,trip_cluster_4,trip_cluster_5,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,303,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,700,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,443,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,297,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,421,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


- Scale the data

In [15]:
X_train, X_test, y_train, y_test = train_test_split(k_data_encoded, y_data, random_state=42)

X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

- Fit the regression again and print some output. Clustering didn't seem to change much, as r^2 is still around .38.

In [16]:
model.fit(X_train_scaled, y_train_scaled)

predictions = model.predict(X_test_scaled)
MSE = mean_squared_error(y_test_scaled, predictions)
r2 = model.score(X_test_scaled, y_test_scaled)

print(f"Adding 10 trip clusters yielded r2 = {r2}, MSE = {MSE}")

Adding 10 trip clusters yielded r2 = 0.3836420252353503, MSE = 0.614471818867954


- Let's try a Random Forest Regression. Since our data is large, we'll go for a high number of estimators at 300.

In [17]:
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(n_estimators=300)
regr.fit(X_train_scaled, y_train_scaled)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

- Maybe scratch that for this regression.

In [18]:
regr.score(X_test_scaled,y_test_scaled)

0.22787651127741992

# Summary
We couldn't find a high enough R^2, so let's try redefining the question in search of a more accurate result. In the CitiBike_NeuralNetwork notebook, we'll try a Neural Network to predict whether or not a rider is in their 20s or not.