# Classification, Regression and Other Prediction Model

## Dataset

We‘ll use "201707-citibike-tripdata.csv.zip" (after preprocessed in HW0)

## Schema

- Every station’s information
    - id, name, lat, lng
- Every stations’ flow data
    - id, time, in-flow, out-flow

### Import packages

In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.plotly as py
import os
from time import time
from plotly.graph_objs import *
from mpl_toolkits.mplot3d import Axes3D
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

### Read csv to dataframe
use pandas to read data

In [42]:
# preprocessed dataset
df = pd.read_csv('./201707-citibike-tripdata-preprocessed.csv')
df.head()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,364,2017-07-01 00:00:00,2017-07-01 00:06:05,539,Metropolitan Ave & Bedford Ave,40.715348,-73.960241,3107,Bedford Ave & Nassau Ave,40.723117,-73.952123,14744,Subscriber,1986.0,1
1,2142,2017-07-01 00:00:03,2017-07-01 00:35:46,293,Lafayette St & E 8 St,40.730207,-73.991026,3425,2 Ave & E 104 St,40.78921,-73.943708,19587,Subscriber,1981.0,1
2,328,2017-07-01 00:00:08,2017-07-01 00:05:37,3242,Schermerhorn St & Court St,40.691029,-73.991834,3397,Court St & Nelson St,40.676395,-73.998699,27937,Subscriber,1984.0,2
3,2530,2017-07-01 00:00:11,2017-07-01 00:42:22,2002,Wythe Ave & Metropolitan Ave,40.716887,-73.963198,398,Atlantic Ave & Furman St,40.691652,-73.999979,26066,Subscriber,1985.0,1
4,2534,2017-07-01 00:00:15,2017-07-01 00:42:29,2002,Wythe Ave & Metropolitan Ave,40.716887,-73.963198,398,Atlantic Ave & Furman St,40.691652,-73.999979,29408,Subscriber,1982.0,2


In [43]:
# every station's information
station_info = pd.read_csv('./station_info.csv')
station_info.head()

Unnamed: 0,station id,station name,station latitude,station logitude
0,539,Metropolitan Ave & Bedford Ave,40.715348,-73.960241
1,293,Lafayette St & E 8 St,40.730207,-73.991026
2,3242,Schermerhorn St & Court St,40.691029,-73.991834
3,2002,Wythe Ave & Metropolitan Ave,40.716887,-73.963198
4,361,Allen St & Hester St,40.716059,-73.991908


In [44]:
# every station's in-flow data
station_in_flow = pd.read_csv('./in_flow.csv')
station_in_flow.head()

Unnamed: 0,72,79,82,83,116,119,120,127,128,143,...,2003,2005,2006,2008,2009,2010,2012,2021,2022,2023
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,3.0,1.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,...,1.0,0.0,0.0,2.0,0.0,1.0,0.0,2.0,0.0,1.0
2,2.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
# every station's out-flow data
station_out_flow = pd.read_csv('./out_flow.csv')
station_out_flow.head()

Unnamed: 0,72,79,82,83,116,119,120,127,128,143,...,2003,2005,2006,2008,2009,2010,2012,2021,2022,2023
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3.0,0.0,...,0.0,0.0,1.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,1.0,0.0,2.0,0.0,0.0,2.0,0.0,2.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,2.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,0.0,2.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## Using historical (14 days) data to predict every station's outflow tomorrow (1 day)

### Extract following values

- station_id
- outflow(and this is we want to predict)

In [46]:
station_out_flow.head()

Unnamed: 0,72,79,82,83,116,119,120,127,128,143,...,2003,2005,2006,2008,2009,2010,2012,2021,2022,2023
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3.0,0.0,...,0.0,0.0,1.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,1.0,0.0,2.0,0.0,0.0,2.0,0.0,2.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,2.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,0.0,2.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


### Discretize outflow
- discretize with divided by every station's outflow standard deviation and round to integer
- process them so it can be solve as a classification problem

By previous homework's results, we can find divided by 5 is a good way to discretize so apply it and round the value to integer.

In [95]:
station_out_dis = (station_out_flow / 5).round(0)
station_out_dis.head()

Unnamed: 0,72,79,82,83,116,119,120,127,128,143,...,2003,2005,2006,2008,2009,2010,2012,2021,2022,2023
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Use previous (14 days) data to estimate next days’ outflow
- use a sliding window to increase our data (shift k days each time, and determine the k = 1 )

In [48]:
def get_data(idx, st):
    df = pd.DataFrame(station_out_dis.iloc[st * 48 : (15 + st) * 48, idx]).T
    df.columns = [i for i in range(df.shape[1])]
    return df

def get_station(idx):
    data = pd.DataFrame()
    res = []
    for i in range(16):
        data = data.append(get_data(idx, i))
    return data

- We can use ```get_station(index)``` to get the station's outflow data from 7/01 - 7/15 to 7/16 - 7/30 in each row

In [49]:
get_station(1).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,710,711,712,713,714,715,716,717,718,719
79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Evaluate each model and calculate the mean accuracy and time

In [50]:
def eval_model(clf):
    ans = 0
    t = time()
    for idx in range(634):
        data = get_station(idx)
        train_x, test_x, train_y, test_y = train_test_split(data.iloc[:, :14 * 48], data.iloc[:, 14 * 48:], test_size = 0.3)
        for i in range(48):
            ans += clf.fit(train_x, train_y.iloc[:, i]).score(test_x, test_y.iloc[:, i])
    print 'average accuracy for 48 timeslot: {:.4f}'.format((ans / 634.0 / 48.0))
    print 'time: {:.2f} sec'.format(time() - t)

## Try following models (as classification problem)

compare the computation time and result ( average accuracy for 48 timeslot )

### K-Nearest-Neighbor

By previous homework, the results of Kmeans and PCA => Agglomerative Clustering look like we can divided the data into 3 - 4 parts so we choose k = 3 or 4.

In [51]:
clf = OneVsRestClassifier(KNeighborsClassifier(n_neighbors = 3))
eval_model(clf)

average accuracy for 48 timeslot: 0.9054
time: 77.86 sec


In [52]:
clf = OneVsRestClassifier(KNeighborsClassifier(n_neighbors = 4))
eval_model(clf)

average accuracy for 48 timeslot: 0.9117
time: 74.71 sec


### Naive Bayes

Choose the multinomial naive bayes because the outflows are match the multinomial model rather than Gaussian during weekday and weekend.

[package](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html)

In [53]:
clf = OneVsRestClassifier(MultinomialNB())
eval_model(clf)

average accuracy for 48 timeslot: 0.9105
time: 74.25 sec


### Random Forest

Setting max_depth to prevent the decision tree being too deep leads to overfitting and wasting time

[package](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)

In [54]:
clf = OneVsRestClassifier(RandomForestClassifier(max_depth = 2))
eval_model(clf)

average accuracy for 48 timeslot: 0.9071
time: 332.95 sec


### Support vector machine(SVC)

Choose LinearSVC because it is the scalable Linear Support Vector Machine for classification implemented using liblinear.

Similar to SVC with parameter kernel=’linear’, but implemented in terms of liblinear rather than libsvm, so it has more flexibility in the choice of penalties and loss functions and should scale better to large numbers of samples.

This class supports both dense and sparse input and the multiclass support is handled according to a one-vs-the-rest scheme.

[package](http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC)

In [55]:
clf = OneVsRestClassifier(LinearSVC())
eval_model(clf)

average accuracy for 48 timeslot: 0.9129
time: 74.06 sec


### other


### Compare and Observation

## Calculate the confusion matrix

generate the label by collecting all the target values and construct the confusion matrix.

In [107]:
label = set()

for i in range(48):
    for item in pd.unique(station_out_dis.iloc[:, -i]):
        label.add(item)
label = list(label)
num_l = len(label)

clf = OneVsRestClassifier(MultinomialNB())
mat = np.zeros([num_l, num_l], dtype = np.int)
for idx in range(634):
        data = get_station(idx)
        train_x, test_x, train_y, test_y = train_test_split(data.iloc[:, :14 * 48], data.iloc[:, 14 * 48:], test_size = 0.3)
        mat += (confusion_matrix(clf.fit(train_x, train_y.iloc[:, 0]).predict(test_x), test_y.iloc[:, 0], labels = label))

Print the confusion matrix for predicting the first hour in one day
for Naive Bayes.

In [108]:
mat

array([[2975,  151,    4,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [  26,   13,    1,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0],

## Performance with different parameters in SVM

## Try following models (as regression problem)

compare the computation time and result ( Mean square error )

### ARIMA

In [112]:
import statsmodels.api as sm

ImportError: cannot import name datetools

### Bayesian regression

### Decision tree regression

### Support vector machine(SVR)

### other

### Compare and Observation

our own