# TP2: Machine Learning

In [1]:
import pandas as pd

# import ML packages
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn import linear_model
from sklearn import preprocessing

# import plotting packages
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# Load the "improved" train/test CSVs plus the original test CSV from ../CSVs.
trainingSet = pd.read_csv('../CSVs/improved_trip_train.csv')
testingSet = pd.read_csv('../CSVs/improved_trip_test.csv')
# NOTE(review): testingOriginal is not referenced in the cells visible here - confirm it is used later.
testingOriginal = pd.read_csv('../CSVs/trip_test.csv')
# Show the dtype of every training-set column as the cell output.
trainingSet.dtypes

id                     int64
duration               int64
start_date            object
start_dayOfWeek        int64
start_week             int64
start_quarter          int64
start_time            object
start_hour             int64
start_minute           int64
start_station_name    object
start_station_id       int64
end_date              object
end_dayOfWeek          int64
end_week               int64
end_quarter            int64
end_time              object
end_hour               int64
end_minute             int64
end_station_name      object
end_station_id         int64
bike_id                int64
subscription_type     object
zip_code              object
dtype: object

In [3]:
# Preview the first rows of the training set.
trainingSet.head()

Unnamed: 0,id,duration,start_date,start_dayOfWeek,start_week,start_quarter,start_time,start_hour,start_minute,start_station_name,...,end_week,end_quarter,end_time,end_hour,end_minute,end_station_name,end_station_id,bike_id,subscription_type,zip_code
0,907649,396,2015-08-27,3,35,3,08:36:00,8,36,Harry Bridges Plaza (Ferry Building),...,35,3,08:43:00,8,43,Embarcadero at Sansome,60,187,Subscriber,94602
1,384043,636,2014-07-28,0,31,3,22:06:00,22,6,Market at 10th,...,31,3,22:17:00,22,17,Washington at Kearny,46,417,Subscriber,94133
2,316176,334,2014-06-09,0,24,2,08:42:00,8,42,Market at Sansome,...,24,2,08:47:00,8,47,2nd at Folsom,62,281,Subscriber,94107
3,618874,666,2015-01-26,0,5,1,16:55:00,16,55,San Francisco Caltrain 2 (330 Townsend),...,5,1,17:07:00,17,7,Temporary Transbay Terminal (Howard at Beale),55,634,Subscriber,94602
4,910977,318,2015-08-29,5,35,3,15:09:00,15,9,Market at 10th,...,35,3,15:14:00,15,14,Powell Street BART,39,607,Subscriber,94709


In [4]:
# (rows, columns) of the training set.
trainingSet.shape

(549961, 23)

In [5]:
# Preview the first rows of the test set.
testingSet.head()

Unnamed: 0,id,start_date,start_dayOfWeek,start_week,start_quarter,start_time,start_hour,start_minute,start_station_name,start_station_id,...,end_week,end_quarter,end_time,end_hour,end_minute,end_station_name,end_station_id,bike_id,subscription_type,zip_code
0,504737,2014-10-18,5,42,4,11:25:00,11,25,Embarcadero at Sansome,60,...,42,4,12:12:00,12,12,Powell at Post (Union Square),71,426,Customer,77009
1,530846,2014-11-05,2,45,4,13:00:00,13,0,Embarcadero at Folsom,51,...,45,4,13:09:00,13,9,Broadway St at Battery St,82,454,Subscriber,94132
2,813140,2015-06-18,3,25,2,17:34:00,17,34,San Francisco Caltrain (Townsend at 4th),70,...,25,2,17:37:00,17,37,2nd at Townsend,61,370,Subscriber,94107
3,897674,2015-08-20,3,34,3,07:06:00,7,6,Civic Center BART (7th at Market),72,...,34,3,07:15:00,7,15,Townsend at 7th,65,451,Subscriber,94582
4,322830,2014-06-13,4,24,2,08:46:00,8,46,San Francisco Caltrain 2 (330 Townsend),69,...,24,2,08:57:00,8,57,Embarcadero at Folsom,51,603,Subscriber,95014


In [6]:
# Mean of the duration column over the whole training set.
trainingSet.duration.mean()

1113.4153549069842

### Let's study the correlation between duration and the other features

In [7]:
# Pearson correlation of every numeric column with `duration`.
# The numeric columns are selected explicitly: the frame also holds object
# columns (dates, times, station names, ...), and recent pandas releases raise
# on them in DataFrame.corr() instead of silently skipping them.
trainingSet.select_dtypes(include=['number']).corr()['duration']

id                 -0.003133
duration            1.000000
start_dayOfWeek     0.018351
start_week          0.004002
start_quarter       0.003648
start_hour          0.002598
start_minute        0.002586
start_station_id   -0.007001
end_dayOfWeek       0.013753
end_week            0.002100
end_quarter         0.001376
end_hour            0.009626
end_minute         -0.000530
end_station_id     -0.006026
bike_id            -0.001569
Name: duration, dtype: float64

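All of these correlations are very weak (|r| < 0.02), so none of the features is a strong linear predictor. The features most correlated with the duration (in absolute value) are: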
    - start_dayOfWeek     0.018351
    - end_dayOfWeek       0.013753
    - end_hour            0.009626
    - start_station_id   -0.007001

In [8]:
# Collect the training-set column names into a plain Python list and display it.
columns = list(trainingSet.columns)
columns

['id',
 'duration',
 'start_date',
 'start_dayOfWeek',
 'start_week',
 'start_quarter',
 'start_time',
 'start_hour',
 'start_minute',
 'start_station_name',
 'start_station_id',
 'end_date',
 'end_dayOfWeek',
 'end_week',
 'end_quarter',
 'end_time',
 'end_hour',
 'end_minute',
 'end_station_name',
 'end_station_id',
 'bike_id',
 'subscription_type',
 'zip_code']

In [9]:
# Show the column dtypes again (same call as at the end of the loading cell).
trainingSet.dtypes

id                     int64
duration               int64
start_date            object
start_dayOfWeek        int64
start_week             int64
start_quarter          int64
start_time            object
start_hour             int64
start_minute           int64
start_station_name    object
start_station_id       int64
end_date              object
end_dayOfWeek          int64
end_week               int64
end_quarter            int64
end_time              object
end_hour               int64
end_minute             int64
end_station_name      object
end_station_id         int64
bike_id                int64
subscription_type     object
zip_code              object
dtype: object

In [10]:
# Create a scikit-learn LinearRegression estimator with its default settings;
# it is fitted on the training set in a later cell.
regr = linear_model.LinearRegression()

In [11]:
# Re-check the (rows, columns) of the training set before fitting.
trainingSet.shape

(549961, 23)

In [12]:
# Fit the regression with `duration` as the target and four feature columns
# (the ones listed in the correlation notes above) as inputs.
feature_cols = ['start_dayOfWeek', 'end_dayOfWeek', 'end_hour', 'start_station_id']
regr.fit(trainingSet[feature_cols], trainingSet['duration'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [13]:
# Predict durations for the test set from the same four feature columns used for fitting.
test_features = testingSet[['start_dayOfWeek', 'end_dayOfWeek', 'end_hour', 'start_station_id']]
prediction = regr.predict(test_features)

In [14]:
# Display the predicted durations returned by regr.predict.
prediction

array([ 1603.57291184,  1069.66886328,  1298.9958606 , ...,  1784.36773396,
        1544.50655641,   596.68919234])

In [15]:
# Pair each test-set trip id with its predicted duration in a two-column frame.
duration = dict(id=testingSet['id'], duration=prediction)
df_duration = pd.DataFrame(duration, columns=['id', 'duration'])

In [16]:
# Display the id / predicted-duration table.
df_duration

Unnamed: 0,id,duration
0,504737,1603.572912
1,530846,1069.668863
2,813140,1298.995861
3,897674,836.063931
4,322830,1126.487657
5,487841,1118.016993
6,677808,1150.710172
7,704449,835.598940
8,833587,1760.988055
9,420411,1534.923806


# HISTORICO

In [87]:
# Build a narrower DataFrame that is more comfortable to work with. The goal is to
# compute the HISTORICAL value of each trip.
# Historical: the average trip duration between one station and another.
short_cols = [
    'id',
    'duration',
    'start_station_name',
    'start_station_id',
    'end_station_name',
    'end_station_id',
]
trainingShort = trainingSet.loc[:, short_cols]
trainingShort

Unnamed: 0,id,duration,start_station_name,start_station_id,end_station_name,end_station_id
0,907649,396,Harry Bridges Plaza (Ferry Building),50,Embarcadero at Sansome,60
1,384043,636,Market at 10th,67,Washington at Kearny,46
2,316176,334,Market at Sansome,77,2nd at Folsom,62
3,618874,666,San Francisco Caltrain 2 (330 Townsend),69,Temporary Transbay Terminal (Howard at Beale),55
4,910977,318,Market at 10th,67,Powell Street BART,39
5,522083,337,Townsend at 7th,65,San Francisco Caltrain (Townsend at 4th),70
6,880809,394,2nd at South Park,64,2nd at Townsend,61
7,488938,766,Powell at Post (Union Square),71,San Francisco Caltrain (Townsend at 4th),70
8,899522,531,2nd at Folsom,62,San Francisco Caltrain (Townsend at 4th),70
9,737380,267,Market at 4th,76,Mechanics Plaza (Market at Battery),75


## Modo prueba para probar el algoritmo

In [40]:
# Tiny hand-made trip table (id, duration, start station, end station) used to
# try out the historical-average algorithm before running it on the real data.
prueba = dict(
    id=[1, 2, 3, 4, 5],
    duration=[31, 21, 10, 15, 6],
    start=[1, 1, 3, 2, 3],
    end=[2, 3, 2, 1, 2],
)

dfPrueba = pd.DataFrame(prueba, columns=['id', 'duration', 'start', 'end'])
dfPrueba

Unnamed: 0,id,duration,start,end
0,1,31,1,2
1,2,21,1,3
2,3,10,3,2
3,4,15,2,1
4,5,6,3,2


In [56]:
# Build the lists of distinct departure and arrival station ids (duplicates removed).
# Series.unique() keeps the ids in order of first appearance, which is the same
# order the manual "append if not already in the list" scan produced.
listaStart = list(dfPrueba['start'].unique())
listaEnd = list(dfPrueba['end'].unique())

In [73]:
# Build a list of [start_id, end_id, historical] entries, where `historical` is the
# mean duration of the trips made between that pair of stations.
# Pairs with no historical value (no trip was ever made between those two
# stations) are left out.
# All pair means are computed once with a groupby instead of re-filtering the
# frame for every (start, end) combination; this also removes the call to
# math.isnan, which failed on a fresh kernel because `math` is not imported in
# the notebook's import cell.
mean_by_pair = dfPrueba.groupby(['start', 'end'])['duration'].mean().dropna()

# Iterate start-major / end-minor so the entries keep the original ordering.
listaHistorico = [
    [i, j, mean_by_pair[(i, j)]]
    for i in listaStart
    for j in listaEnd
    if (i, j) in mean_by_pair.index
]

listaHistorico

[[1, 2, 31.0], [1, 3, 21.0], [3, 2, 8.0], [2, 1, 15.0]]

In [77]:
# Split the [start, end, historical] entries into three parallel lists.
starStationId = [entry[0] for entry in listaHistorico]
endStationId = [entry[1] for entry in listaHistorico]
historical = [entry[2] for entry in listaHistorico]

# One row per entry of listaHistorico.
data = dict(start=starStationId, end=endStationId, historical=historical)

dfData = pd.DataFrame(data, columns=['start', 'end', 'historical'])
dfData

Unnamed: 0,start,end,historical
0,1,2,31
1,1,3,21
2,3,2,8
3,2,1,15


In [79]:
# Attach the historical mean to every trip with an inner join on its (start, end) pair.
result = dfPrueba.merge(dfData, how='inner', on=['start', 'end'])
result

Unnamed: 0,id,duration,start,end,historical
0,1,31,1,2,31
1,2,21,1,3,21
2,3,10,3,2,8
3,5,6,3,2,8
4,4,15,2,1,15


## Ahora la prueba de fuego con el Data Frame del Training Reducido

In [90]:
# Distinct start and end station ids of the reduced training frame, in order of
# first appearance. Series.unique() replaces the per-row Python loop with a
# `not in` list scan, which is slow on a frame of this size.
listaStart = list(trainingShort['start_station_id'].unique())
listaEnd = list(trainingShort['end_station_id'].unique())


In [91]:
# Build the [start_station_id, end_station_id, historical] entries for the
# training data, where `historical` is the mean duration of the trips between
# that pair of stations; pairs without any trip are left out.
# The original version filtered the whole frame once per (start, end)
# combination, which took a while, and called math.isnan although `math` is not
# imported in the notebook's import cell. A single groupby computes every pair
# mean in one pass.
mean_by_pair = (
    trainingShort
    .groupby(['start_station_id', 'end_station_id'])['duration']
    .mean()
    .dropna()
)

# Iterate start-major / end-minor so the entries keep the original ordering.
listaHistorico = [
    [i, j, mean_by_pair[(i, j)]]
    for i in listaStart
    for j in listaEnd
    if (i, j) in mean_by_pair.index
]

listaHistorico

[[50, 60, 1254.993877147936],
 [50, 46, 2679.90099009901],
 [50, 62, 2177.8607350096713],
 [50, 55, 502.29606625258799],
 [50, 39, 1938.610909090909],
 [50, 70, 903.78094462540719],
 [50, 61, 662.55546241967033],
 [50, 75, 955.34452296819791],
 [50, 67, 2432.1732026143791],
 [50, 72, 7406.0220588235297],
 [50, 51, 1104.3922651933701],
 [50, 76, 1278.5260663507108],
 [50, 50, 6343.4900849858359],
 [50, 66, 3062.5027027027027],
 [50, 54, 788.76531671858777],
 [50, 64, 842.403162055336],
 [50, 65, 1070.6219047619047],
 [50, 45, 846.55012224938878],
 [50, 77, 834.04506699147385],
 [50, 82, 1168.3443396226414],
 [50, 74, 4673.119565217391],
 [50, 68, 968.3078101071975],
 [50, 71, 2404.4898477157362],
 [50, 56, 2053.1680000000001],
 [50, 49, 453.73065015479875],
 [50, 57, 925.07014028056108],
 [50, 63, 661.69466882067854],
 [50, 48, 1801.127619047619],
 [50, 69, 1076.7346760070052],
 [50, 47, 1400.2985074626865],
 [50, 42, 1246.1496062992126],
 [50, 41, 1677.0087336244542],
 [50, 73, 2730.59

In [92]:
# Split the [start_station_id, end_station_id, historical] entries into three parallel lists.
starStationId = [entry[0] for entry in listaHistorico]
endStationId = [entry[1] for entry in listaHistorico]
historical = [entry[2] for entry in listaHistorico]

# One row per entry of listaHistorico, keyed by the same column names as trainingShort.
data = dict(
    start_station_id=starStationId,
    end_station_id=endStationId,
    historical=historical,
)

dfData = pd.DataFrame(data, columns=['start_station_id', 'end_station_id', 'historical'])
dfData

Unnamed: 0,start_station_id,end_station_id,historical
0,50,60,1254.993877
1,50,46,2679.900990
2,50,62,2177.860735
3,50,55,502.296066
4,50,39,1938.610909
5,50,70,903.780945
6,50,61,662.555462
7,50,75,955.344523
8,50,67,2432.173203
9,50,72,7406.022059


In [94]:
# Attach each trip's historical mean duration by joining on its station pair.
# Any 'historical' column left by a previous run of this cell is dropped first,
# so re-running the cell does not produce historical_x / historical_y columns.
trainingShort = pd.merge(
    trainingShort.drop('historical', axis=1, errors='ignore'),
    dfData,
    on=['start_station_id', 'end_station_id'],
    how='inner'
)
trainingShort

Unnamed: 0,id,duration,start_station_name,start_station_id,end_station_name,end_station_id,historical
0,907649,396,Harry Bridges Plaza (Ferry Building),50,Embarcadero at Sansome,60,1254.993877
1,807007,421,Harry Bridges Plaza (Ferry Building),50,Embarcadero at Sansome,60,1254.993877
2,343864,9624,Harry Bridges Plaza (Ferry Building),50,Embarcadero at Sansome,60,1254.993877
3,451164,1670,Harry Bridges Plaza (Ferry Building),50,Embarcadero at Sansome,60,1254.993877
4,101973,484,Harry Bridges Plaza (Ferry Building),50,Embarcadero at Sansome,60,1254.993877
5,502639,363,Harry Bridges Plaza (Ferry Building),50,Embarcadero at Sansome,60,1254.993877
6,842301,446,Harry Bridges Plaza (Ferry Building),50,Embarcadero at Sansome,60,1254.993877
7,300655,385,Harry Bridges Plaza (Ferry Building),50,Embarcadero at Sansome,60,1254.993877
8,781372,388,Harry Bridges Plaza (Ferry Building),50,Embarcadero at Sansome,60,1254.993877
9,727968,517,Harry Bridges Plaza (Ferry Building),50,Embarcadero at Sansome,60,1254.993877
