# TP2: Machine Learning

### Imports

In [16]:
import pandas as pd
from datetime import datetime
import scipy.spatial
from sklearn import preprocessing

### Data loading

In [17]:
stationDF = pd.read_csv('../CSVs/station.csv')
trainingSet = pd.read_csv('../CSVs/trip_train.csv')
testingSet = pd.read_csv('../CSVs/trip_test.csv')

## Basic data analysis

In [18]:
stationDF

Unnamed: 0,id,name,lat,long,dock_count,city,installation_date
0,2,San Jose Diridon Caltrain Station,37.329732,-121.901782,27,San Jose,8/6/2013
1,3,San Jose Civic Center,37.330698,-121.888979,15,San Jose,8/5/2013
2,4,Santa Clara at Almaden,37.333988,-121.894902,11,San Jose,8/6/2013
3,5,Adobe on Almaden,37.331415,-121.893200,19,San Jose,8/5/2013
4,6,San Pedro Square,37.336721,-121.894074,15,San Jose,8/7/2013
5,7,Paseo de San Antonio,37.333798,-121.886943,15,San Jose,8/7/2013
6,8,San Salvador at 1st,37.330165,-121.885831,15,San Jose,8/5/2013
7,9,Japantown,37.348742,-121.894715,15,San Jose,8/5/2013
8,10,San Jose City Hall,37.337391,-121.886995,15,San Jose,8/6/2013
9,11,MLK Library,37.335885,-121.885660,19,San Jose,8/6/2013


In [19]:
# Report the (rows, columns) shape of the station table.
# Written as a single-argument call so it runs unchanged on Python 2 and Python 3
# (the bare `print "..."` statement is a SyntaxError on Python 3) and prints the same text.
print("stationDF.shape:  %s" % (stationDF.shape,))

stationDF.shape:  (70, 7)


In [20]:
distancesDF = pd.DataFrame(columns=["start_station_id", "end_station_id", "distance"])

In [21]:
# Build the station-to-station distance table: one row per ordered pair of
# stations, including each station paired with itself (distance 0).
#
# The rows are collected in a plain list and the DataFrame is built once.
# Calling DataFrame.append inside the loop copies the whole frame on every
# iteration (quadratic) and DataFrame.append was removed in pandas 2.0.
#
# NOTE: cityblock() is applied to raw (lat, long) pairs, so "distance" is a
# Manhattan distance expressed in degrees, not in metres.
registros = []
for station, lat, lon in zip(stationDF.id, stationDF.lat, stationDF.long):
    for station2, lat2, lon2 in zip(stationDF.id, stationDF.lat, stationDF.long):
        registros.append({
            "start_station_id": station,
            "end_station_id": station2,
            "distance": scipy.spatial.distance.cityblock([lat, lon], [lat2, lon2])
        })

distancesDF = pd.DataFrame(registros, columns=["start_station_id", "end_station_id", "distance"])
        

        

In [22]:
distancesDF

Unnamed: 0,start_station_id,end_station_id,distance
0,2,2,0.000000
1,2,3,0.013769
2,2,4,0.011136
3,2,5,0.010265
4,2,6,0.014697
5,2,7,0.018905
6,2,8,0.016384
7,2,9,0.026077
8,2,10,0.022446
9,2,11,0.022275


In [23]:
distancesDF['start_station_id'] = distancesDF.start_station_id.astype(int)
distancesDF['end_station_id'] = distancesDF.end_station_id.astype(int)
distancesDF

Unnamed: 0,start_station_id,end_station_id,distance
0,2,2,0.000000
1,2,3,0.013769
2,2,4,0.011136
3,2,5,0.010265
4,2,6,0.014697
5,2,7,0.018905
6,2,8,0.016384
7,2,9,0.026077
8,2,10,0.022446
9,2,11,0.022275


### Training set

In [24]:
trainingSet.head()

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,zip_code
0,907649,396,8/27/2015 8:36,Harry Bridges Plaza (Ferry Building),50,8/27/2015 8:43,Embarcadero at Sansome,60,187,Subscriber,94602
1,384043,636,7/28/2014 22:06,Market at 10th,67,7/28/2014 22:17,Washington at Kearny,46,417,Subscriber,94133
2,316176,334,6/9/2014 8:42,Market at Sansome,77,6/9/2014 8:47,2nd at Folsom,62,281,Subscriber,94107
3,618874,666,1/26/2015 16:55,San Francisco Caltrain 2 (330 Townsend),69,1/26/2015 17:07,Temporary Transbay Terminal (Howard at Beale),55,634,Subscriber,94602
4,910977,318,8/29/2015 15:09,Market at 10th,67,8/29/2015 15:14,Powell Street BART,39,607,Subscriber,94709


In [25]:
trainingSet.dtypes

id                     int64
duration               int64
start_date            object
start_station_name    object
start_station_id       int64
end_date              object
end_station_name      object
end_station_id         int64
bike_id                int64
subscription_type     object
zip_code              object
dtype: object

### Convert date columns to datetime

In [26]:
trainingSet['start_date'] = pd.to_datetime(trainingSet.start_date)
trainingSet['end_date'] = pd.to_datetime(trainingSet.end_date)

In [27]:
testingSet['start_date'] = pd.to_datetime(testingSet.start_date)
testingSet['end_date'] = pd.to_datetime(testingSet.end_date)

In [28]:
trainingSet.dtypes

id                             int64
duration                       int64
start_date            datetime64[ns]
start_station_name            object
start_station_id               int64
end_date              datetime64[ns]
end_station_name              object
end_station_id                 int64
bike_id                        int64
subscription_type             object
zip_code                      object
dtype: object

In [29]:
# Expand the trip start/end timestamps of the training set into separate
# calendar/time columns named "<start|end>_<suffix>".
# Each pair is (column suffix, attribute of the pandas `.dt` accessor).
# 'date' must stay last: it overwrites the "<prefix>_date" timestamp column
# with plain dates, after which `.dt` can no longer be used on it.
componentesFecha = (
    ('dayOfWeek', 'dayofweek'),
    ('week', 'week'),
    ('quarter', 'quarter'),
    ('time', 'time'),
    ('hour', 'hour'),
    ('minute', 'minute'),
    ('year', 'year'),
    ('month', 'month'),
    ('day', 'day'),
    ('date', 'date'),
)

for prefijo in ('start', 'end'):
    # Hold on to the datetime64 column before its name is reused for the date-only values.
    marcaTiempo = trainingSet[prefijo + '_date']
    for sufijo, atributo in componentesFecha:
        trainingSet[prefijo + '_' + sufijo] = getattr(marcaTiempo.dt, atributo)

In [30]:
# Expand the trip start/end timestamps of the test set into separate
# calendar/time columns named "<start|end>_<suffix>".
# Each pair is (column suffix, attribute of the pandas `.dt` accessor).
# 'date' must stay last: it overwrites the "<prefix>_date" timestamp column
# with plain dates, after which `.dt` can no longer be used on it.
# NOTE(review): unlike the training-set cell, no *_year / *_month / *_day
# columns are produced here - confirm that this asymmetry is intended.
componentesFechaTest = (
    ('dayOfWeek', 'dayofweek'),
    ('week', 'week'),
    ('quarter', 'quarter'),
    ('time', 'time'),
    ('hour', 'hour'),
    ('minute', 'minute'),
    ('date', 'date'),
)

for prefijo in ('start', 'end'):
    # Hold on to the datetime64 column before its name is reused for the date-only values.
    marcaTiempo = testingSet[prefijo + '_date']
    for sufijo, atributo in componentesFechaTest:
        testingSet[prefijo + '_' + sufijo] = getattr(marcaTiempo.dt, atributo)

In [31]:
trainingSet.head()

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,...,start_day,end_dayOfWeek,end_week,end_quarter,end_time,end_hour,end_minute,end_year,end_month,end_day
0,907649,396,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,187,Subscriber,...,27,3,35,3,08:43:00,8,43,2015,8,27
1,384043,636,2014-07-28,Market at 10th,67,2014-07-28,Washington at Kearny,46,417,Subscriber,...,28,0,31,3,22:17:00,22,17,2014,7,28
2,316176,334,2014-06-09,Market at Sansome,77,2014-06-09,2nd at Folsom,62,281,Subscriber,...,9,0,24,2,08:47:00,8,47,2014,6,9
3,618874,666,2015-01-26,San Francisco Caltrain 2 (330 Townsend),69,2015-01-26,Temporary Transbay Terminal (Howard at Beale),55,634,Subscriber,...,26,0,5,1,17:07:00,17,7,2015,1,26
4,910977,318,2015-08-29,Market at 10th,67,2015-08-29,Powell Street BART,39,607,Subscriber,...,29,5,35,3,15:14:00,15,14,2015,8,29


In [32]:
list(trainingSet.columns.values)

['id',
 'duration',
 'start_date',
 'start_station_name',
 'start_station_id',
 'end_date',
 'end_station_name',
 'end_station_id',
 'bike_id',
 'subscription_type',
 'zip_code',
 'start_dayOfWeek',
 'start_week',
 'start_quarter',
 'start_time',
 'start_hour',
 'start_minute',
 'start_year',
 'start_month',
 'start_day',
 'end_dayOfWeek',
 'end_week',
 'end_quarter',
 'end_time',
 'end_hour',
 'end_minute',
 'end_year',
 'end_month',
 'end_day']

In [33]:
list(testingSet.columns.values)

['id',
 'start_date',
 'start_station_name',
 'start_station_id',
 'end_date',
 'end_station_name',
 'end_station_id',
 'bike_id',
 'subscription_type',
 'zip_code',
 'start_dayOfWeek',
 'start_week',
 'start_quarter',
 'start_time',
 'start_hour',
 'start_minute',
 'end_dayOfWeek',
 'end_week',
 'end_quarter',
 'end_time',
 'end_hour',
 'end_minute']

In [34]:
# Reorder the test-set columns so that the fields derived from each timestamp
# sit right after their "<start|end>_date" column. The list names the same 22
# columns the frame already has: nothing is added or dropped, only the order changes.
testingSet = testingSet[['id', 
                           'start_date',
                           'start_dayOfWeek',
                           'start_week',
                           'start_quarter', 
                           'start_time',
                           'start_hour',
                           'start_minute',
                           'start_station_name', 
                           'start_station_id', 
                           'end_date', 
                           'end_dayOfWeek',
                           'end_week',
                           'end_quarter', 
                           'end_time',
                           'end_hour',
                           'end_minute',
                           'end_station_name', 
                           'end_station_id', 
                           'bike_id',
                           'subscription_type',
                           'zip_code']]
                  

In [35]:
trainingSet.dtypes

id                     int64
duration               int64
start_date            object
start_station_name    object
start_station_id       int64
end_date              object
end_station_name      object
end_station_id         int64
bike_id                int64
subscription_type     object
zip_code              object
start_dayOfWeek        int64
start_week             int64
start_quarter          int64
start_time            object
start_hour             int64
start_minute           int64
start_year             int64
start_month            int64
start_day              int64
end_dayOfWeek          int64
end_week               int64
end_quarter            int64
end_time              object
end_hour               int64
end_minute             int64
end_year               int64
end_month              int64
end_day                int64
dtype: object

In [36]:
testingSet.dtypes

id                     int64
start_date            object
start_dayOfWeek        int64
start_week             int64
start_quarter          int64
start_time            object
start_hour             int64
start_minute           int64
start_station_name    object
start_station_id       int64
end_date              object
end_dayOfWeek          int64
end_week               int64
end_quarter            int64
end_time              object
end_hour               int64
end_minute             int64
end_station_name      object
end_station_id         int64
bike_id                int64
subscription_type     object
zip_code              object
dtype: object

In [37]:
trainingSet

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,...,start_day,end_dayOfWeek,end_week,end_quarter,end_time,end_hour,end_minute,end_year,end_month,end_day
0,907649,396,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,187,Subscriber,...,27,3,35,3,08:43:00,8,43,2015,8,27
1,384043,636,2014-07-28,Market at 10th,67,2014-07-28,Washington at Kearny,46,417,Subscriber,...,28,0,31,3,22:17:00,22,17,2014,7,28
2,316176,334,2014-06-09,Market at Sansome,77,2014-06-09,2nd at Folsom,62,281,Subscriber,...,9,0,24,2,08:47:00,8,47,2014,6,9
3,618874,666,2015-01-26,San Francisco Caltrain 2 (330 Townsend),69,2015-01-26,Temporary Transbay Terminal (Howard at Beale),55,634,Subscriber,...,26,0,5,1,17:07:00,17,7,2015,1,26
4,910977,318,2015-08-29,Market at 10th,67,2015-08-29,Powell Street BART,39,607,Subscriber,...,29,5,35,3,15:14:00,15,14,2015,8,29
5,522083,337,2014-10-30,Townsend at 7th,65,2014-10-30,San Francisco Caltrain (Townsend at 4th),70,370,Subscriber,...,30,3,44,4,07:06:00,7,6,2014,10,30
6,880809,394,2015-08-07,2nd at South Park,64,2015-08-07,2nd at Townsend,61,443,Subscriber,...,7,4,32,3,17:17:00,17,17,2015,8,7
7,488938,766,2014-10-08,Powell at Post (Union Square),71,2014-10-08,San Francisco Caltrain (Townsend at 4th),70,485,Subscriber,...,8,2,41,4,14:23:00,14,23,2014,10,8
8,899522,531,2015-08-21,2nd at Folsom,62,2015-08-21,San Francisco Caltrain (Townsend at 4th),70,603,Subscriber,...,21,4,34,3,08:02:00,8,2,2015,8,21
9,737380,267,2015-04-23,Market at 4th,76,2015-04-23,Mechanics Plaza (Market at Battery),75,86,Customer,...,23,3,17,2,06:40:00,6,40,2015,4,23


In [38]:
testingSet

Unnamed: 0,id,start_date,start_dayOfWeek,start_week,start_quarter,start_time,start_hour,start_minute,start_station_name,start_station_id,...,end_week,end_quarter,end_time,end_hour,end_minute,end_station_name,end_station_id,bike_id,subscription_type,zip_code
0,504737,2014-10-18,5,42,4,11:25:00,11,25,Embarcadero at Sansome,60,...,42,4,12:12:00,12,12,Powell at Post (Union Square),71,426,Customer,77009
1,530846,2014-11-05,2,45,4,13:00:00,13,0,Embarcadero at Folsom,51,...,45,4,13:09:00,13,9,Broadway St at Battery St,82,454,Subscriber,94132
2,813140,2015-06-18,3,25,2,17:34:00,17,34,San Francisco Caltrain (Townsend at 4th),70,...,25,2,17:37:00,17,37,2nd at Townsend,61,370,Subscriber,94107
3,897674,2015-08-20,3,34,3,07:06:00,7,6,Civic Center BART (7th at Market),72,...,34,3,07:15:00,7,15,Townsend at 7th,65,451,Subscriber,94582
4,322830,2014-06-13,4,24,2,08:46:00,8,46,San Francisco Caltrain 2 (330 Townsend),69,...,24,2,08:57:00,8,57,Embarcadero at Folsom,51,603,Subscriber,95014
5,487841,2014-10-07,1,41,4,21:41:00,21,41,2nd at Townsend,61,...,41,4,21:47:00,21,47,Post at Kearny,47,478,Subscriber,94115
6,677808,2015-03-11,2,11,1,18:09:00,18,9,Market at 10th,67,...,11,1,18:22:00,18,22,San Francisco Caltrain (Townsend at 4th),70,505,Subscriber,94025
7,704449,2015-03-30,0,14,1,17:29:00,17,29,Embarcadero at Vallejo,48,...,14,1,17:35:00,17,35,Steuart at Market,74,356,Subscriber,94536
8,833587,2015-07-05,6,27,3,11:54:00,11,54,Market at 10th,67,...,27,3,12:00:00,12,0,Market at 4th,76,401,Subscriber,94102
9,420411,2014-08-22,4,34,3,13:30:00,13,30,Embarcadero at Vallejo,48,...,34,3,13:41:00,13,41,5th at Howard,57,363,Subscriber,94114


## Save processed data

In [39]:
trainingSet.to_csv('../CSVs/improved_trip_train.csv', index=False)

In [40]:
testingSet.to_csv('../CSVs/improved_trip_test.csv', index=False)

# ///////////////////////////////////////////////////////////////////////////////////////////////
# ///////////////////////////////////////////////////////////////////////////////////////////////
# ///////////////////////////////////////////////////////////////////////////////////////////////

In [41]:
stationDF = pd.read_csv('../CSVs/station.csv')
trainingSet = pd.read_csv('../CSVs/improved_trip_train.csv')
testingSet = pd.read_csv('../CSVs/improved_trip_test.csv')

## Feature Distancia

In [42]:
# Build the station-to-station distance table: one row per ordered pair of
# stations, including each station paired with itself (distance 0).
#
# The rows are collected in a plain list and the DataFrame is built once.
# Calling DataFrame.append inside the loop copies the whole frame on every
# iteration (quadratic) and DataFrame.append was removed in pandas 2.0.
#
# NOTE: cityblock() is applied to raw (lat, long) pairs, so "distance" is a
# Manhattan distance expressed in degrees, not in metres.
registros = []
for station, lat, lon in zip(stationDF.id, stationDF.lat, stationDF.long):
    for station2, lat2, lon2 in zip(stationDF.id, stationDF.lat, stationDF.long):
        registros.append({
            "start_station_id": station,
            "end_station_id": station2,
            "distance": scipy.spatial.distance.cityblock([lat, lon], [lat2, lon2])
        })

distancesDF = pd.DataFrame(registros, columns=["start_station_id", "end_station_id", "distance"])

# The station ids are the merge keys used against the trip tables, so force an integer dtype.
distancesDF['start_station_id'] = distancesDF.start_station_id.astype(int)
distancesDF['end_station_id'] = distancesDF.end_station_id.astype(int)
distancesDF

Unnamed: 0,start_station_id,end_station_id,distance
0,2,2,0.000000
1,2,3,0.013769
2,2,4,0.011136
3,2,5,0.010265
4,2,6,0.014697
5,2,7,0.018905
6,2,8,0.016384
7,2,9,0.026077
8,2,10,0.022446
9,2,11,0.022275


In [43]:
# Add the `distance` column to every training trip by matching its
# (start_station_id, end_station_id) pair against distancesDF.
# how='inner': a trip whose station pair is not present in distancesDF is dropped.
trainingSet = pd.merge(trainingSet,distancesDF,on =['start_station_id','end_station_id'],how = 'inner')


In [44]:
# Add the `distance` column to every test trip by matching its
# (start_station_id, end_station_id) pair against distancesDF.
# how='left' keeps every test row (and their original order): an inner join
# would silently drop any trip whose station pair is missing from distancesDF,
# and a dropped test row can never receive a prediction. Such rows get NaN in `distance`.
testingSet = pd.merge(testingSet, distancesDF, on=['start_station_id', 'end_station_id'], how='left')

## Feature Historico

In [45]:
trainingShort = trainingSet.loc[:,['id','duration','start_station_name','start_station_id','end_station_name','end_station_id']]

In [46]:
import math

# Distinct start / end station ids, in order of first appearance in the training data.
listaStart = list(pd.unique(trainingShort.start_station_id.values))
listaEnd = list(pd.unique(trainingShort.end_station_id.values))

# Mean trip duration for every (start, end) pair that actually occurs, computed
# in a single pass. The previous version re-filtered the whole frame once per
# candidate pair, i.e. len(listaStart) * len(listaEnd) full scans.
mediasPorPar = (
    trainingShort
    .groupby(['start_station_id', 'end_station_id'])['duration']
    .mean()
    .to_dict()
)

# listaHistorico holds [start_station_id, end_station_id, mean duration] rows,
# enumerated start-major / end-minor exactly as before; pairs with no trips
# (or with an undefined mean) are skipped.
listaHistorico = []
for i in listaStart:
    for j in listaEnd:
        historico = mediasPorPar.get((i, j), float('nan'))
        if (not(math.isnan(historico))):
            listaHistorico.append([i,j,historico])

listaHistorico

[[50, 60, 1254.993877147936],
 [50, 46, 2679.90099009901],
 [50, 62, 2177.8607350096713],
 [50, 55, 502.29606625258799],
 [50, 39, 1938.610909090909],
 [50, 70, 903.78094462540719],
 [50, 61, 662.55546241967033],
 [50, 75, 955.34452296819791],
 [50, 67, 2432.1732026143791],
 [50, 72, 7406.0220588235297],
 [50, 51, 1104.3922651933701],
 [50, 76, 1278.5260663507108],
 [50, 50, 6343.4900849858359],
 [50, 66, 3062.5027027027027],
 [50, 54, 788.76531671858777],
 [50, 64, 842.403162055336],
 [50, 65, 1070.6219047619047],
 [50, 45, 846.55012224938878],
 [50, 77, 834.04506699147385],
 [50, 82, 1168.3443396226414],
 [50, 74, 4673.119565217391],
 [50, 68, 968.3078101071975],
 [50, 71, 2404.4898477157362],
 [50, 56, 2053.1680000000001],
 [50, 49, 453.73065015479875],
 [50, 57, 925.07014028056108],
 [50, 63, 661.69466882067854],
 [50, 48, 1801.127619047619],
 [50, 69, 1076.7346760070052],
 [50, 47, 1400.2985074626865],
 [50, 42, 1246.1496062992126],
 [50, 41, 1677.0087336244542],
 [50, 73, 2730.59

In [47]:
# Split the [start, end, mean duration] rows of listaHistorico into one list
# per field and assemble them into a three-column lookup table.
starStationId = [fila[0] for fila in listaHistorico]
endStationId = [fila[1] for fila in listaHistorico]
historical = [fila[2] for fila in listaHistorico]

data = dict(
    start_station_id=starStationId,
    end_station_id=endStationId,
    historical=historical,
)

# `columns` fixes the column order independently of the dict's key order.
dfData = pd.DataFrame(data, columns=['start_station_id', 'end_station_id', 'historical'])
dfData

Unnamed: 0,start_station_id,end_station_id,historical
0,50,60,1254.993877
1,50,46,2679.900990
2,50,62,2177.860735
3,50,55,502.296066
4,50,39,1938.610909
5,50,70,903.780945
6,50,61,662.555462
7,50,75,955.344523
8,50,67,2432.173203
9,50,72,7406.022059


In [48]:
# Attach the per-route mean duration (`historical`) to every training trip:
#   1) look it up for each trip's (start_station_id, end_station_id) pair,
#   2) keep only the trip id and that value,
#   3) join it back onto the full training frame by trip id.
# Every join is inner, so a trip with no matching pair in dfData is dropped.
# NOTE(review): `historical` is averaged from the `duration` of these same
# training rows (presumably the prediction target) - check for target leakage.
trainingShort = pd.merge(trainingShort,dfData,on =['start_station_id','end_station_id'],how = 'inner')
trainingShort = trainingShort [['id', 'historical']]
trainingSet = pd.merge(trainingSet,trainingShort,on =['id'],how = 'inner')


In [49]:
# Store the mean duration as an integer. astype(int) truncates the fractional
# part instead of rounding (a mean of 1254.99 becomes 1254).
trainingSet['historical'] = trainingSet.historical.astype(int)
trainingSet

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,...,end_week,end_quarter,end_time,end_hour,end_minute,end_year,end_month,end_day,distance,historical
0,907649,396,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,187,Subscriber,...,35,3,08:43:00,8,43,2015,8,27,0.018409,1254
1,807007,421,2015-06-15,Harry Bridges Plaza (Ferry Building),50,2015-06-15,Embarcadero at Sansome,60,321,Subscriber,...,25,2,09:11:00,9,11,2015,6,15,0.018409,1254
2,343864,9624,2014-06-29,Harry Bridges Plaza (Ferry Building),50,2014-06-29,Embarcadero at Sansome,60,451,Customer,...,26,2,16:04:00,16,4,2014,6,29,0.018409,1254
3,451164,1670,2014-09-13,Harry Bridges Plaza (Ferry Building),50,2014-09-13,Embarcadero at Sansome,60,564,Customer,...,37,3,04:19:00,4,19,2014,9,13,0.018409,1254
4,101973,484,2013-11-20,Harry Bridges Plaza (Ferry Building),50,2013-11-20,Embarcadero at Sansome,60,416,Customer,...,47,4,09:45:00,9,45,2013,11,20,0.018409,1254
5,502639,363,2014-10-16,Harry Bridges Plaza (Ferry Building),50,2014-10-16,Embarcadero at Sansome,60,401,Subscriber,...,42,4,19:05:00,19,5,2014,10,16,0.018409,1254
6,842301,446,2015-07-11,Harry Bridges Plaza (Ferry Building),50,2015-07-11,Embarcadero at Sansome,60,579,Subscriber,...,28,3,10:46:00,10,46,2015,7,11,0.018409,1254
7,300655,385,2014-05-28,Harry Bridges Plaza (Ferry Building),50,2014-05-28,Embarcadero at Sansome,60,412,Subscriber,...,22,2,08:43:00,8,43,2014,5,28,0.018409,1254
8,781372,388,2015-05-26,Harry Bridges Plaza (Ferry Building),50,2015-05-26,Embarcadero at Sansome,60,587,Subscriber,...,22,2,17:01:00,17,1,2015,5,26,0.018409,1254
9,727968,517,2015-04-16,Harry Bridges Plaza (Ferry Building),50,2015-04-16,Embarcadero at Sansome,60,449,Subscriber,...,16,2,08:53:00,8,53,2015,4,16,0.018409,1254


## Agrego datos de TP1

In [50]:
trainingSet.loc [:,['id','duration','start_station_id', 'end_station_id','start_time', 'end_time','historical','distance' ]]

Unnamed: 0,id,duration,start_station_id,end_station_id,start_time,end_time,historical,distance
0,907649,396,50,60,08:36:00,08:43:00,1254,0.018409
1,807007,421,50,60,09:04:00,09:11:00,1254,0.018409
2,343864,9624,50,60,13:24:00,16:04:00,1254,0.018409
3,451164,1670,50,60,03:51:00,04:19:00,1254,0.018409
4,101973,484,50,60,09:37:00,09:45:00,1254,0.018409
5,502639,363,50,60,18:59:00,19:05:00,1254,0.018409
6,842301,446,50,60,10:39:00,10:46:00,1254,0.018409
7,300655,385,50,60,08:37:00,08:43:00,1254,0.018409
8,781372,388,50,60,16:54:00,17:01:00,1254,0.018409
9,727968,517,50,60,08:44:00,08:53:00,1254,0.018409


In [51]:
 dfTrip = pd.read_csv('../CSVs/trip.csv')
# dfTrip = dfTrip.loc[:,['id','duration']]
# dfTrip = dfTrip.rename(columns={'duration':'durationPosta'})
# dfScore = pd.merge(testingSet,dfTrip,on =['id'],how = 'inner')
# dfScore

In [52]:
dfTrip.corr()['duration']

id                 -0.003699
duration            1.000000
start_station_id   -0.007839
end_station_id     -0.006719
bike_id            -0.002303
Name: duration, dtype: float64

In [53]:
dfTrip.loc[dfTrip.id == 192809,:]

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,zip_code
139621,192809,14530,2/22/2014 15:20,Harry Bridges Plaza (Ferry Building),50,2/22/2014 19:22,Embarcadero at Sansome,60,364,Customer,


In [54]:
trainingSet

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,...,end_week,end_quarter,end_time,end_hour,end_minute,end_year,end_month,end_day,distance,historical
0,907649,396,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,187,Subscriber,...,35,3,08:43:00,8,43,2015,8,27,0.018409,1254
1,807007,421,2015-06-15,Harry Bridges Plaza (Ferry Building),50,2015-06-15,Embarcadero at Sansome,60,321,Subscriber,...,25,2,09:11:00,9,11,2015,6,15,0.018409,1254
2,343864,9624,2014-06-29,Harry Bridges Plaza (Ferry Building),50,2014-06-29,Embarcadero at Sansome,60,451,Customer,...,26,2,16:04:00,16,4,2014,6,29,0.018409,1254
3,451164,1670,2014-09-13,Harry Bridges Plaza (Ferry Building),50,2014-09-13,Embarcadero at Sansome,60,564,Customer,...,37,3,04:19:00,4,19,2014,9,13,0.018409,1254
4,101973,484,2013-11-20,Harry Bridges Plaza (Ferry Building),50,2013-11-20,Embarcadero at Sansome,60,416,Customer,...,47,4,09:45:00,9,45,2013,11,20,0.018409,1254
5,502639,363,2014-10-16,Harry Bridges Plaza (Ferry Building),50,2014-10-16,Embarcadero at Sansome,60,401,Subscriber,...,42,4,19:05:00,19,5,2014,10,16,0.018409,1254
6,842301,446,2015-07-11,Harry Bridges Plaza (Ferry Building),50,2015-07-11,Embarcadero at Sansome,60,579,Subscriber,...,28,3,10:46:00,10,46,2015,7,11,0.018409,1254
7,300655,385,2014-05-28,Harry Bridges Plaza (Ferry Building),50,2014-05-28,Embarcadero at Sansome,60,412,Subscriber,...,22,2,08:43:00,8,43,2014,5,28,0.018409,1254
8,781372,388,2015-05-26,Harry Bridges Plaza (Ferry Building),50,2015-05-26,Embarcadero at Sansome,60,587,Subscriber,...,22,2,17:01:00,17,1,2015,5,26,0.018409,1254
9,727968,517,2015-04-16,Harry Bridges Plaza (Ferry Building),50,2015-04-16,Embarcadero at Sansome,60,449,Subscriber,...,16,2,08:53:00,8,53,2015,4,16,0.018409,1254


In [55]:
# The "glorious" DataFrame from TP1 (original note was in Spanish).
# weekday encoding: 0 = Monday, 1 = Tuesday, ...
dfSF_Bay = pd.read_csv('../CSVs/dfSF_Bay.csv')
dfSF_Bay

Unnamed: 0,viajes,max_temperature_c,mean_temperature_c,min_temperature_c,max_dew_point_c,mean_dew_point_c,min_dew_point_c,max_humidity,mean_humidity,min_humidity,...,Fog-Rain,Normal,Rain,Rain-Thunderstorm,date,business_day,holiday,year,month,weekday
0,748,23.333333,20.000000,16.111111,16.111111,14.444444,13.333333,93,75,57,...,0,1,0,0,2013-08-29,1,0,2013,8,3
1,714,25.555556,20.555556,15.555556,16.111111,14.444444,13.333333,90,70,50,...,0,1,0,0,2013-08-30,1,0,2013,8,4
2,640,21.666667,17.777778,13.888889,13.888889,13.333333,12.222222,93,75,57,...,0,1,0,0,2013-08-31,0,0,2013,8,5
3,706,23.333333,18.888889,14.444444,15.555556,13.333333,11.666667,87,68,49,...,0,1,0,0,2013-09-01,0,0,2013,9,6
4,661,23.888889,20.555556,16.666667,16.111111,15.555556,14.444444,93,77,61,...,0,1,0,0,2013-09-02,0,1,2013,9,0
5,597,22.777778,19.444444,15.555556,15.000000,13.333333,10.555556,84,65,46,...,0,1,0,0,2013-09-03,1,0,2013,9,1
6,606,23.333333,20.000000,16.111111,15.000000,13.888889,13.333333,90,72,53,...,0,1,0,0,2013-09-04,1,0,2013,9,2
7,677,22.222222,18.888889,15.555556,13.888889,13.333333,12.222222,90,74,57,...,0,1,0,0,2013-09-05,1,0,2013,9,3
8,814,29.444444,21.666667,13.333333,13.888889,10.555556,7.222222,86,58,29,...,0,1,0,0,2013-09-06,1,0,2013,9,4
9,796,31.111111,22.777778,14.444444,17.777778,12.222222,7.777778,86,59,31,...,0,1,0,0,2013-09-07,0,0,2013,9,5


In [56]:
list(dfSF_Bay.columns.values)

['viajes',
 'max_temperature_c',
 'mean_temperature_c',
 'min_temperature_c',
 'max_dew_point_c',
 'mean_dew_point_c',
 'min_dew_point_c',
 'max_humidity',
 'mean_humidity',
 'min_humidity',
 'max_sea_level_pressure_cm',
 'mean_sea_level_pressure_cm',
 'min_sea_level_pressure_cm',
 'max_visibility_km',
 'mean_visibility_km',
 'min_visibility_km',
 'max_wind_Speed_kmh',
 'mean_wind_speed_kmh',
 'max_gust_speed_kmh',
 'precipitation_cm',
 'cloud_cover',
 'wind_dir_degrees',
 'Fog',
 'Fog-Rain',
 'Normal',
 'Rain',
 'Rain-Thunderstorm',
 'date',
 'business_day',
 'holiday',
 'year',
 'month',
 'weekday']

In [57]:
dfSF_Bay.corr()['viajes']

viajes                        1.000000
max_temperature_c             0.241620
mean_temperature_c            0.235992
min_temperature_c             0.179517
max_dew_point_c               0.148207
mean_dew_point_c              0.153746
min_dew_point_c               0.150832
max_humidity                 -0.062236
mean_humidity                -0.075510
min_humidity                 -0.071561
max_sea_level_pressure_cm    -0.121150
mean_sea_level_pressure_cm   -0.096169
min_sea_level_pressure_cm    -0.070266
max_visibility_km            -0.026463
mean_visibility_km            0.108799
min_visibility_km             0.108733
max_wind_Speed_kmh           -0.023622
mean_wind_speed_kmh           0.021424
max_gust_speed_kmh           -0.026345
precipitation_cm             -0.184764
cloud_cover                  -0.042077
wind_dir_degrees              0.160815
Fog                          -0.030221
Fog-Rain                     -0.049191
Normal                        0.129629
Rain                     

In [58]:
# Rows for February 2014 that fall on weekday 5 (Saturday, with 0 = Monday).
# The three conditions are combined into one mask over the full frame.
esFebrero = dfSF_Bay.month == 2
es2014 = dfSF_Bay.year == 2014
esSabado = dfSF_Bay.weekday == 5
dfSF_Bay.loc[esFebrero & es2014 & esSabado, :]

Unnamed: 0,viajes,max_temperature_c,mean_temperature_c,min_temperature_c,max_dew_point_c,mean_dew_point_c,min_dew_point_c,max_humidity,mean_humidity,min_humidity,...,Fog-Rain,Normal,Rain,Rain-Thunderstorm,date,business_day,holiday,year,month,weekday
156,383,15.0,10.555556,5.555556,6.111111,3.888889,2.222222,82,63,43,...,0,1,0,0,2014-02-01,0,0,2014,2,5
163,94,15.0,13.333333,11.666667,14.444444,12.777778,9.444444,100,92,83,...,0,0,1,0,2014-02-08,0,0,2014,2,5
170,415,16.666667,13.888889,11.111111,12.777778,11.111111,9.444444,93,83,72,...,0,0,1,0,2014-02-15,0,0,2014,2,5
177,466,18.888889,13.333333,7.777778,11.111111,8.888889,6.111111,93,78,63,...,0,1,0,0,2014-02-22,0,0,2014,2,5


In [59]:
dfSF_Bay.dtypes

viajes                          int64
max_temperature_c             float64
mean_temperature_c            float64
min_temperature_c             float64
max_dew_point_c               float64
mean_dew_point_c              float64
min_dew_point_c               float64
max_humidity                  float64
mean_humidity                 float64
min_humidity                  float64
max_sea_level_pressure_cm     float64
mean_sea_level_pressure_cm    float64
min_sea_level_pressure_cm     float64
max_visibility_km             float64
mean_visibility_km            float64
min_visibility_km             float64
max_wind_Speed_kmh            float64
mean_wind_speed_kmh           float64
max_gust_speed_kmh            float64
precipitation_cm              float64
cloud_cover                   float64
wind_dir_degrees              float64
Fog                             int64
Fog-Rain                        int64
Normal                          int64
Rain                            int64
Rain-Thunder

In [60]:
# Original note (Spanish): "I skip this filtering".
# NOTE(review): despite that note, the assignment below is live code, so the
# frame IS narrowed to the columns listed here - confirm which one is intended.
# Original note (Spanish): "I choose the best variables according to TP1".
dfSF_Bay = dfSF_Bay.loc [:,['viajes','max_temperature_c','min_temperature_c','max_humidity','max_sea_level_pressure_cm','precipitation_cm','Fog','Normal','Rain','business_day','holiday','year','month','weekday','date']]

In [61]:
dfSF_Bay.dtypes

viajes                         int64
max_temperature_c            float64
min_temperature_c            float64
max_humidity                 float64
max_sea_level_pressure_cm    float64
precipitation_cm             float64
Fog                            int64
Normal                         int64
Rain                           int64
business_day                   int64
holiday                        int64
year                           int64
month                          int64
weekday                        int64
date                          object
dtype: object

In [63]:
list(trainingSet.columns.values)

['id',
 'duration',
 'start_date',
 'start_station_name',
 'start_station_id',
 'end_date',
 'end_station_name',
 'end_station_id',
 'bike_id',
 'subscription_type',
 'zip_code',
 'start_dayOfWeek',
 'start_week',
 'start_quarter',
 'start_time',
 'start_hour',
 'start_minute',
 'start_year',
 'start_month',
 'start_day',
 'end_dayOfWeek',
 'end_week',
 'end_quarter',
 'end_time',
 'end_hour',
 'end_minute',
 'end_year',
 'end_month',
 'end_day',
 'distance',
 'historical']

In [36]:
# MOVED UP WHERE DATE  TREATMENT IS DONE

# `start_date` is parsed a single time and the three calendar fields are read
# from that one datetime Series.
fechaInicio = pd.to_datetime(trainingSet['start_date'])
trainingSet['year'] = fechaInicio.dt.year
trainingSet['month'] = fechaInicio.dt.month
trainingSet['weekday'] = fechaInicio.dt.weekday
trainingSet

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,...,end_hour,end_minute,end_year,end_month,end_day,distance,historical,year,month,weekday
0,907649,396,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,187,Subscriber,...,8,43,2015,8,27,0.018409,1254,2015,8,3
1,807007,421,2015-06-15,Harry Bridges Plaza (Ferry Building),50,2015-06-15,Embarcadero at Sansome,60,321,Subscriber,...,9,11,2015,6,15,0.018409,1254,2015,6,0
2,343864,9624,2014-06-29,Harry Bridges Plaza (Ferry Building),50,2014-06-29,Embarcadero at Sansome,60,451,Customer,...,16,4,2014,6,29,0.018409,1254,2014,6,6
3,451164,1670,2014-09-13,Harry Bridges Plaza (Ferry Building),50,2014-09-13,Embarcadero at Sansome,60,564,Customer,...,4,19,2014,9,13,0.018409,1254,2014,9,5
4,101973,484,2013-11-20,Harry Bridges Plaza (Ferry Building),50,2013-11-20,Embarcadero at Sansome,60,416,Customer,...,9,45,2013,11,20,0.018409,1254,2013,11,2
5,502639,363,2014-10-16,Harry Bridges Plaza (Ferry Building),50,2014-10-16,Embarcadero at Sansome,60,401,Subscriber,...,19,5,2014,10,16,0.018409,1254,2014,10,3
6,842301,446,2015-07-11,Harry Bridges Plaza (Ferry Building),50,2015-07-11,Embarcadero at Sansome,60,579,Subscriber,...,10,46,2015,7,11,0.018409,1254,2015,7,5
7,300655,385,2014-05-28,Harry Bridges Plaza (Ferry Building),50,2014-05-28,Embarcadero at Sansome,60,412,Subscriber,...,8,43,2014,5,28,0.018409,1254,2014,5,2
8,781372,388,2015-05-26,Harry Bridges Plaza (Ferry Building),50,2015-05-26,Embarcadero at Sansome,60,587,Subscriber,...,17,1,2015,5,26,0.018409,1254,2015,5,1
9,727968,517,2015-04-16,Harry Bridges Plaza (Ferry Building),50,2015-04-16,Embarcadero at Sansome,60,449,Subscriber,...,8,53,2015,4,16,0.018409,1254,2015,4,3


In [37]:
# Attach the dfSF_Bay columns to each trip by matching the trip's `start_date`
# with dfSF_Bay's `date`. how='inner': trips whose start date has no row in
# dfSF_Bay are dropped. Columns present in both frames (year, month, weekday)
# come out with pandas' default `_x` (left) / `_y` (right) suffixes.
trainingSet = pd.merge(trainingSet,dfSF_Bay,left_on ='start_date',right_on='date',how = 'inner')


In [None]:
trainingSet.dtypes

In [38]:
# Remove the trip-side copies (`_x`) of year/month/weekday left by the merge with dfSF_Bay.
# `axis` is passed by keyword: the positional form drop(labels, 1, ...) is rejected by pandas 2.x.
trainingSet.drop(['year_x', 'month_x', 'weekday_x'], axis=1, inplace=True)


In [39]:
# Remove the `date` merge key that came from dfSF_Bay.
# `axis` is passed by keyword: the positional form drop(labels, 1, ...) is rejected by pandas 2.x.
trainingSet.drop(['date'], axis=1, inplace=True)


In [40]:
trainingSet = trainingSet.rename(columns={'year_y':'year','month_y':'month','weekday_y': 'weekday'})

In [None]:
trainingSet.dtypes

## Save processed data

In [41]:
# Persist the training set, now including the engineered columns.
# NOTE(review): this overwrites '../CSVs/improved_trip_train.csv', the same file
# the earlier "Save processed data" step writes and the feature section reads
# back. Re-running the feature section after this save would start from a file
# that already contains those columns - consider a distinct output name.
trainingSet.to_csv('../CSVs/improved_trip_train.csv', index=False)

In [42]:
testingSet.to_csv('../CSVs/improved_trip_test.csv', index=False)

# //////////////////////////////////////////////////////////////////////////////

# //////////////////////////////////////////////////////////////////////////////

# //////////////////////////////////////////////////////////////////////////////

# Discretizacion y Normalizacion

## Discretizacion

In [43]:
trainingSet = pd.read_csv('../CSVs/improved_trip_train.csv')
testingSet = pd.read_csv('../CSVs/improved_trip_test.csv')

In [None]:
trainingSet.columns.values

In [44]:
def crearLista(listadoCompleto):
    """Return the distinct values of ``listadoCompleto`` as a sorted list.

    Parameters
    ----------
    listadoCompleto : iterable
        Values to de-duplicate (e.g. a DataFrame column).

    Returns
    -------
    list
        Each distinct value once, in ascending order. Empty input gives ``[]``.
    """
    elementos = list(listadoCompleto)
    try:
        # Hash-based de-duplication: linear instead of the quadratic
        # "scan the result list for every element" approach.
        valoresUnicos = set(elementos)
    except TypeError:
        # Unhashable elements (e.g. lists): fall back to the linear scan.
        valoresUnicos = []
        for valor in elementos:
            if valor not in valoresUnicos:
                valoresUnicos.append(valor)
    return sorted(valoresUnicos)

In [45]:
def discretizar(columna, listaReducida, nombre, df=None):
    """One-hot encode ``columna`` into 0/1 indicator columns of a DataFrame.

    For every value ``i`` in ``listaReducida`` a column named
    ``nombre + str(i)`` is added (or overwritten). Row ``j`` of that column is
    1 when the ``j``-th element of ``columna`` equals ``i`` and 0 otherwise.

    Parameters
    ----------
    columna : iterable
        Values to encode; its length must equal the number of rows of the
        target frame, since the indicators are assigned by position.
    listaReducida : iterable
        Category values that get an indicator column each.
    nombre : str
        Prefix of the generated column names.
    df : pandas.DataFrame, optional
        Frame that receives the new columns. Defaults to the notebook-level
        ``trainingSet``, which is what this function always wrote to before;
        pass another frame (e.g. the test set) to encode it instead.

    Returns
    -------
    None
        The target frame is modified in place.
    """
    if df is None:
        df = trainingSet
    # Re-wrap the values so the comparison is positional and independent of
    # whatever index `columna` carries.
    valores = pd.Series(list(columna))
    for i in listaReducida:
        # Vectorised equality replaces the per-row Python loop.
        df[nombre + str(i)] = (valores == i).values.astype(int)

In [46]:
# Sorted distinct station names, one list per trip endpoint; these drive the
# indicator columns created by discretizar().
listaStartName = crearLista(trainingSet['start_station_name'])
listaEndName = crearLista(trainingSet['end_station_name'])

In [47]:
discretizar(trainingSet.start_station_name,listaStartName,'start ')

In [None]:
trainingSet

In [None]:
trainingSet.loc[:,['start_station_name','start Harry Bridges Plaza (Ferry Building)']]

In [48]:
discretizar(trainingSet.end_station_name,listaEndName,'end ')

In [49]:
listaStartWeekDay = crearLista(trainingSet.start_dayOfWeek)

In [50]:
discretizar(trainingSet.start_dayOfWeek,listaStartWeekDay,'start_dayOfWeek_id')

In [51]:
listaEndWeekDay = crearLista(trainingSet.end_dayOfWeek)

In [52]:
discretizar(trainingSet.end_dayOfWeek,listaEndWeekDay,'end_dayOfWeek_id')

In [53]:
trainingSet.loc[:,['end_dayOfWeek_id3','end_dayOfWeek']]

Unnamed: 0,end_dayOfWeek_id3,end_dayOfWeek
0,1,3
1,1,3
2,1,3
3,1,3
4,1,3
5,1,3
6,1,3
7,1,3
8,1,3
9,1,3


In [54]:
listaSubscriptionType = crearLista(trainingSet.subscription_type)

In [55]:
discretizar(trainingSet.subscription_type,listaSubscriptionType,'subscription_type_')

In [56]:
trainingSet

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,...,start_dayOfWeek_id6,end_dayOfWeek_id0,end_dayOfWeek_id1,end_dayOfWeek_id2,end_dayOfWeek_id3,end_dayOfWeek_id4,end_dayOfWeek_id5,end_dayOfWeek_id6,subscription_type_Customer,subscription_type_Subscriber
0,907649,396,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,187,Subscriber,...,0,0,0,0,1,0,0,0,0,1
1,907702,287,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,334,Subscriber,...,0,0,0,0,1,0,0,0,0,1
2,908937,595,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,503,Customer,...,0,0,0,0,1,0,0,0,1,0
3,908953,439,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,538,Subscriber,...,0,0,0,0,1,0,0,0,0,1
4,909076,872,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,406,Customer,...,0,0,0,0,1,0,0,0,1,0
5,909006,3328,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,400,Customer,...,0,0,0,0,1,0,0,0,1,0
6,908192,767,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,187,Subscriber,...,0,0,0,0,1,0,0,0,0,1
7,908938,590,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,557,Customer,...,0,0,0,0,1,0,0,0,1,0
8,909005,3360,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,558,Customer,...,0,0,0,0,1,0,0,0,1,0
9,907428,427,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,331,Subscriber,...,0,0,0,0,1,0,0,0,0,1


In [57]:
# Show the zip_code values for which str.isnumeric() is False (e.g. 'nil').
trainingSet.loc[trainingSet.zip_code.str.isnumeric() == False, ['zip_code']]

Unnamed: 0,zip_code
54,nil
99,nil
148,nil
251,nil
375,nil
432,nil
700,nil
701,nil
736,nil
807,nil


In [58]:
listaStartYear = crearLista(trainingSet.start_year)

In [59]:
discretizar(trainingSet.start_year,listaStartYear,'start_year_')

In [60]:
listaEndYear = crearLista(trainingSet.end_year)
discretizar(trainingSet.end_year,listaEndYear,'end_year_')

In [62]:
listaStartMonth = crearLista(trainingSet.start_month)
discretizar(trainingSet.start_month,listaStartMonth,'start_month_')

In [64]:
listaEndMonth = crearLista(trainingSet.end_month)
discretizar(trainingSet.end_month,listaEndMonth,'end_month_')

In [66]:
listaStartDay = crearLista(trainingSet.start_day)
discretizar(trainingSet.start_day,listaStartDay,'start_day_')

In [68]:
listaEndDay = crearLista(trainingSet.end_day)
discretizar(trainingSet.end_day,listaEndDay,'end_day_')

In [70]:
listaStartQuarter = crearLista(trainingSet.start_quarter)
discretizar(trainingSet.start_quarter,listaStartQuarter,'start_quarter_')

In [72]:
listaEndQuarter = crearLista(trainingSet.end_quarter)
discretizar(trainingSet.end_quarter,listaEndQuarter,'end_quarter_')

In [74]:
listaStartHour = crearLista(trainingSet.start_hour)
discretizar(trainingSet.start_hour,listaStartHour,'start_hour_')

In [76]:
# One indicator column per distinct end hour.
listaEndHour = crearLista(trainingSet.end_hour)
# NOTE(review): this prefix has no trailing underscore, unlike 'start_hour_', so the
# columns are named 'end_hour<h>' rather than 'end_hour_<h>'. Confirm nothing
# downstream relies on the current names before making them consistent.
discretizar(trainingSet.end_hour,listaEndHour,'end_hour')

In [None]:
trainingSet

In [None]:
trainingSet.columns.values

In [78]:
# Sometimes this takes a while, sometimes it does not.
# Remove the listed date/time/name/category columns from the frame.
columnasOriginales = [
    'start_date', 'start_dayOfWeek', 'start_quarter', 'start_time', 'start_station_name',
    'end_date', 'end_dayOfWeek', 'end_quarter', 'end_time', 'end_station_name',
    'subscription_type', 'year', 'month', 'weekday', 'start_hour', 'end_hour',
]
trainingSet = trainingSet.drop(labels=columnasOriginales, axis=1)

In [80]:
trainingSet.columns.values

array(['id', 'duration', 'start_station_id', 'end_station_id', 'bike_id',
       'zip_code', 'start_week', 'start_minute', 'start_year',
       'start_month', 'start_day', 'end_week', 'end_minute', 'end_year',
       'end_month', 'end_day', 'distance', 'historical', 'viajes',
       'max_temperature_c', 'min_temperature_c', 'max_humidity',
       'max_sea_level_pressure_cm', 'precipitation_cm', 'Fog', 'Normal',
       'Rain', 'business_day', 'holiday', 'start 2nd at Folsom',
       'start 2nd at South Park', 'start 2nd at Townsend',
       'start 5th at Howard', 'start Adobe on Almaden',
       'start Arena Green / SAP Center', 'start Beale at Market',
       'start Broadway St at Battery St', 'start Broadway at Main',
       'start California Ave Caltrain Station',
       'start Castro Street and El Camino Real',
       'start Civic Center BART (7th at Market)', 'start Clay at Battery',
       'start Commercial at Montgomery', 'start Cowper at University',
       'start Davis at Jacks

In [None]:
trainingSet

## Save processed data

In [81]:
trainingSet.to_csv('../CSVs/improved_trip_train.csv', index=False)

## Normalización

In [82]:
# Scale each continuous column by its own L2 norm (column / ||column||_2).
# normalize() expects a 2-D array: a single-column frame with axis=0 gives the
# same values as the earlier 1-D call, which current scikit-learn rejects.
# NOTE(review): the outlier section below reloads the CSV written before this
# cell, so these normalised values are not persisted - confirm that is intended.
columnasANormalizar = [
    'duration',
    'max_temperature_c',
    'min_temperature_c',
    'max_humidity',
    'max_sea_level_pressure_cm',
    'precipitation_cm',
]
for columna in columnasANormalizar:
    trainingSet[columna] = preprocessing.normalize(trainingSet[[columna]], axis=0)[:, 0]



In [83]:
trainingSet.loc[:,['duration','max_temperature_c','min_temperature_c','max_humidity','max_sea_level_pressure_cm','precipitation_cm']]

Unnamed: 0,duration,max_temperature_c,min_temperature_c,max_humidity,max_sea_level_pressure_cm,precipitation_cm
0,0.000022,0.001853,0.001699,0.001317,0.001348,0.000000
1,0.000016,0.001853,0.001699,0.001317,0.001348,0.000000
2,0.000033,0.001853,0.001699,0.001317,0.001348,0.000000
3,0.000024,0.001853,0.001699,0.001317,0.001348,0.000000
4,0.000048,0.001853,0.001699,0.001317,0.001348,0.000000
5,0.000184,0.001853,0.001699,0.001317,0.001348,0.000000
6,0.000042,0.001853,0.001699,0.001317,0.001348,0.000000
7,0.000033,0.001853,0.001699,0.001317,0.001348,0.000000
8,0.000186,0.001853,0.001699,0.001317,0.001348,0.000000
9,0.000024,0.001853,0.001699,0.001317,0.001348,0.000000


# ///////////////////////////////////////////////////////////////////////////////////////////////
# ///////////////////////////////////////////////////////////////////////////////////////////////
# ///////////////////////////////////////////////////////////////////////////////////////////////
# CORRER A PARTIR DE ACÁ

# Filtrado de outliers

In [85]:
trainingSet = pd.read_csv('../CSVs/improved_trip_train.csv')

In [None]:
trainingSet

In [None]:
trainingSet.columns.values

In [None]:
trainingSet.duration.describe()

In [None]:
trainingSet.loc[:,['duration','start_year','start_month','start_day','end_year','end_month','end_day']].sort_values(by='duration',ascending = False).head(200)

In [None]:
# The 100 largest trip durations, in descending order, for the scatter plot.
duration = list(trainingSet['duration'].sort_values(ascending=False).iloc[:100])

In [None]:
import os

import plotly
import numpy as np
import plotly.graph_objs as go
import plotly.plotly as py
# Wildcard import kept because other cells may rely on names it exports (e.g. `Figure`).
from plotly.graph_objs import *
plotly.offline.init_notebook_mode()

# Credentials must never be hard-coded in the notebook. They are read from the
# environment and only written to the plotly credentials file when both are set.
# The plotting in this notebook uses plotly.offline, which needs no credentials.
# Any key that was previously committed here should be considered leaked and rotated.
_plotlyUsername = os.environ.get('PLOTLY_USERNAME')
_plotlyApiKey = os.environ.get('PLOTLY_API_KEY')
if _plotlyUsername and _plotlyApiKey:
    plotly.tools.set_credentials_file(username=_plotlyUsername, api_key=_plotlyApiKey)

plotly.offline.init_notebook_mode(connected=True)

In [None]:
# Marker-only scatter of the values in `duration`, rendered inline through plotly.offline.
trace0 = go.Scatter(y=duration, mode='markers', name='h=0.1')
data = [trace0]
fig = go.Figure(data=data)
plotly.offline.iplot(fig, filename='styled-scatter')

In [None]:
# Outlier cut: keep only trips whose duration is below 1,000,000
# (units as stored in `duration`; presumably seconds - confirm).
trainingSet = trainingSet[trainingSet['duration'] < 1000000]

In [None]:
trainingSet

In [None]:
trainingSet[trainingSet['duration'] > 1000000]

## Save processed data

In [None]:
trainingSet.to_csv('../CSVs/improved_trip_train.csv', index=False)