# TP2: Machine Learning

### Imports

In [1]:
import pandas as pd
from datetime import datetime
import scipy.spatial
from sklearn import preprocessing

### Data loading

In [2]:
print "*****Data loading*****"
print "loading station csv"
stationDF = pd.read_csv('../CSVs/station.csv')
print "loading trip train csv"
trainingSet = pd.read_csv('../CSVs/trip_train.csv')
print "loading trip test csv"
testingSet = pd.read_csv('../CSVs/trip_test.csv')

# GLORIOSO DF DEL TP1
# (0 = Monday, 1 = Tuesday...)
print "loading dfSF_Bay csv"
dfSF_Bay = pd.read_csv('../CSVs/dfSF_Bay.csv')

*****Data loading*****
loading station csv
loading trip train csv
loading trip test csv
loading dfSF_Bay csv


## Basic data analysis

In [3]:
print "stationDF.shape: ", stationDF.shape
print "trainingSet.shape: ", trainingSet.shape
print "testingSet.shape: ", testingSet.shape
print "dfSF_Bay.shape: ", dfSF_Bay.shape

stationDF.shape:  (70, 7)
trainingSet.shape:  (549961, 11)
testingSet.shape:  (119998, 10)
dfSF_Bay.shape:  (733, 33)


### Distancias entre estaciones

In [4]:
print "*****Working on station distances*****"
# Create new temporary dataframe with distances
distancesDF = pd.DataFrame(columns=["start_station_id", "end_station_id", "distance"])

# Calculate distances between stations
for station, lat, lon in zip(stationDF.id, stationDF.lat, stationDF.long):
    for station2, lat2, lon2 in zip(stationDF.id, stationDF.lat, stationDF.long):
        distancesDF = distancesDF.append({
            "start_station_id": station,
            "end_station_id": station2,
            "distance": scipy.spatial.distance.cityblock([lat, lon], [lat2, lon2])
        }, ignore_index=True)

distancesDF['start_station_id'] = distancesDF.start_station_id.astype(int)
distancesDF['end_station_id'] = distancesDF.end_station_id.astype(int)

# Merge this new data to training and testing sets
trainingSet = pd.merge(trainingSet,distancesDF,on =['start_station_id','end_station_id'],how = 'inner')
testingSet = pd.merge(testingSet,distancesDF,on =['start_station_id','end_station_id'],how = 'inner')

# delete auxiliary distances df
del distancesDF

*****Working on station distances*****


### Process date & time data

In [5]:
print "*****Converting necessary data to dateTime*****"
# Convert necessary data to dateTime
dfSF_Bay['date'] = pd.to_datetime(dfSF_Bay.date)

trainingSet['start_date'] = pd.to_datetime(trainingSet.start_date)
trainingSet['end_date'] = pd.to_datetime(trainingSet.end_date)

testingSet['start_date'] = pd.to_datetime(testingSet.start_date)
testingSet['end_date'] = pd.to_datetime(testingSet.end_date)

# Create new features related to date & time based on the unique 'date' feature
# Work with training set
trainingSet['start_dayOfWeek'] = trainingSet.start_date.dt.dayofweek
trainingSet['start_week'] = trainingSet.start_date.dt.week
trainingSet['start_quarter'] = trainingSet.start_date.dt.quarter
trainingSet['start_time'] = trainingSet.start_date.dt.time
trainingSet['start_hour'] = trainingSet.start_date.dt.hour
trainingSet['start_minute'] = trainingSet.start_date.dt.minute
trainingSet['start_year'] = trainingSet.start_date.dt.year
trainingSet['start_month'] = trainingSet.start_date.dt.month
trainingSet['start_day'] = trainingSet.start_date.dt.day
trainingSet['start_date'] = trainingSet.start_date.dt.date

trainingSet['end_dayOfWeek'] = trainingSet.end_date.dt.dayofweek
trainingSet['end_week'] = trainingSet.end_date.dt.week
trainingSet['end_quarter'] = trainingSet.end_date.dt.quarter
trainingSet['end_time'] = trainingSet.end_date.dt.time
trainingSet['end_hour'] = trainingSet.end_date.dt.hour
trainingSet['end_minute'] = trainingSet.end_date.dt.minute
trainingSet['end_year'] = trainingSet.end_date.dt.year
trainingSet['end_month'] = trainingSet.end_date.dt.month
trainingSet['end_day'] = trainingSet.end_date.dt.day
trainingSet['end_date'] = trainingSet.end_date.dt.date

trainingSet['year'] = pd.to_datetime(trainingSet['start_date']).dt.year
trainingSet['month'] = pd.to_datetime(trainingSet['start_date']).dt.month
trainingSet['weekday'] = pd.to_datetime(trainingSet['start_date']).dt.weekday

# Work with testing set
testingSet['start_dayOfWeek'] = testingSet.start_date.dt.dayofweek
testingSet['start_week'] = testingSet.start_date.dt.week
testingSet['start_quarter'] = testingSet.start_date.dt.quarter
testingSet['start_time'] = testingSet.start_date.dt.time
testingSet['start_hour'] = testingSet.start_date.dt.hour
testingSet['start_minute'] = testingSet.start_date.dt.minute
testingSet['start_year'] = testingSet.start_date.dt.year
testingSet['start_month'] = testingSet.start_date.dt.month
testingSet['start_day'] = testingSet.start_date.dt.day
testingSet['start_date'] = testingSet.start_date.dt.date

testingSet['end_dayOfWeek'] = testingSet.end_date.dt.dayofweek
testingSet['end_week'] = testingSet.end_date.dt.week
testingSet['end_quarter'] =testingSet.end_date.dt.quarter
testingSet['end_time'] = testingSet.end_date.dt.time
testingSet['end_hour'] = testingSet.end_date.dt.hour
testingSet['end_minute'] = testingSet.end_date.dt.minute
testingSet['end_year'] = testingSet.end_date.dt.year
testingSet['end_month'] = testingSet.end_date.dt.month
testingSet['end_day'] = testingSet.end_date.dt.day
testingSet['end_date'] = testingSet.end_date.dt.date

testingSet['year'] = pd.to_datetime(testingSet['start_date']).dt.year
testingSet['month'] = pd.to_datetime(testingSet['start_date']).dt.month
testingSet['weekday'] = pd.to_datetime(testingSet['start_date']).dt.weekday

*****Converting necessary data to dateTime*****


In [None]:
print "trainingSet cols values", list(trainingSet.columns.values)

In [None]:
print "testingSet cols values", list(testingSet.columns.values)

In [6]:
print "filtrando por duracion..."
trainingSet = trainingSet.loc[trainingSet.duration < 10000,:]

filtrando por duracion...


## Feature Historico

In [7]:
print "*****Working on historic feature*****"
print "***Calculating historic feature***"
import math
listaStart = []
listaEnd = []
for i in list(trainingSet.start_station_id.values):
    if i not in listaStart:
        listaStart.append(i)
for i in list(trainingSet.end_station_id.values):
    if i not in listaEnd:
        listaEnd.append(i)
listaHistorico = []
for i in listaStart:
    for j in listaEnd:
        df = trainingSet[(trainingSet['start_station_id'] == i) & (trainingSet['end_station_id'] == j)]
        historico = df.duration.mean()
        if (not(math.isnan(historico))):
            listaHistorico.append([i,j,historico])
        
listaHistorico

*****Working on historic feature*****
***Calculating historic feature***


[[50, 60, 997.6159711075442],
 [50, 46, 1447.6062176165804],
 [50, 62, 584.3501945525292],
 [50, 55, 367.1315240083507],
 [50, 39, 1244.8688212927757],
 [50, 70, 858.1191350469196],
 [50, 61, 642.3745454545455],
 [50, 75, 552.8974820143885],
 [50, 67, 1317.0813559322035],
 [50, 72, 1420.1782945736434],
 [50, 51, 584.8457142857143],
 [50, 76, 944.072463768116],
 [50, 50, 3152.187165775401],
 [50, 66, 2004.3636363636363],
 [50, 54, 608.0723270440252],
 [50, 64, 593.7965653896962],
 [50, 65, 1012.4814108674929],
 [50, 45, 523.9376558603491],
 [50, 77, 689.5460122699386],
 [50, 82, 646.9190476190477],
 [50, 74, 1993.2432432432433],
 [50, 68, 695.4463452566097],
 [50, 71, 1655.6514745308311],
 [50, 56, 839.5565217391304],
 [50, 49, 453.73065015479875],
 [50, 57, 843.6209677419355],
 [50, 63, 465.7516233766234],
 [50, 48, 1426.6996086105676],
 [50, 69, 803.9622475856014],
 [50, 47, 738.7462311557789],
 [50, 42, 544.7235772357724],
 [50, 41, 952.7511312217194],
 [50, 73, 1767.9779735682819],


In [8]:
starStationId = []
endStationId = []
historical = []
for x in listaHistorico:
    starStationId.append(x[0])
    endStationId.append(x[1])
    historical.append(x[2])

data = {
    'start_station_id' : starStationId,
    'end_station_id' : endStationId,
    'historical' : historical,
}

dfData = pd.DataFrame(data,columns = ['start_station_id','end_station_id','historical'])
dfData

Unnamed: 0,start_station_id,end_station_id,historical
0,50,60,997.615971
1,50,46,1447.606218
2,50,62,584.350195
3,50,55,367.131524
4,50,39,1244.868821
5,50,70,858.119135
6,50,61,642.374545
7,50,75,552.897482
8,50,67,1317.081356
9,50,72,1420.178295


In [9]:
print "**Merging historic feature**"
# Merge this new data to training and testing dfs
# Training
trainingSet = pd.merge(trainingSet,dfData,on=['start_station_id', 'end_station_id'],how = 'inner') 

trainingSet['historical'] = trainingSet.historical.astype(int)

# Testing
testingSet = pd.merge(testingSet, dfData, on=['start_station_id', 'end_station_id'], how='inner')

testingSet['historical'] = testingSet.historical.astype(int)

# delete auxiliar dataframe
del dfData

**Merging historic feature**


In [10]:
print "trainingSet.shape: ", trainingSet.shape
print "testingSet.shape: ", testingSet.shape

trainingSet.shape:  (542525, 34)
testingSet.shape:  (119979, 33)


The difference in the shapes is due to the duration feature used in the training set, which was used to calculate the historical feature.

### Trabajamos con dfSF_Bay

In [11]:
# Convert necessary data to dateTime
dfSF_Bay['date'] = pd.to_datetime(dfSF_Bay.date)

trainingSet['start_date'] = pd.to_datetime(trainingSet.start_date)
trainingSet['end_date'] = pd.to_datetime(trainingSet.end_date)

testingSet['start_date'] = pd.to_datetime(testingSet.start_date)
testingSet['end_date'] = pd.to_datetime(testingSet.end_date)

In [12]:
print "***Merging dfSF_Bay data***"
# Merge trainingSet with new data

testingSet = pd.merge(testingSet,dfSF_Bay,left_on ='start_date',right_on='date',how = 'inner')
trainingSet = pd.merge(trainingSet,dfSF_Bay,left_on ='start_date',right_on='date',how = 'inner')

***Merging dfSF_Bay data***


In [13]:
testingSet

Unnamed: 0,id,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,zip_code,...,Fog-Rain,Normal,Rain,Rain-Thunderstorm,date,business_day,holiday,year_y,month_y,weekday_y
0,504737,2014-10-18,Embarcadero at Sansome,60,2014-10-18,Powell at Post (Union Square),71,426,Customer,77009,...,0,1,0,0,2014-10-18,0,0,2014,10,5
1,505019,2014-10-18,San Francisco Caltrain 2 (330 Townsend),69,2014-10-18,Embarcadero at Folsom,51,376,Subscriber,94705,...,0,1,0,0,2014-10-18,0,0,2014,10,5
2,504972,2014-10-18,Embarcadero at Vallejo,48,2014-10-18,Steuart at Market,74,559,Customer,20740,...,0,1,0,0,2014-10-18,0,0,2014,10,5
3,504943,2014-10-18,Powell Street BART,39,2014-10-18,Market at 10th,67,553,Customer,80211,...,0,1,0,0,2014-10-18,0,0,2014,10,5
4,504779,2014-10-18,Powell at Post (Union Square),71,2014-10-18,San Francisco Caltrain 2 (330 Townsend),69,623,Subscriber,94107,...,0,1,0,0,2014-10-18,0,0,2014,10,5
5,504906,2014-10-18,Embarcadero at Sansome,60,2014-10-18,Embarcadero at Sansome,60,477,Customer,92101,...,0,1,0,0,2014-10-18,0,0,2014,10,5
6,505015,2014-10-18,Embarcadero at Sansome,60,2014-10-18,Embarcadero at Sansome,60,352,Customer,90024,...,0,1,0,0,2014-10-18,0,0,2014,10,5
7,504853,2014-10-18,Harry Bridges Plaza (Ferry Building),50,2014-10-18,Embarcadero at Sansome,60,586,Subscriber,94606,...,0,1,0,0,2014-10-18,0,0,2014,10,5
8,504990,2014-10-18,Harry Bridges Plaza (Ferry Building),50,2014-10-18,Embarcadero at Sansome,60,442,Customer,94022,...,0,1,0,0,2014-10-18,0,0,2014,10,5
9,504800,2014-10-18,Harry Bridges Plaza (Ferry Building),50,2014-10-18,Embarcadero at Sansome,60,362,Customer,94553,...,0,1,0,0,2014-10-18,0,0,2014,10,5


In [14]:
trainingSet

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,...,Fog-Rain,Normal,Rain,Rain-Thunderstorm,date,business_day,holiday,year_y,month_y,weekday_y
0,907649,396,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,187,Subscriber,...,0,1,0,0,2015-08-27,1,0,2015,8,3
1,907702,287,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,334,Subscriber,...,0,1,0,0,2015-08-27,1,0,2015,8,3
2,908937,595,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,503,Customer,...,0,1,0,0,2015-08-27,1,0,2015,8,3
3,908953,439,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,538,Subscriber,...,0,1,0,0,2015-08-27,1,0,2015,8,3
4,909076,872,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,406,Customer,...,0,1,0,0,2015-08-27,1,0,2015,8,3
5,909006,3328,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,400,Customer,...,0,1,0,0,2015-08-27,1,0,2015,8,3
6,908192,767,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,187,Subscriber,...,0,1,0,0,2015-08-27,1,0,2015,8,3
7,908938,590,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,557,Customer,...,0,1,0,0,2015-08-27,1,0,2015,8,3
8,909005,3360,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,558,Customer,...,0,1,0,0,2015-08-27,1,0,2015,8,3
9,907428,427,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,331,Subscriber,...,0,1,0,0,2015-08-27,1,0,2015,8,3


# Discretizacion y Normalizacion

## Discretizacion

In [23]:
(trainingSet.dtypes)

id                              int64
duration                        int64
start_station_id                int64
end_station_id                  int64
bike_id                         int64
distance                      float64
start_week                      int64
start_minute                    int64
end_week                        int64
end_minute                      int64
year_x                          int64
month_x                         int64
weekday_x                       int64
historical                      int64
viajes                          int64
max_temperature_c             float64
mean_temperature_c            float64
min_temperature_c             float64
max_dew_point_c               float64
mean_dew_point_c              float64
min_dew_point_c               float64
max_humidity                  float64
mean_humidity                 float64
min_humidity                  float64
max_sea_level_pressure_cm     float64
mean_sea_level_pressure_cm    float64
min_sea_leve

In [24]:
list(testingSet.columns.values)

['id',
 'start_station_id',
 'end_station_id',
 'bike_id',
 'distance',
 'start_week',
 'start_minute',
 'end_week',
 'end_minute',
 'year_x',
 'month_x',
 'weekday_x',
 'historical',
 'viajes',
 'max_temperature_c',
 'mean_temperature_c',
 'min_temperature_c',
 'max_dew_point_c',
 'mean_dew_point_c',
 'min_dew_point_c',
 'max_humidity',
 'mean_humidity',
 'min_humidity',
 'max_sea_level_pressure_cm',
 'mean_sea_level_pressure_cm',
 'min_sea_level_pressure_cm',
 'max_visibility_km',
 'mean_visibility_km',
 'min_visibility_km',
 'max_wind_Speed_kmh',
 'mean_wind_speed_kmh',
 'max_gust_speed_kmh',
 'precipitation_cm',
 'cloud_cover',
 'wind_dir_degrees',
 'Fog',
 'Fog-Rain',
 'Normal',
 'Rain',
 'Rain-Thunderstorm',
 'date',
 'business_day',
 'holiday',
 'year_y',
 'month_y',
 'weekday_y',
 'start_dayOfWeek_id0',
 'start_dayOfWeek_id1',
 'start_dayOfWeek_id2',
 'start_dayOfWeek_id3',
 'start_dayOfWeek_id4',
 'start_dayOfWeek_id5',
 'start_dayOfWeek_id6',
 'end_dayOfWeek_id0',
 'end_dayOf

In [17]:
print "*****Discretizacion y Normalizacion*****"
print "*****Discretizacion*****"

*****Discretizacion y Normalizacion*****
*****Discretizacion*****


In [18]:
def crearLista (listadoCompleto):
    listaReducida = []
    for i in listadoCompleto:
        if i not in listaReducida:
            listaReducida.append(i)
    listaReducida.sort()
    return listaReducida

In [19]:
def discretizar(columna,nombre, df):
    listaReducida = crearLista(columna)
    v = list(range(len(columna)))
    listaCompleta = list(columna)
    for i in listaReducida:
        for j in range(len(listaCompleta)):
            if(listaCompleta[j] == i):
                v[j] = 1
            else:
                v[j] = 0
        df[nombre+str(i)] = v

In [20]:
# print "Discretizando start_station_name..."
# discretizar(trainingSet.start_station_name,'start ', trainingSet)
# discretizar(testingSet.start_station_name,'start ', testingSet)

# print "Discretizando end_station_name..."
# discretizar(trainingSet.end_station_name,'end ', trainingSet)
# discretizar(testingSet.end_station_name,'end ', testingSet)

print "Discretizando start_dayOfWeek..."
discretizar(trainingSet.start_dayOfWeek,'start_dayOfWeek_id', trainingSet)
discretizar(testingSet.start_dayOfWeek,'start_dayOfWeek_id', testingSet)

print "Discretizando end_dayOfWeek..."
discretizar(trainingSet.end_dayOfWeek,'end_dayOfWeek_id', trainingSet)
discretizar(testingSet.end_dayOfWeek,'end_dayOfWeek_id', testingSet)

print "Discretizando subscription_type_..."
discretizar(trainingSet.subscription_type,'subscription_type_', trainingSet)
discretizar(testingSet.subscription_type,'subscription_type_', testingSet)

print "Discretizando start_year..."
discretizar(trainingSet.start_year,'start_year_', trainingSet)
discretizar(testingSet.start_year,'start_year_', testingSet)

print "Discretizando end_year_..."
discretizar(trainingSet.end_year,'end_year_', trainingSet)
discretizar(testingSet.end_year,'end_year_', testingSet)

print "Discretizando start_month..."
discretizar(trainingSet.start_month,'start_month_', trainingSet)
discretizar(testingSet.start_month,'start_month_', testingSet)

print "Discretizando end_month..."
discretizar(trainingSet.end_month,'end_month_', trainingSet)
discretizar(testingSet.end_month,'end_month_', testingSet)

print "Discretizando start_day..."
discretizar(trainingSet.start_day,'start_day_', trainingSet)
discretizar(testingSet.start_day,'start_day_', testingSet)

print "Discretizando end_day..."
discretizar(trainingSet.end_day,'end_day_', trainingSet)
discretizar(testingSet.end_day,'end_day_', testingSet)

print "Discretizando start_quarter..."
discretizar(trainingSet.start_quarter,'start_quarter_', trainingSet)
discretizar(testingSet.start_quarter,'start_quarter_', testingSet)

print "Discretizando end_quarter..."
discretizar(trainingSet.end_quarter,'end_quarter_', trainingSet)
discretizar(testingSet.end_quarter,'end_quarter_', testingSet)

print "Discretizando start_hour..."
discretizar(trainingSet.start_hour,'start_hour_', trainingSet)
discretizar(testingSet.start_hour,'start_hour_', testingSet)

print "Discretizando end_hour..."
discretizar(trainingSet.end_hour,'end_hour', trainingSet)
discretizar(testingSet.end_hour,'end_hour', testingSet)

Discretizando start_dayOfWeek...
Discretizando end_dayOfWeek...
Discretizando subscription_type_...
Discretizando start_year...
Discretizando end_year_...
Discretizando start_month...
Discretizando end_month...
Discretizando start_day...
Discretizando end_day...
Discretizando start_quarter...
Discretizando end_quarter...
Discretizando start_hour...
Discretizando end_hour...


In [21]:
print "Dropping trash columns..."
trainingSet = trainingSet.drop(labels = ['start_date', 
                                         'end_station_name',
                                         'start_station_name',
                                         'end_date',
                                         'subscription_type',
                                         'zip_code',
                                         'start_time',
                                         'end_time',
                                         'start_dayOfWeek',
                                         'end_dayOfWeek',
                                         'start_year',
                                         'end_year',
                                         'start_month',
                                         'end_month',
                                         'start_day',
                                         'end_day',
                                         'start_quarter',
                                         'end_quarter',
                                         'start_hour',
                                         'end_hour'
                                        ],axis = 1)

testingSet = testingSet.drop(labels = ['start_date', 
                                         'end_station_name',
                                         'start_station_name',
                                         'end_date',
                                         'subscription_type',
                                         'zip_code',
                                         'start_time',
                                         'end_time',
                                         'start_dayOfWeek',
                                         'end_dayOfWeek',
                                         'start_year',
                                         'end_year',
                                         'start_month',
                                         'end_month',
                                         'start_day',
                                         'end_day',
                                         'start_quarter',
                                         'end_quarter',
                                         'start_hour',
                                         'end_hour'
                                        ],axis = 1)

Dropping trash columns...


In [22]:
print "trainingSet.shape: ", trainingSet.shape
print "testingSet.shape: ", testingSet.shape

trainingSet.shape:  (542525, 211)
testingSet.shape:  (119979, 210)


In [None]:
# THIS CELL SHOULD NOT BE COPIED. THIS CELL IS ALLOWED TO BE PRESENT ONLY ONCE IN ALL THE NOTEBOOK
print "Saving temp csvs..."
trainingSet.to_csv('../CSVs/tempTraining.csv')
testingSet.to_csv('../CSVs/tempTesting.csv')

In [None]:
trainingSet.to_pickle('../CSVs/finalTraining')
testingSet.to_pickle('../CSVs/finalTesting')

In [None]:
trainingSet = pd.read_csv('../CSVs/tempTraining.csv')
testingSet = pd.read_csv('../CSVs/tempTesting.csv')

# Data filtering

In [25]:
trainingSet.drop(['Unnamed: 0'],1,inplace=True)
testingSet.drop(['Unnamed: 0'],1,inplace=True)

ValueError: labels ['Unnamed: 0'] not contained in axis

In [None]:
# Delete repeated data
trainingSet.drop(['date','year_y','month_y','weekday_y'],1,inplace=True)
trainingSet = trainingSet.rename(columns={'year_x':'year','month_x':'month','weekday_x': 'weekday'})

testingSet.drop(['date','year_y','month_y','weekday_y'],1,inplace=True)
testingSet = testingSet.rename(columns={'year_x':'year','month_x':'month','weekday_x': 'weekday'})

In [None]:
# Delete:
#     id: el id que identifica univocamente cada uno de los viajes 
#         no proporciona informacion con la que el algoritmo pueda aprender
#     start_station_id y end_station_id: las estaciones ya estan discretizadas por nombre
trainingSet.drop(['id', 'start_station_id','end_station_id'],1,inplace=True)

testingSet.drop(['id', 'start_station_id','end_station_id'],1,inplace=True)

In [None]:
# Delete: 
#     bike_id: la duracion del viaje es independiente de la bicicleta, 
#         ya que son todas iguales ("") y se entregan sin juicio alguno ("")
trainingSet.drop(['bike_id'],1,inplace=True)

testingSet.drop(['bike_id'],1,inplace=True)

In [None]:
# Delete: 
#     los dias como numero no aportan nada. E.g. 1 puede ser cualquier dia de la semana. 
#     Nada en la vida se decide en base a numero de dia (salvo los ñoquis del 29 y martes/viernes 13). 
trainingSet.drop(['start_day_1','start_day_2', 'start_day_3','start_day_4', 'start_day_5','start_day_6','start_day_7', 
                  'start_day_8','start_day_9', 'start_day_10','start_day_11', 'start_day_12','start_day_13', 
                  'start_day_14','start_day_15','start_day_16', 'start_day_17','start_day_18','start_day_19',
                  'start_day_20','start_day_21','start_day_22', 'start_day_23','start_day_24','start_day_25', 
                  'start_day_26','start_day_27','start_day_28','start_day_29','start_day_30',
                  'start_day_31'],1,inplace=True)
trainingSet.drop(['end_day_1','end_day_2','end_day_3','end_day_4','end_day_5','end_day_6','end_day_7','end_day_8',
                  'end_day_9','end_day_10', 'end_day_11','end_day_12', 'end_day_13','end_day_14','end_day_15',
                  'end_day_16','end_day_17','end_day_18','end_day_19','end_day_20','end_day_21','end_day_22',
                  'end_day_23','end_day_24','end_day_25','end_day_26','end_day_27','end_day_28','end_day_29',
                  'end_day_30','end_day_31',],1,inplace=True)

testingSet.drop(['start_day_1','start_day_2', 'start_day_3','start_day_4', 'start_day_5','start_day_6','start_day_7', 
                  'start_day_8','start_day_9', 'start_day_10','start_day_11', 'start_day_12','start_day_13', 
                  'start_day_14','start_day_15','start_day_16', 'start_day_17','start_day_18','start_day_19',
                  'start_day_20','start_day_21','start_day_22', 'start_day_23','start_day_24','start_day_25', 
                  'start_day_26','start_day_27','start_day_28','start_day_29','start_day_30',
                  'start_day_31'],1,inplace=True)
testingSet.drop(['end_day_1','end_day_2','end_day_3','end_day_4','end_day_5','end_day_6','end_day_7','end_day_8',
                  'end_day_9','end_day_10', 'end_day_11','end_day_12', 'end_day_13','end_day_14','end_day_15',
                  'end_day_16','end_day_17','end_day_18','end_day_19','end_day_20','end_day_21','end_day_22',
                  'end_day_23','end_day_24','end_day_25','end_day_26','end_day_27','end_day_28','end_day_29',
                  'end_day_30','end_day_31',],1,inplace=True)

In [None]:
# Delete:
#     La duracion del viaje no puede depender de algo del final del mismo. 
#     Asi como el hambre que tengo no depende de lo que haya comido cuado ya comi, 
#     la duracion del viaje no puede depender del momento final del mismo. 
#     De la misma manera, razonando analogamente, podemos concluir que contrario a esto, 
#     si influye el instante inicial del mismo
#     Retiro lo dicho para la estacion final, quedando valido el razonamiento unicamente 
#     para cuestiones temporales. Aun asi esto esta abierto a discusion.
trainingSet.drop(['end_week', 'end_minute','end_dayOfWeek_id0','end_dayOfWeek_id1','end_dayOfWeek_id2',
                  'end_dayOfWeek_id3','end_dayOfWeek_id4','end_dayOfWeek_id5','end_dayOfWeek_id6', 'end_year_2013',
                  'end_year_2014','end_year_2015','end_month_1','end_month_2','end_month_3','end_month_4',
                  'end_month_5','end_month_6','end_month_7','end_month_8','end_month_9','end_month_10',
                  'end_month_11','end_month_12','end_quarter_1','end_quarter_2','end_quarter_3','end_quarter_4',  'end_hour0',
                  'end_hour1', 'end_hour2','end_hour3','end_hour4','end_hour5','end_hour6','end_hour7','end_hour8','end_hour9',
                  'end_hour10','end_hour11','end_hour12','end_hour13','end_hour14','end_hour15','end_hour16',
                  'end_hour17','end_hour18','end_hour19','end_hour20','end_hour21','end_hour22','end_hour23' ],
                 1,inplace=True)

testingSet.drop(['end_week', 'end_minute','end_dayOfWeek_id0','end_dayOfWeek_id1','end_dayOfWeek_id2',
                  'end_dayOfWeek_id3','end_dayOfWeek_id4','end_dayOfWeek_id5','end_dayOfWeek_id6', 'end_year_2013',
                  'end_year_2014','end_year_2015','end_month_1','end_month_2','end_month_3','end_month_4',
                  'end_month_5','end_month_6','end_month_7','end_month_8','end_month_9','end_month_10',
                  'end_month_11','end_month_12','end_quarter_1','end_quarter_2','end_quarter_3','end_quarter_4',  'end_hour0',
                  'end_hour1', 'end_hour2','end_hour3','end_hour4','end_hour5','end_hour6','end_hour7','end_hour8','end_hour9',
                  'end_hour10','end_hour11','end_hour12','end_hour13','end_hour14','end_hour15','end_hour16',
                  'end_hour17','end_hour18','end_hour19','end_hour20','end_hour21','end_hour22','end_hour23' ],
                 1,inplace=True)

In [None]:
list(trainingSet.columns.values)

Dejo como incognita 'viajes'. No se realmente que efecto puede tener. Algo me dice que es cualquier cosa, pero no lo sabria justificar.

In [None]:
print sorted(trainingSet.business_day.unique())

In [None]:
trainingSet.Normal

In [None]:
trainingSet.shape

In [None]:
testingSet.shape

## Normalizacion

In [None]:
print "*****Normalizacion*****"

In [None]:
print "filtrando por duracion..."
trainingSet = trainingSet.loc[trainingSet.duration < 1000000,:]

In [None]:
trainingSet

In [None]:
# !!!!!!!! U L T I M A   C E L D A  !!!!!!!!
print "Saving to new csvs..."
print "Saving trainingSet to ../CSVs/finalTraining.csv..."
trainingSet.to_csv('../CSVs/finalTraining.csv')
print "Saving testingSet to ../CSVs/finalTesting.csv..."
testingSet.to_csv('../CSVs/finalTesting.csv')