# TP2: Machine Learning

### Imports

In [14]:
import pandas as pd
from datetime import datetime
import scipy.spatial
from sklearn import preprocessing

### Data loading

In [15]:
print "*****Data loading*****"
# Station metadata (the distance step further down reads its id, lat and long columns)
print "loading station csv"
stationDF = pd.read_csv('../CSVs/station.csv')
# Trip records used for training (this file carries the `duration` column)
print "loading trip train csv"
trainingSet = pd.read_csv('../CSVs/trip_train.csv')
# Trip records used for testing
print "loading trip test csv"
testingSet = pd.read_csv('../CSVs/trip_test.csv')

# The "glorious" DataFrame produced in TP1; it is joined to the trips on its `date` column later on
# (0 = Monday, 1 = Tuesday...) -- presumably the encoding of its weekday column; TODO confirm
print "loading dfSF_Bay csv"
dfSF_Bay = pd.read_csv('../CSVs/dfSF_Bay.csv')

*****Data loading*****
loading station csv
loading trip train csv
loading trip test csv
loading dfSF_Bay csv


## Basic data analysis

In [16]:
print "stationDF.shape: ", stationDF.shape
print "trainingSet.shape: ", trainingSet.shape
print "testingSet.shape: ", testingSet.shape
print "dfSF_Bay.shape: ", dfSF_Bay.shape

stationDF.shape:  (70, 7)
trainingSet.shape:  (549961, 11)
testingSet.shape:  (119998, 10)
dfSF_Bay.shape:  (733, 33)


### Distancias entre estaciones

In [17]:
print "*****Working on station distances*****"
# Create new temporary dataframe with distances
distancesDF = pd.DataFrame(columns=["start_station_id", "end_station_id", "distance"])

# Calculate distances between stations
for station, lat, lon in zip(stationDF.id, stationDF.lat, stationDF.long):
    for station2, lat2, lon2 in zip(stationDF.id, stationDF.lat, stationDF.long):
        distancesDF = distancesDF.append({
            "start_station_id": station,
            "end_station_id": station2,
            "distance": scipy.spatial.distance.cityblock([lat, lon], [lat2, lon2])
        }, ignore_index=True)

distancesDF['start_station_id'] = distancesDF.start_station_id.astype(int)
distancesDF['end_station_id'] = distancesDF.end_station_id.astype(int)

# Merge this new data to training and testing sets
trainingSet = pd.merge(trainingSet,distancesDF,on =['start_station_id','end_station_id'],how = 'inner')
testingSet = pd.merge(testingSet,distancesDF,on =['start_station_id','end_station_id'],how = 'inner')

# delete auxiliary distances df
del distancesDF

*****Working on station distances*****


### Process date & time data

In [18]:
print "*****Converting necessary data to dateTime*****"
# Convert necessary data to dateTime
dfSF_Bay['date'] = pd.to_datetime(dfSF_Bay.date)

trainingSet['start_date'] = pd.to_datetime(trainingSet.start_date)
trainingSet['end_date'] = pd.to_datetime(trainingSet.end_date)

testingSet['start_date'] = pd.to_datetime(testingSet.start_date)
testingSet['end_date'] = pd.to_datetime(testingSet.end_date)

# Create new features related to date & time based on the unique 'date' feature
# Work with training set
trainingSet['start_dayOfWeek'] = trainingSet.start_date.dt.dayofweek
trainingSet['start_week'] = trainingSet.start_date.dt.week
trainingSet['start_quarter'] = trainingSet.start_date.dt.quarter
trainingSet['start_time'] = trainingSet.start_date.dt.time
trainingSet['start_hour'] = trainingSet.start_date.dt.hour
trainingSet['start_minute'] = trainingSet.start_date.dt.minute
trainingSet['start_year'] = trainingSet.start_date.dt.year
trainingSet['start_month'] = trainingSet.start_date.dt.month
trainingSet['start_day'] = trainingSet.start_date.dt.day
trainingSet['start_date'] = trainingSet.start_date.dt.date

trainingSet['end_dayOfWeek'] = trainingSet.end_date.dt.dayofweek
trainingSet['end_week'] = trainingSet.end_date.dt.week
trainingSet['end_quarter'] = trainingSet.end_date.dt.quarter
trainingSet['end_time'] = trainingSet.end_date.dt.time
trainingSet['end_hour'] = trainingSet.end_date.dt.hour
trainingSet['end_minute'] = trainingSet.end_date.dt.minute
trainingSet['end_year'] = trainingSet.end_date.dt.year
trainingSet['end_month'] = trainingSet.end_date.dt.month
trainingSet['end_day'] = trainingSet.end_date.dt.day
trainingSet['end_date'] = trainingSet.end_date.dt.date

trainingSet['year'] = pd.to_datetime(trainingSet['start_date']).dt.year
trainingSet['month'] = pd.to_datetime(trainingSet['start_date']).dt.month
trainingSet['weekday'] = pd.to_datetime(trainingSet['start_date']).dt.weekday

# Work with testing set
testingSet['start_dayOfWeek'] = testingSet.start_date.dt.dayofweek
testingSet['start_week'] = testingSet.start_date.dt.week
testingSet['start_quarter'] = testingSet.start_date.dt.quarter
testingSet['start_time'] = testingSet.start_date.dt.time
testingSet['start_hour'] = testingSet.start_date.dt.hour
testingSet['start_minute'] = testingSet.start_date.dt.minute
testingSet['start_year'] = testingSet.start_date.dt.year
testingSet['start_month'] = testingSet.start_date.dt.month
testingSet['start_day'] = testingSet.start_date.dt.day
testingSet['start_date'] = testingSet.start_date.dt.date

testingSet['end_dayOfWeek'] = testingSet.end_date.dt.dayofweek
testingSet['end_week'] = testingSet.end_date.dt.week
testingSet['end_quarter'] =testingSet.end_date.dt.quarter
testingSet['end_time'] = testingSet.end_date.dt.time
testingSet['end_hour'] = testingSet.end_date.dt.hour
testingSet['end_minute'] = testingSet.end_date.dt.minute
testingSet['end_year'] = testingSet.end_date.dt.year
testingSet['end_month'] = testingSet.end_date.dt.month
testingSet['end_day'] = testingSet.end_date.dt.day
testingSet['end_date'] = testingSet.end_date.dt.date

testingSet['year'] = pd.to_datetime(testingSet['start_date']).dt.year
testingSet['month'] = pd.to_datetime(testingSet['start_date']).dt.month
testingSet['weekday'] = pd.to_datetime(testingSet['start_date']).dt.weekday

*****Converting necessary data to dateTime*****


In [19]:
print "trainingSet cols values", list(trainingSet.columns.values)

trainingSet cols values ['id', 'duration', 'start_date', 'start_station_name', 'start_station_id', 'end_date', 'end_station_name', 'end_station_id', 'bike_id', 'subscription_type', 'zip_code', 'distance', 'start_dayOfWeek', 'start_week', 'start_quarter', 'start_time', 'start_hour', 'start_minute', 'start_year', 'start_month', 'start_day', 'end_dayOfWeek', 'end_week', 'end_quarter', 'end_time', 'end_hour', 'end_minute', 'end_year', 'end_month', 'end_day', 'year', 'month', 'weekday']


In [20]:
print "testingSet cols values", list(testingSet.columns.values)

testingSet cols values ['id', 'start_date', 'start_station_name', 'start_station_id', 'end_date', 'end_station_name', 'end_station_id', 'bike_id', 'subscription_type', 'zip_code', 'distance', 'start_dayOfWeek', 'start_week', 'start_quarter', 'start_time', 'start_hour', 'start_minute', 'start_year', 'start_month', 'start_day', 'end_dayOfWeek', 'end_week', 'end_quarter', 'end_time', 'end_hour', 'end_minute', 'end_year', 'end_month', 'end_day', 'year', 'month', 'weekday']


## Feature histórico

In [21]:
print "*****Working on historic feature*****"
print "***Calculating historic feature***"
import math
listaStart = []
listaEnd = []
for i in list(trainingSet.start_station_id.values):
    if i not in listaStart:
        listaStart.append(i)
for i in list(trainingSet.end_station_id.values):
    if i not in listaEnd:
        listaEnd.append(i)
listaHistorico = []
for i in listaStart:
    for j in listaEnd:
        df = trainingSet[(trainingSet['start_station_id'] == i) & (trainingSet['end_station_id'] == j)]
        historico = df.duration.mean()
        if (not(math.isnan(historico))):
            listaHistorico.append([i,j,historico])
        
listaHistorico

*****Working on historic feature*****
***Calculating historic feature***


[[50, 60, 1254.993877147936],
 [50, 46, 2679.90099009901],
 [50, 62, 2177.8607350096713],
 [50, 55, 502.29606625258799],
 [50, 39, 1938.610909090909],
 [50, 70, 903.78094462540719],
 [50, 61, 662.55546241967033],
 [50, 75, 955.34452296819791],
 [50, 67, 2432.1732026143791],
 [50, 72, 7406.0220588235297],
 [50, 51, 1104.3922651933701],
 [50, 76, 1278.5260663507108],
 [50, 50, 6343.4900849858359],
 [50, 66, 3062.5027027027027],
 [50, 54, 788.76531671858777],
 [50, 64, 842.403162055336],
 [50, 65, 1070.6219047619047],
 [50, 45, 846.55012224938878],
 [50, 77, 834.04506699147385],
 [50, 82, 1168.3443396226414],
 [50, 74, 4673.119565217391],
 [50, 68, 968.3078101071975],
 [50, 71, 2404.4898477157362],
 [50, 56, 2053.1680000000001],
 [50, 49, 453.73065015479875],
 [50, 57, 925.07014028056108],
 [50, 63, 661.69466882067854],
 [50, 48, 1801.127619047619],
 [50, 69, 1076.7346760070052],
 [50, 47, 1400.2985074626865],
 [50, 42, 1246.1496062992126],
 [50, 41, 1677.0087336244542],
 [50, 73, 2730.59

In [22]:
# Unzip the [start, end, mean duration] triples into one list per column.
starStationId = [terna[0] for terna in listaHistorico]
endStationId = [terna[1] for terna in listaHistorico]
historical = [terna[2] for terna in listaHistorico]

columnasHistorico = ['start_station_id', 'end_station_id', 'historical']
data = dict(zip(columnasHistorico, (starStationId, endStationId, historical)))

# Lookup table: one row per station pair with its historical mean duration
dfData = pd.DataFrame(data, columns=columnasHistorico)
dfData

Unnamed: 0,start_station_id,end_station_id,historical
0,50,60,1254.993877
1,50,46,2679.900990
2,50,62,2177.860735
3,50,55,502.296066
4,50,39,1938.610909
5,50,70,903.780945
6,50,61,662.555462
7,50,75,955.344523
8,50,67,2432.173203
9,50,72,7406.022059


In [23]:
print "**Merging historic feature**"
# Merge this new data to training and testing dfs
# Training
trainingSet = pd.merge(trainingSet,dfData,on=['start_station_id', 'end_station_id'],how = 'inner') 

trainingSet['historical'] = trainingSet.historical.astype(int)

# Testing
testingSet = pd.merge(testingSet, dfData, on=['start_station_id', 'end_station_id'], how='inner')

testingSet['historical'] = testingSet.historical.astype(int)

# delete auxiliar dataframe
del dfData

**Merging historic feature**


In [24]:
print "trainingSet.shape: ", trainingSet.shape
print "testingSet.shape: ", testingSet.shape

trainingSet.shape:  (549961, 34)
testingSet.shape:  (119990, 33)


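The difference in the number of columns (34 vs. 33) is due to the `duration` feature, which exists only in the training set and was used to calculate the `historical` feature. In the recorded output the test set also has 8 fewer rows than when it was loaded (119,998 → 119,990); rows are lost whenever an inner merge finds no match.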

### Trabajamos con dfSF_Bay

In [25]:
# Convert necessary data to dateTime
dfSF_Bay['date'] = pd.to_datetime(dfSF_Bay.date)

trainingSet['start_date'] = pd.to_datetime(trainingSet.start_date)
trainingSet['end_date'] = pd.to_datetime(trainingSet.end_date)

testingSet['start_date'] = pd.to_datetime(testingSet.start_date)
testingSet['end_date'] = pd.to_datetime(testingSet.end_date)

In [26]:
print "***Merging dfSF_Bay data***"
# Merge trainingSet with new data

testingSet = pd.merge(testingSet,dfSF_Bay,left_on ='start_date',right_on='date',how = 'inner')
trainingSet = pd.merge(trainingSet,dfSF_Bay,left_on ='start_date',right_on='date',how = 'inner')

***Merging dfSF_Bay data***


In [27]:
# Preview only the first rows instead of rendering the whole frame
testingSet.head(10)

Unnamed: 0,id,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,zip_code,...,Fog-Rain,Normal,Rain,Rain-Thunderstorm,date,business_day,holiday,year_y,month_y,weekday_y
0,504737,2014-10-18,Embarcadero at Sansome,60,2014-10-18,Powell at Post (Union Square),71,426,Customer,77009,...,0,1,0,0,2014-10-18,0,0,2014,10,5
1,505019,2014-10-18,San Francisco Caltrain 2 (330 Townsend),69,2014-10-18,Embarcadero at Folsom,51,376,Subscriber,94705,...,0,1,0,0,2014-10-18,0,0,2014,10,5
2,504972,2014-10-18,Embarcadero at Vallejo,48,2014-10-18,Steuart at Market,74,559,Customer,20740,...,0,1,0,0,2014-10-18,0,0,2014,10,5
3,504943,2014-10-18,Powell Street BART,39,2014-10-18,Market at 10th,67,553,Customer,80211,...,0,1,0,0,2014-10-18,0,0,2014,10,5
4,504779,2014-10-18,Powell at Post (Union Square),71,2014-10-18,San Francisco Caltrain 2 (330 Townsend),69,623,Subscriber,94107,...,0,1,0,0,2014-10-18,0,0,2014,10,5
5,504906,2014-10-18,Embarcadero at Sansome,60,2014-10-18,Embarcadero at Sansome,60,477,Customer,92101,...,0,1,0,0,2014-10-18,0,0,2014,10,5
6,505015,2014-10-18,Embarcadero at Sansome,60,2014-10-18,Embarcadero at Sansome,60,352,Customer,90024,...,0,1,0,0,2014-10-18,0,0,2014,10,5
7,504853,2014-10-18,Harry Bridges Plaza (Ferry Building),50,2014-10-18,Embarcadero at Sansome,60,586,Subscriber,94606,...,0,1,0,0,2014-10-18,0,0,2014,10,5
8,504990,2014-10-18,Harry Bridges Plaza (Ferry Building),50,2014-10-18,Embarcadero at Sansome,60,442,Customer,94022,...,0,1,0,0,2014-10-18,0,0,2014,10,5
9,504800,2014-10-18,Harry Bridges Plaza (Ferry Building),50,2014-10-18,Embarcadero at Sansome,60,362,Customer,94553,...,0,1,0,0,2014-10-18,0,0,2014,10,5


In [28]:
# Preview only the first rows instead of rendering the whole frame
trainingSet.head(10)

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,...,Fog-Rain,Normal,Rain,Rain-Thunderstorm,date,business_day,holiday,year_y,month_y,weekday_y
0,907649,396,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,187,Subscriber,...,0,1,0,0,2015-08-27,1,0,2015,8,3
1,907702,287,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,334,Subscriber,...,0,1,0,0,2015-08-27,1,0,2015,8,3
2,908937,595,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,503,Customer,...,0,1,0,0,2015-08-27,1,0,2015,8,3
3,908953,439,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,538,Subscriber,...,0,1,0,0,2015-08-27,1,0,2015,8,3
4,909076,872,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,406,Customer,...,0,1,0,0,2015-08-27,1,0,2015,8,3
5,909006,3328,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,400,Customer,...,0,1,0,0,2015-08-27,1,0,2015,8,3
6,908192,767,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,187,Subscriber,...,0,1,0,0,2015-08-27,1,0,2015,8,3
7,908938,590,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,557,Customer,...,0,1,0,0,2015-08-27,1,0,2015,8,3
8,909005,3360,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,558,Customer,...,0,1,0,0,2015-08-27,1,0,2015,8,3
9,907428,427,2015-08-27,Harry Bridges Plaza (Ferry Building),50,2015-08-27,Embarcadero at Sansome,60,331,Subscriber,...,0,1,0,0,2015-08-27,1,0,2015,8,3


# Discretización y Normalización

## Discretización

In [31]:
# Column dtypes of the training set
trainingSet.dtypes

id                                     int64
duration                               int64
start_date                    datetime64[ns]
start_station_name                    object
start_station_id                       int64
end_date                      datetime64[ns]
end_station_name                      object
end_station_id                         int64
bike_id                                int64
subscription_type                     object
zip_code                              object
distance                             float64
start_dayOfWeek                        int64
start_week                             int64
start_quarter                          int64
start_time                            object
start_hour                             int64
start_minute                           int64
start_year                             int64
start_month                            int64
start_day                              int64
end_dayOfWeek                          int64
end_week  

In [32]:
# Column names of the testing set after all merges
testingSet.columns.tolist()

['id',
 'start_date',
 'start_station_name',
 'start_station_id',
 'end_date',
 'end_station_name',
 'end_station_id',
 'bike_id',
 'subscription_type',
 'zip_code',
 'distance',
 'start_dayOfWeek',
 'start_week',
 'start_quarter',
 'start_time',
 'start_hour',
 'start_minute',
 'start_year',
 'start_month',
 'start_day',
 'end_dayOfWeek',
 'end_week',
 'end_quarter',
 'end_time',
 'end_hour',
 'end_minute',
 'end_year',
 'end_month',
 'end_day',
 'year_x',
 'month_x',
 'weekday_x',
 'historical',
 'viajes',
 'max_temperature_c',
 'mean_temperature_c',
 'min_temperature_c',
 'max_dew_point_c',
 'mean_dew_point_c',
 'min_dew_point_c',
 'max_humidity',
 'mean_humidity',
 'min_humidity',
 'max_sea_level_pressure_cm',
 'mean_sea_level_pressure_cm',
 'min_sea_level_pressure_cm',
 'max_visibility_km',
 'mean_visibility_km',
 'min_visibility_km',
 'max_wind_Speed_kmh',
 'mean_wind_speed_kmh',
 'max_gust_speed_kmh',
 'precipitation_cm',
 'cloud_cover',
 'wind_dir_degrees',
 'Fog',
 'Fog-Rain',

In [33]:
# Progress banners marking the start of the discretization / normalization stage
print "*****Discretizacion y Normalizacion*****"
print "*****Discretizacion*****"

*****Discretizacion y Normalizacion*****
*****Discretizacion*****


In [34]:
def crearLista(listadoCompleto):
    """Return the distinct values of ``listadoCompleto`` as a new sorted list.

    listadoCompleto -- any iterable of mutually comparable values.
    Returns a list with one entry per distinct value, in ascending order.
    The input is not modified.
    """
    valores = list(listadoCompleto)
    try:
        # Hash-based dedupe: linear in the input size, instead of scanning the
        # result list once per element.
        valoresUnicos = set(valores)
    except TypeError:
        # Unhashable elements (e.g. lists): fall back to equality-based dedupe
        valoresUnicos = []
        for valor in valores:
            if valor not in valoresUnicos:
                valoresUnicos.append(valor)
    return sorted(valoresUnicos)

In [35]:
def discretizar(columna,nombre, df):
    """One-hot encode ``columna`` into 0/1 indicator columns added to ``df``.

    For every distinct value ``v`` of ``columna`` (in sorted order) a column
    named ``nombre + str(v)`` is written to ``df``; it holds 1 on the rows
    where ``columna`` equals ``v`` and 0 elsewhere.

    columna -- iterable of values with as many items as ``df`` has rows.
    nombre  -- string prefix for the generated column names.
    df      -- DataFrame that receives the indicator columns (modified in place).
    Returns None.

    The set of generated columns depends only on the values present in
    ``columna``, so two frames encoded separately end up with the same
    columns only if they contain the same categories.
    """
    # Re-wrap positionally so the comparison is vectorized and the assignment
    # below is by position (like assigning a plain list), not by index label.
    valores = pd.Series(list(columna))
    for valor in crearLista(columna):
        df[nombre+str(valor)] = (valores == valor).values.astype(int)

In [36]:
print "Discretizando start_station_name..."
discretizar(trainingSet.start_station_name,'start ', trainingSet)
discretizar(testingSet.start_station_name,'start ', testingSet)

print "Discretizando end_station_name..."
discretizar(trainingSet.end_station_name,'end ', trainingSet)
discretizar(testingSet.end_station_name,'end ', testingSet)

print "Discretizando start_dayOfWeek..."
discretizar(trainingSet.start_dayOfWeek,'start_dayOfWeek_id', trainingSet)
discretizar(testingSet.start_dayOfWeek,'start_dayOfWeek_id', testingSet)

print "Discretizando end_dayOfWeek..."
discretizar(trainingSet.end_dayOfWeek,'end_dayOfWeek_id', trainingSet)
discretizar(testingSet.end_dayOfWeek,'end_dayOfWeek_id', testingSet)

print "Discretizando subscription_type_..."
discretizar(trainingSet.subscription_type,'subscription_type_', trainingSet)
discretizar(testingSet.subscription_type,'subscription_type_', testingSet)

print "Discretizando start_year..."
discretizar(trainingSet.start_year,'start_year_', trainingSet)
discretizar(testingSet.start_year,'start_year_', testingSet)

print "Discretizando end_year_..."
discretizar(trainingSet.end_year,'end_year_', trainingSet)
discretizar(testingSet.end_year,'end_year_', testingSet)

print "Discretizando start_month..."
discretizar(trainingSet.start_month,'start_month_', trainingSet)
discretizar(testingSet.start_month,'start_month_', testingSet)

print "Discretizando end_month..."
discretizar(trainingSet.end_month,'end_month_', trainingSet)
discretizar(testingSet.end_month,'end_month_', testingSet)

print "Discretizando start_day..."
discretizar(trainingSet.start_day,'start_day_', trainingSet)
discretizar(testingSet.start_day,'start_day_', testingSet)

print "Discretizando end_day..."
discretizar(trainingSet.end_day,'end_day_', trainingSet)
discretizar(testingSet.end_day,'end_day_', testingSet)

print "Discretizando start_quarter..."
discretizar(trainingSet.start_quarter,'start_quarter_', trainingSet)
discretizar(testingSet.start_quarter,'start_quarter_', testingSet)

print "Discretizando end_quarter..."
discretizar(trainingSet.end_quarter,'end_quarter_', trainingSet)
discretizar(testingSet.end_quarter,'end_quarter_', testingSet)

print "Discretizando start_hour..."
discretizar(trainingSet.start_hour,'start_hour_', trainingSet)
discretizar(testingSet.start_hour,'start_hour_', testingSet)

print "Discretizando end_hour..."
discretizar(trainingSet.end_hour,'end_hour', trainingSet)
discretizar(testingSet.end_hour,'end_hour', testingSet)

Discretizando start_station_name...
Discretizando end_station_name...
Discretizando start_dayOfWeek...
Discretizando end_dayOfWeek...
Discretizando subscription_type_...
Discretizando start_year...
Discretizando end_year_...
Discretizando start_month...
Discretizando end_month...
Discretizando start_day...
Discretizando end_day...
Discretizando start_quarter...
Discretizando end_quarter...
Discretizando start_hour...
Discretizando end_hour...


In [37]:
print "Dropping trash columns..."
trainingSet = trainingSet.drop(labels = ['start_date', 
                                         'end_station_name',
                                         'start_station_name',
                                         'end_date',
                                         'subscription_type',
                                         'zip_code',
                                         'start_time',
                                         'end_time',
                                         'start_dayOfWeek',
                                         'end_dayOfWeek',
                                         'start_year',
                                         'end_year',
                                         'start_month',
                                         'end_month',
                                         'start_day',
                                         'end_day',
                                         'start_quarter',
                                         'end_quarter',
                                         'start_hour',
                                         'end_hour'
                                        ],axis = 1)

testingSet = testingSet.drop(labels = ['start_date', 
                                         'end_station_name',
                                         'start_station_name',
                                         'end_date',
                                         'subscription_type',
                                         'zip_code',
                                         'start_time',
                                         'end_time',
                                         'start_dayOfWeek',
                                         'end_dayOfWeek',
                                         'start_year',
                                         'end_year',
                                         'start_month',
                                         'end_month',
                                         'start_day',
                                         'end_day',
                                         'start_quarter',
                                         'end_quarter',
                                         'start_hour',
                                         'end_hour'
                                        ],axis = 1)

Dropping trash columns...


In [38]:
print "trainingSet.shape: ", trainingSet.shape
print "testingSet.shape: ", testingSet.shape

trainingSet.shape:  (549961, 359)
testingSet.shape:  (119990, 358)


## Normalización

In [39]:
# Write both feature sets to disk (presumably a checkpoint so the steps above need not be re-run).
# NOTE(review): to_csv writes the row index as an unnamed first column by default;
# a plain read_csv of these files will load it back as an extra 'Unnamed: 0' column
# unless index=False is passed here or index_col=0 is passed when reading.
trainingSet.to_csv('../CSVs/tempTraining.csv')
testingSet.to_csv('../CSVs/tempTesting.csv')

In [None]:
# Reload both feature sets from the temporary CSV files.
# NOTE(review): dtypes are re-inferred by read_csv, so datetime columns come back as strings.
trainingSet = pd.read_csv('../CSVs/tempTraining.csv')
testingSet = pd.read_csv('../CSVs/tempTesting.csv')

In [None]:
# Columns of the training set rescaled by this step
columnasANormalizar = [
    'duration',
    'max_temperature_c',
    'min_temperature_c',
    'max_humidity',
    'max_sea_level_pressure_cm',
    'precipitation_cm',
]

# preprocessing.normalize works on 2-D input and scales each row to unit L2
# norm. Each column is therefore passed explicitly as a single row (1, n):
# handing over the 1-D Series relied on an implicit reshape that scikit-learn
# deprecated in 0.17 and turned into a ValueError in 0.19. The result is the
# whole column divided by its L2 norm; row 0 of the output is that column.
for columna in columnasANormalizar:
    fila = trainingSet[columna].values.reshape(1, -1)
    trainingSet[columna] = preprocessing.normalize(fila)[0]