In [0]:
'''Import the required libraries'''
import pandas as pd
import os
import numpy as np
import seaborn as sns
from google.cloud import bigquery
import datetime

In [0]:
'''Authenticate Colab'''
from google.colab import auth
# Run the Colab authentication flow before the BigQuery client is created.
auth.authenticate_user()
# Reached only if authenticate_user() returned without raising.
print('Authenticated')

# Data Fetching from Big Query




In [0]:
'''Query to fetch sample of data 10% from 100 million records due to RAM constraints: Used Stratified Sampling across pick_community_area such that the sample has same propotion of pickup area as of population'''

# Project id handed to the BigQuery client below.
project_id = 'tidy-ivy-242507'
client = bigquery.Client(project_id)

# The inner query counts trips per (pickup_community_area, month, year) and adds the
# grand total over all groups; every trip is joined to its group and kept when
# RAND() < 10000000/total, i.e. each row has the same keep probability.
# NOTE(review): SELECT * over the join also returns the helper columns
# (pickup_community_area_new, month_, year_, c, total).
query_subsample = """SELECT *
FROM (
  SELECT *  
  FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips` a 
   JOIN (
  SELECT *, SUM(c) OVER() total
  FROM (
    SELECT pickup_community_area as pickup_community_area_new , EXTRACT(MONTH FROM trip_start_timestamp) as month_, EXTRACT(YEAR FROM trip_start_timestamp) as year_, COUNT(*) c 
    FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips` 
    GROUP BY 1,2,3)) b on a.pickup_community_area = b.pickup_community_area_new and EXTRACT(MONTH FROM a.trip_start_timestamp) = b.month_ and EXTRACT(YEAR FROM a.trip_start_timestamp) = b.year_
  WHERE RAND()< 10000000/total
) """


# Runs the query and materialises the whole result as a DataFrame.
# NOTE(review): a later cell reassigns data_sample from a CSV on Drive, so this
# result is discarded when both cells are executed.
data_sample = client.query(query_subsample).to_dataframe()

In [0]:
'''To get Access to Drive for reading and writing files if needed'''
from google.colab import drive
# Mount Google Drive at /content/gdrive; later cells read and copy files under this path.
drive.mount('/content/gdrive')

In [0]:
'''Import from Drive the sample that is saved before'''
# Rebinds data_sample to the sample stored on Drive (requires the Drive mount above);
# whatever data_sample held before this line is dropped.
data_sample = pd.read_csv('/content/gdrive/My Drive/complete_sample_10million.csv')


# Data Exploration and Data Cleaning

In [0]:
'''1) Tried to impute missing values of trip_miles with the help of trip_total and vice-versa but there is not much co-relation so skipped
  2) Tried to impute drop_offlocation - > trip_miles + Trip_total + pickup_location using predictive approach but performance is decreasing so not used for final model ''' 
'''
data_sample_dummy = data_sample[(~data_sample.trip_miles.isnull()) & (data_sample.trip_total.isnull())]
print(data_sample_dummy.info())
impute_trip_total = data_sample.groupby(['dropoff_location','pickup_location'],as_index=False)['trip_total'].mean()
impute_trip_total.columns = ['dropoff_location','pickup_location','trip_total_impute']
print(impute_trip_total.head())
print(impute_trip_total.info())
impute_trip_miles = data_sample.groupby(['dropoff_location','pickup_location'],as_index=False)['trip_miles'].mean()
impute_trip_miles.columns = ['dropoff_location','pickup_location','trip_miles_impute']
print(impute_trip_miles.head())
print(impute_trip_miles.info())
data_sample_new = pd.merge(data_sample,impute_trip_miles,on=['dropoff_location','pickup_location'])
data_sample_new['trip_miles'] = np.where(data_sample_new['trip_miles']==None,data_sample_new['trip_miles_impute'],data_sample_new['trip_miles'])
print(data_sample_new.isnull().sum())
#impute_trip_total
data_sample_new = pd.merge(data_sample,impute_trip_total,on=['dropoff_location','pickup_location'])
data_sample_new['trip_total_2'] = np.where(data_sample_new['trip_total']==None,data_sample_new['trip_total_impute'],data_sample_new['trip_total'])
print(data_sample_new.isnull().sum()
#now handling the missing values of drop_off_location
#drop_offlocation - > trip_miles + Trip_total + pickup_location
print(len(data_sample.dropoff_location.unique()))
'''

#### Removed nulls for taxi id and trip seconds 
#### Removed outliers wrt trip seconds 
#### Data set consisting of only 2013-2016 data points

In [0]:
#print(len(data_sample['taxi_id'].unique()))
#print(data_sample['taxi_id'].isnull().any())
#print(data_sample[(data_sample['taxi_id'].notnull())].info())
#As only 635 nulls are there in data with respect to taxi_id , we Remove those rows
def pre_process_data(data_sample):
  """Filter the raw trip sample down to the rows used for training.

  Steps, in order:
    1. drop rows with a null ``taxi_id``, ``trip_end_timestamp`` or
       ``trip_seconds`` (``trip_seconds`` is the model target);
    2. drop trips whose ``pickup_location`` equals ``dropoff_location``;
    3. draw a box plot of ``trip_seconds`` (side effect, for inspection only);
    4. keep trips strictly above the 5th and at or below the 99.7th percentile
       of ``trip_seconds``;
    5. parse ``trip_start_timestamp`` and keep trips that started before
       2017-01-01 00:00:00.

  Args:
    data_sample: DataFrame holding the columns named above. It is not modified.

  Returns:
    A new DataFrame with the surviving rows plus a ``Datetime_start`` column
    (the parsed ``trip_start_timestamp``).
  """
  trips = data_sample[data_sample['taxi_id'].notnull()]
  # The two prints show whether null end timestamps exist before / after the filter.
  print(trips['trip_end_timestamp'].isnull().any())
  trips = trips[trips['trip_end_timestamp'].notnull()]
  print(trips['trip_end_timestamp'].isnull().any())
  # The target value is needed for building the model, so rows without it are removed.
  trips = trips[trips['trip_seconds'].notnull()]
  trips = trips[trips['pickup_location'] != trips['dropoff_location']]
  # Keyword argument: newer seaborn releases treat the first positional argument as `data`.
  sns.boxplot(x=trips['trip_seconds'].values, showfliers=False)

  # Both cut-offs come from the same, not yet trimmed, distribution.
  lower, upper = np.percentile(trips['trip_seconds'], [5, 99.7])
  trips = trips[(trips['trip_seconds'] > lower) & (trips['trip_seconds'] <= upper)]

  started = pd.to_datetime(trips['trip_start_timestamp'], format='%Y-%m-%d %H:%M:%S')
  before_2017 = started < datetime.datetime(2017, 1, 1, 0, 0, 0)
  # assign() builds a fresh frame, so the new column is never written into a slice
  # of the caller's data (no SettingWithCopyWarning here or downstream).
  return trips[before_2017].assign(Datetime_start=started[before_2017])

data_sample= pre_process_data(data_sample)

#print(data_sample_train.size)



#### Preview to check  data type of different attributes

In [0]:
# Module-level buckets filled by get_type_of_variables(); re-running this cell resets them.
continous_features = []
high_cardinality_features = []
noisy_features = []
categorical_features = []


def get_type_of_variables(products, high_cardinality_min=10000, noisy_min=30000):
    """Sort the columns of ``products`` into the four module-level feature lists.

    The first column is skipped. For every other column:
      * dtype ``float64``                                  -> ``continous_features``
      * distinct values >= ``noisy_min``                   -> ``noisy_features``
      * ``high_cardinality_min`` < distinct < ``noisy_min`` -> ``high_cardinality_features``
      * anything else                                      -> ``categorical_features``

    The column name and its distinct-value count are printed for each column.

    Args:
        products: DataFrame to inspect.
        high_cardinality_min: exclusive lower bound of the high-cardinality band.
        noisy_min: inclusive lower bound of the noisy band.

    Returns:
        None; results are appended to the module-level lists.
    """
    for y in products.columns[1:]:
        # Counted once per column and always on the frame that was passed in.
        unique_count = len(products[y].unique())
        if products[y].dtype == np.float64:
            continous_features.append(y)
        elif unique_count >= noisy_min:
            noisy_features.append(y)
        elif unique_count > high_cardinality_min:
            high_cardinality_features.append(y)
        else:
            categorical_features.append(y)
        print(y)
        print(unique_count)
# Fills the four feature lists defined in this cell from the cleaned sample.
get_type_of_variables(data_sample)
# Observed result: only 'fare', 'tips', 'tolls', 'extras', 'trip_total', 'trip_seconds'
# and 'trip_miles' are continuous values.

#### Checking whether missing coordinates can be filled by mapping directly from census_tract


In [0]:
'''
#data_sample['check_condition'] = np.where(data_sample['dropoff_census_tract'] == None,np.where(data_sample['dropoff_community_area']==None,0,1),0)
#print(sum(list(data_sample['check_condition'])))
#data_sample_dummy = data_sample[~data_sample.dropoff_census_tract.isnull()]
#print(data_sample_dummy['dropoff_census_tract'].isnull().sum())
#del data_sample_dummy
data_sample_dummy = data_sample[(~data_sample.dropoff_census_tract.isnull()) & (data_sample.dropoff_longitude.isnull())]
#print(data_sample_dummy.isnull().sum())
del data_sample_dummy
#So even if we impute by direct mapping of dropoff_census_tract to dropoff_longitude only 4959 can be mapped out of 246166
#So not fruitful
'''

### Binning the less frequent values of the company column into one group

In [0]:
def company_share_table(companies, coverage=0.99):
  """Build the per-company demand table used to bin rare companies.

  Args:
    companies: Series of company names (``value_counts`` ignores nulls).
    coverage: a company keeps its own name while the cumulative share of trips
      stays below this value; every other company is labelled ``'Others'``.

  Returns:
    DataFrame ordered by descending share with columns ``company``, ``count``
    (fraction of trips), ``cum_sum`` and ``bin_company``.
  """
  table = companies.value_counts(normalize=True).sort_values(ascending=False).reset_index()
  table.columns = ['company', 'count']
  table['cum_sum'] = table['count'].cumsum()
  table['bin_company'] = np.where(table['cum_sum'] < coverage, table['company'], 'Others')
  return table


def binning_values(data_sample, company_distribution=None):
  """Replace ``company`` by a binned ``company_bin`` column and drop incomplete rows.

  Rows without a company first inherit the bin seen for the same ``taxi_id`` on
  another row, then fall back to ``'Other_2'``. Finally every row that still
  has a null in any column is removed.

  Args:
    data_sample: DataFrame with ``company`` and ``taxi_id`` columns. It is not
      modified.
    company_distribution: optional table from ``company_share_table``; built
      from ``data_sample['company']`` when omitted.

  Returns:
    A new DataFrame without ``company`` and with ``company_bin`` as last column.
  """
  if company_distribution is None:
    company_distribution = company_share_table(data_sample['company'])
  print(company_distribution.info())
  company_distribution['cum_sum'].plot(style='.-')
  company_distribution['count'].plot(style='.-')
  map_company_bins = dict(zip(company_distribution['company'], company_distribution['bin_company']))

  # drop() returns a new frame, so the column writes below never touch the caller's data.
  binned = data_sample.drop(columns=['company'])
  binned['company_bin'] = data_sample['company'].map(map_company_bins)

  # Relationship between taxi_id and company: when a taxi maps to several bins
  # the last row wins (dict construction order).
  known = binned[binned['company_bin'].notnull()]
  map_taxi_id_and_company = dict(zip(known['taxi_id'], known['company_bin']))
  del known

  # Assign the result instead of calling fillna(inplace=True) on a column selection,
  # which is not guaranteed to write back into the frame.
  binned['company_bin'] = (
      binned['company_bin']
      .fillna(binned['taxi_id'].map(map_taxi_id_and_company))
      .fillna('Other_2')
  )
  return binned.dropna()

# The table is bound at notebook level as well because a later cell exports it to CSV.
company_distribution = company_share_table(data_sample['company'])
data_sample = binning_values(data_sample, company_distribution)

In [0]:
# Needs a notebook-level `company_distribution` table; binning_values() keeps its own
# table local, so the cell that calls it must also bind the table globally.
company_distribution.to_csv('company_distribution.csv',index=False)

!cp company_distribution.csv /content/gdrive/My\ Drive/

In [0]:
#Imputing Null values
#1) trip_miles can be predicted using pickup_location and dropoff_location 
#Here try for relation among null variables and non null values
#for now removing all null values and imputing some nulls using 'Other_2' category
#print(data_sample.info())

#print(data_sample.info())
#print(data_sample.isnull().sum())

'''Null values deleted now for all the columns'''

### Clustering latitude , longitude Co-ordinates  of Chicago City where pickup and drop off happened

In [0]:
#predict_clusters for Dropoff_location and Pick_up_location
from sklearn.cluster import MiniBatchKMeans
import pickle
# 15 clusters fitted on pickup and drop-off coordinates together, so both ends of a
# trip share one label space. A fixed random_state keeps the cluster ids stable
# between runs, which matters because the fitted model is pickled for later prediction.
cluster = MiniBatchKMeans(n_clusters=15, max_iter=1000, batch_size=100000, verbose=0, compute_labels=True,
                          random_state=42, tol=0.0, max_no_improvement=10, n_init=3, reassignment_ratio=0.005)
# .values instead of DataFrame.as_matrix(), which newer pandas releases no longer provide.
df1 = data_sample[['pickup_longitude', 'pickup_latitude']].values
df2 = data_sample[['dropoff_longitude', 'dropoff_latitude']].values
# Coordinates are rounded to 5 decimals for fitting only.
features = np.round(np.vstack([df1, df2]), 5)
cluster.fit(features)
# Predict on the same plain arrays the model was fitted on (no feature-name mismatch).
data_sample['pickup_cluster_label'] = cluster.predict(df1)
data_sample['dropoff_cluster_label'] = cluster.predict(df2)

# Context manager so the pickle is flushed and closed before it is copied anywhere.
with open("cluster.p", "wb") as cluster_file:
    pickle.dump(cluster, cluster_file)
!cp cluster.p /content/gdrive/My\ Drive/

In [0]:
from matplotlib import pyplot as plt
# Histogram of the labels MiniBatchKMeans stored for its training points.
fig, ax = plt.subplots()
ax.hist(cluster.labels_, bins=50)
plt.show()

#### binning the less frequent values to one group for drop off location column


In [0]:
# Share of trips per drop-off location, largest first.
dropoff_location_distribution = (
    data_sample['dropoff_location']
    .value_counts(normalize=True)
    .sort_values(ascending=False)
    .reset_index()
)
dropoff_location_distribution.columns = ['dropoff_location', 'count']
print(dropoff_location_distribution.info())

# Cumulative share and individual share drawn on the same axes.
dropoff_location_distribution['cum_sum'] = dropoff_location_distribution['count'].cumsum()
dropoff_location_distribution['cum_sum'].plot(style='.-')
dropoff_location_distribution['count'].plot(style='.-')

# Locations keep their own value while the cumulative share is below 0.95;
# the remaining ones are grouped as 'Others'.
dropoff_location_distribution['bin_dropoff_location'] = np.where(
    dropoff_location_distribution['cum_sum'] < 0.95,
    dropoff_location_distribution['dropoff_location'],
    'Others',
)
map_dropoff_location_bins = dict(zip(
    dropoff_location_distribution['dropoff_location'],
    dropoff_location_distribution['bin_dropoff_location'],
))
data_sample['dropoff_location_bin'] = data_sample['dropoff_location'].map(map_dropoff_location_bins)
data_sample.drop(columns=['dropoff_location'], inplace=True)

In [0]:
# Writes the drop-off share table (including the bin column) to the working directory.
dropoff_location_distribution.to_csv('dropoff_location_distribution.csv',index=False)

In [0]:
!cp dropoff_location_distribution.csv /content/gdrive/My\ Drive/

#### binning the less frequent values to one group for pick up location column

In [0]:
# Share of trips per pickup location (normalize=True gives fractions), largest first.
pickup_location_distribution = data_sample['pickup_location'].value_counts(normalize=True).sort_values(ascending=False).reset_index()
pickup_location_distribution.columns = ['pickup_location','count']
print(pickup_location_distribution.info())
# Plot the cumulative and the individual share of every pickup location; the curve is
# used to decide which infrequent locations get binned together.
pickup_location_distribution['cum_sum'] = pickup_location_distribution['count'].cumsum()
pickup_location_distribution['cum_sum'].plot(style='.-')
pickup_location_distribution['count'].plot(style='.-')
# Saved before the bin column is added, so the file holds pickup_location, count and cum_sum only.
pickup_location_distribution.to_csv('pickup_location_distribution.csv',index=False)
#!cp pickup_location_distribution.csv /content/gdrive/My\ Drive/

In [0]:
# Pickup locations keep their own value while the cumulative share is below 0.95;
# the remaining ones are grouped as 'Others'.
pickup_location_distribution['bin_pickup_location'] = np.where(
    pickup_location_distribution['cum_sum'] < 0.95,
    pickup_location_distribution['pickup_location'],
    'Others',
)
map_pickup_location_bins = dict(zip(
    pickup_location_distribution['pickup_location'],
    pickup_location_distribution['bin_pickup_location'],
))
data_sample['pickup_location_bin'] = data_sample['pickup_location'].map(map_pickup_location_bins)
data_sample.drop(columns=['pickup_location'], inplace=True)

# Only these columns are carried into the feature-building cells.
data_sample = data_sample[[
    'unique_key', 'taxi_id',
    'pickup_cluster_label', 'dropoff_cluster_label',
    'company_bin', 'payment_type',
    'dropoff_location_bin', 'pickup_location_bin',
    'trip_seconds', 'trip_miles', 'fare', 'tolls', 'trip_total',
    'trip_start_timestamp', 'trip_end_timestamp',
]]


# Pre Processing and Impute Missing Values

In [0]:
# Calendar features derived from the trip start time.
trip_start = pd.to_datetime(data_sample['trip_start_timestamp'], format='%Y-%m-%d %H:%M:%S')
try:
    # Series.dt.week no longer exists in pandas 2.x; isocalendar() is its replacement.
    week_of_year = trip_start.dt.isocalendar().week.astype('int64')
except AttributeError:
    # Older pandas releases without isocalendar() still provide dt.week.
    week_of_year = trip_start.dt.week

# drop() returns a new frame, so the assignments below do not write into the
# column-subset slice created by the previous cell.
data_sample = data_sample.drop(columns=['trip_start_timestamp'])
data_sample["Month"] = trip_start.dt.month
data_sample["Week"] = week_of_year
data_sample["DayofWeek"] = trip_start.dt.dayofweek
data_sample["hour_start"] = trip_start.dt.hour
data_sample["year"] = trip_start.dt.year
data_sample["day"] = trip_start.dt.day
del trip_start, week_of_year

### Imputing Traffic Per Hour across Date , Hour and Drop off Cluster Level


In [0]:
# Hourly traffic share per drop-off cluster:
# 1) count trips per (Month, day, year, hour_start, dropoff_cluster_label)   -> this cell
# 2) count trips per (Month, day, year, hour_start)                          -> next cell
# 3) divide 1 by 2                                                           -> next cell
# unique_key is the only non-key column in `cols`, so step 1 yields a single
# count column, renamed to 'unique_key_count'.
cols = ['unique_key', 'Month', 'day','year', 'hour_start', 'dropoff_cluster_label']
grp = data_sample[cols].groupby(['Month', 'day', 'year','hour_start', 
                          'dropoff_cluster_label']).agg('count')
grp.columns = [x+'_count' for x in grp.columns]
grp.reset_index(inplace=True)
print(grp.head())
# Default (inner) merge attaches the group count to every trip of that group.
data_sample_grp = pd.merge(data_sample,grp,on=['Month', 'day', 'year','hour_start', 
                          'dropoff_cluster_label'])

In [0]:

# The per-cluster counts are already merged into data_sample_grp.
del grp
# Trips per (Month, day, year, hour_start). Both remaining columns of `cols` are counted,
# giving 'unique_key_count_hr' and 'dropoff_cluster_label_count_hr'.
grp2 = data_sample[cols].groupby(['Month', 'day', 'year','hour_start' ]).agg('count')
grp2.columns = [x+'_count_hr' for x in grp2.columns]
grp2.reset_index(inplace=True)
# data_sample is released here; it is rebound from the merged frame in the next cell.
del data_sample
data_sample_grp_new = pd.merge(data_sample_grp,grp2,on=['Month', 'day', 'year','hour_start'])
# Share of that hour's trips that ended in the trip's drop-off cluster.
data_sample_grp_new['traffic_hr_cluster'] = data_sample_grp_new['unique_key_count']/data_sample_grp_new['unique_key_count_hr']*1.0
print(data_sample_grp_new.head())

In [0]:
# Rebind the working name to the frame that now carries the traffic columns,
# then drop the intermediate names.
data_sample = data_sample_grp_new
del data_sample_grp_new
del data_sample_grp

### Bin Hour data based on demand

In [0]:
import matplotlib.pyplot as plt
# Number of trips per start hour, drawn as a bar chart.
trips_by_hour = data_sample.groupby('hour_start').size().reset_index(name="demand")
ax = trips_by_hour.plot(x='hour_start', y='demand', kind='bar', figsize=(8,6))
ax.set_title("Count of Taxi Rides Per Time of Day")
ax.set_xlabel("Time")
ax.set_ylabel("Count")
plt.show()

In [0]:
#Group the hour column  based on the demand from the above mentioned plot

def getHour_num(x):
    """Map an hour of day to its demand bin.

    Bins: 0 -> hours 0-1, 1 -> 2-7, 2 -> 8-11, 3 -> 12-16, 4 -> 17-20, 5 -> 21-23.
    Fractional hours fall into the bin of the hour they belong to (7.5 -> 1).

    Args:
        x: hour of day, a number with 0 <= x < 24.

    Returns:
        The bin number, an int between 0 and 5.

    Raises:
        ValueError: if ``x`` is outside [0, 24) (NaN included).
    """
    # The range check comes first; NaN fails every comparison and is rejected too.
    if not (0 <= x < 24):
        raise ValueError("hour must satisfy 0 <= hour < 24, got %r" % (x,))
    if x < 2:
        return 0
    if x < 8:
        return 1
    if x < 12:
        return 2
    if x < 17:
        return 3
    if x < 21:
        return 4
    return 5
# One demand bin per trip, derived from its start hour.
data_sample["hour_bin"] = data_sample["hour_start"].map(getHour_num)
# hour_start itself is kept in the frame.
print(data_sample.info())

In [0]:
'''To know trip_seconds and trip duration distributions'''
# .values avoids building a Python list of every row just to wrap it in an array again.
hours = data_sample['hour_start'].values
sns.set(font_scale = 1.5)

# Trip duration (seconds converted to hours) per start hour.
plt.figure(figsize=(15,6))
# x/y are passed by keyword: newer seaborn releases no longer accept them positionally.
sns.boxplot(x=hours, y=data_sample['trip_seconds'].values/3600, showfliers = False)
plt.title('Distribution of trip duration depending on hour of the day')
plt.ylabel('Trip duration in hours')
plt.xlabel('Hour of day from 0:00 to 23:00')

# Trip distance per start hour.
plt.figure(figsize=(15,6))
sns.boxplot(x=hours, y=data_sample['trip_miles'].values, showfliers = False)
plt.title('Distribution of trip distance depending on hour of the day')
# This axis shows trip_miles, so it is labelled as a distance, not a duration.
plt.ylabel('Trip distance in miles')
plt.xlabel('Hour of day from 0:00 to 23:00')



### Using weather conditions from a Kaggle dataset as features

In [0]:
'''Getting weather conditions hourly from this link https://www.kaggle.com/selfishgene/historical-hourly-weather-data#humidity.csv'''
# One DataFrame per weather variable, read from the working directory.
humidity, pressure, temperature, wind_direction, wind_speed = [
    pd.read_csv(csv_name)
    for csv_name in (
        'humidity.csv',
        'pressure.csv',
        'temperature.csv',
        'wind_direction.csv',
        'wind_speed.csv',
    )
]


#### Imputing Missing Values of weather condition using exponential weighted average around 12 hours

In [0]:

'''Imputing Missing Values of weather condition using exponential weighted average around 12 hours'''
def impute_weather_series(readings, feature, city='Chicago', span=12):
    """Extract one city's readings and fill the gaps with an exponential moving average.

    Args:
        readings: DataFrame with a ``datetime`` column and one column per city.
        feature: name of the weather variable; the output column is ``<feature>_imp``.
        city: column of ``readings`` to use.
        span: span of the exponentially weighted mean used for the fill values.

    Returns:
        A new DataFrame with the columns ``datetime`` and ``<feature>_imp``.
        Missing readings take the value of the exponentially weighted mean at
        that row; present readings are kept unchanged.
    """
    # copy(): the rename and the new column must not write into a slice of `readings`.
    series = readings[['datetime', city]].copy()
    series.columns = ['datetime', feature]
    smoothed = series[feature].ewm(span=span).mean()
    series[feature + '_imp'] = series[feature].fillna(smoothed)
    return series.drop(columns=[feature])


humidity_ts = impute_weather_series(humidity, 'humidity')
pressure_ts = impute_weather_series(pressure, 'pressure')
temperature_ts = impute_weather_series(temperature, 'temperature')
wind_direction_ts = impute_weather_series(wind_direction, 'wind_direction')
wind_speed_ts = impute_weather_series(wind_speed, 'wind_speed')


print(wind_speed_ts.head())
print(data_sample.head())

#trying to fill missing values using weighted average


In [0]:
'''Merge all weather columns in a single dataframe '''
# Join the five imputed series on 'datetime', one after another, in the order
# humidity, pressure, temperature, wind direction, wind speed.
file_weather = humidity_ts
for weather_part in (pressure_ts, temperature_ts, wind_direction_ts, wind_speed_ts):
    file_weather = pd.merge(file_weather, weather_part, on=['datetime'])

# Release the single-variable frames, including the loop variable that still
# points at the last one.
del weather_part
del humidity_ts, pressure_ts, temperature_ts, wind_direction_ts, wind_speed_ts

In [0]:
'''Convert timestamp to month , hour , day , year to join with our parent dataset'''
def convert(data,data_sample):
  """Attach the hourly weather readings to every trip.

  The weather timestamps are split into ``Month``, ``hour_start``, ``year`` and
  ``day`` and the two frames are inner-joined on those four columns, so trips
  without a matching weather row are dropped.

  Args:
    data: weather DataFrame with a ``datetime`` column formatted as
      ``%Y-%m-%d %H:%M:%S``. It is not modified.
    data_sample: trip DataFrame holding ``Month``, ``year``, ``day`` and
      ``hour_start`` columns.

  Returns:
    The merged DataFrame: the trip columns followed by the weather columns and
    the parsed weather timestamp ``Datetime_start``.
  """
  # Work on a copy so the helper columns never appear in the caller's weather frame.
  weather = data.copy()
  weather["Datetime_start"] = pd.to_datetime(weather['datetime'],format = '%Y-%m-%d %H:%M:%S')
  weather["Month"] = weather.Datetime_start.dt.month
  weather["hour_start"] = weather.Datetime_start.dt.hour
  weather["year"] = weather.Datetime_start.dt.year
  weather["day"] = weather.Datetime_start.dt.day
  return pd.merge(data_sample,weather,on=['Month','year','day','hour_start'])
# Join trips with weather, order the rows by the weather timestamp that convert()
# adds as 'Datetime_start', then drop that helper column again.
data_sample_new = convert(file_weather,data_sample)
data_sample_new.sort_values(by= 'Datetime_start',inplace=True)
data_sample_new.drop(['Datetime_start'],axis=1,inplace=True)
# Only data_sample_new is used from here on.
del data_sample
del file_weather

In [0]:
#Drop off location insights
# Trip duration (seconds converted to hours) per drop-off location bin.
plt.figure(figsize=(15,6))
# x/y are passed by keyword: newer seaborn releases no longer accept them positionally.
sns.boxplot(x=data_sample_new['dropoff_location_bin'].values,
            y=data_sample_new['trip_seconds'].values/3600, showfliers = False)
plt.title('Distribution of trip seconds depending on drop off location')
plt.ylabel('Trip duration in hours')
plt.xlabel('dropoff_location_bin')

# Same plot for the pickup location bin; title and x label name the pickup column
# that is actually drawn.
plt.figure(figsize=(15,6))
sns.boxplot(x=data_sample_new['pickup_location_bin'].values,
            y=data_sample_new['trip_seconds'].values/3600, showfliers = False)
plt.title('Distribution of trip seconds depending on pickup location')
plt.ylabel('Trip duration in hours')
plt.xlabel('pickup_location_bin')

# Feature Engineering


In [0]:
 
# Remove the join keys and raw columns that are not model inputs.
data_sample_new.drop(['day','year','hour_start','trip_end_timestamp','payment_type','datetime'],axis=1,inplace=True)

# Columns handed to the encoding/scaling steps: 'trip_seconds' is the target,
# the remaining ones are identifiers and features.
data = data_sample_new[['taxi_id','company_bin','pickup_cluster_label','dropoff_cluster_label',
       'dropoff_location_bin', 'pickup_location_bin', 'trip_seconds',
       'trip_miles', 'fare', 'tolls', 'trip_total', 
       'Month', 'Week', 'DayofWeek', 
       'hour_bin','traffic_hr_cluster', 'humidity_imp', 'pressure_imp',
       'temperature_imp', 'wind_direction_imp', 'wind_speed_imp']]
 
# `data` is the only frame needed from here on.
del data_sample_new


In [0]:


#Clear Memory
def release_names(names, namespace):
    """Remove ``names`` from ``namespace`` and report which ones were present.

    Unlike a bare ``del``, a name that is not defined is skipped instead of
    raising ``NameError``.

    Args:
        names: iterable of variable names.
        namespace: dict to delete from (``globals()`` for notebook variables).

    Returns:
        List of the names that existed and were removed, in input order.
    """
    released = []
    for name in names:
        if name in namespace:
            del namespace[name]
            released.append(name)
    return released


# Clear memory: drop intermediate tables, mappings and helpers that are no longer needed.
# Several of these names only exist as locals of binning_values(), hence the tolerant helper.
release_names([
    'dropoff_location_distribution',
    'map_company_bins',
    'humidity',
    'map_dropoff_location_bins',
    'map_pickup_location_bins',
    'map_taxi_id_and_company',
    'pickup_location_distribution',
    'pressure',
    'temperature',
    'wind_direction',
    'wind_speed',
    'categorical_features',
    'company_distribution',
    'continous_features',
    'high_cardinality_features',
    'get_type_of_variables',
    'getHour_num',
], globals())
%who


#### Encode categorical variables and then convert into one hot encoding for feeding to the model



In [0]:
import pickle
from sklearn.preprocessing import OneHotEncoder,StandardScaler, LabelEncoder

'''Encode categorical variables and then convert into one hot encoding for feeding to the model '''
def ohe_features(data): 
    """Label-encode and one-hot encode the categorical feature columns.

    Each column is label-encoded with its own ``LabelEncoder`` (pickled to
    ``enc_<column>.h``), then all columns are one-hot encoded together with a
    fixed number of slots per column (encoder pickled to
    ``ohe_trip_cluster_traffic.h``).

    Args:
        data: DataFrame containing the nine categorical columns listed below.
            It is not modified.

    Returns:
        Dense ``int32`` array with one row per input row and
        ``sum(df_class_num)`` columns.

    Raises:
        ValueError: if a column holds more distinct values than the slots
            reserved for it in ``df_class_num``.
    """
    columns = ['company_bin', 'pickup_cluster_label','dropoff_cluster_label','dropoff_location_bin', 'pickup_location_bin',
       'Month', 'Week', 'DayofWeek', 'hour_bin']
    # One-hot slots reserved per column, in the order of `columns`. The output width
    # is their sum, so it does not depend on how many classes a given sample contains.
    df_class_num =[12,15,15,80,59,12,53,7,6]

    # Encode a private copy: the caller's frame keeps its original values.
    encoded = data[columns].copy()
    for f, width in zip(columns, df_class_num):
        enc = LabelEncoder()
        encoded[f] = enc.fit_transform(encoded[f])
        if len(enc.classes_) > width:
            raise ValueError("column %r has %d classes but only %d one-hot slots are reserved"
                             % (f, len(enc.classes_), width))
        # Context manager so every pickle is flushed and closed.
        with open('enc_'+str(f)+'.h',"wb") as enc_file:
            pickle.dump(enc, enc_file)

    # `categories` replaces the `n_values` argument, which newer scikit-learn
    # releases no longer accept; column i gets the codes 0 .. df_class_num[i]-1.
    ohe = OneHotEncoder(categories=[np.arange(width) for width in df_class_num],
                        dtype=np.int32, sparse=False)

    X_train_1 = ohe.fit_transform(encoded.values)
    with open('ohe_trip_cluster_traffic.h',"wb") as ohe_file:
        pickle.dump(ohe, ohe_file)
    return X_train_1

### Scale the continuous columns and join them with the categorical variables to feed to the model


In [0]:
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import joblib

'''Scale the continous columns and join with the categorical variables to feed to the model'''
def final_process(processed,data_processed):
    """Standardise the continuous columns and append them to the encoded features.

    The fitted ``StandardScaler`` is dumped to ``scaler_trip_cluster_traffic.save``.

    Args:
        processed: 2-D array of already encoded categorical features.
        data_processed: DataFrame holding the continuous columns listed below
            and the ``trip_seconds`` target.

    Returns:
        Tuple ``(features, target)``: ``processed`` with the scaled continuous
        columns appended on the right, and the ``trip_seconds`` Series.
    """
    continous_columns = ['trip_miles','traffic_hr_cluster', 'tolls', 'trip_total','humidity_imp', 'pressure_imp', 'temperature_imp']

    scaler = StandardScaler(with_mean=True)
    scaled_columns = scaler.fit_transform(data_processed[continous_columns])
    joblib.dump(scaler, "scaler_trip_cluster_traffic.save")

    processed_data_new = np.hstack([processed, scaled_columns])
    return processed_data_new, data_processed['trip_seconds']

In [0]:
# Drop the BigQuery module reference and the noisy-feature list; neither is used below.
del bigquery
del noisy_features

In [0]:
#Run the above methods and return the training data set and training target column'''
def final_data_to_model(data_processed):
    """Run the encoding and scaling steps and return the model inputs.

    Args:
        data_processed: DataFrame accepted by ``ohe_features`` and ``final_process``.

    Returns:
        Tuple ``(features, target)`` exactly as produced by ``final_process``.
    """
    one_hot_block = ohe_features(data_processed)
    return final_process(one_hot_block, data_processed)
data1, target1 = final_data_to_model(data)

In [0]:
'''The total number of rows for training were around 5 million after removing outliers and null values of target and cordinates columns from 8 million'''
# Positional split: the first 3,800,000 rows train the model, the rest validate it.
data_train, data_validation = data1[:3800000], data1[3800000:]
target_train, target_validation = target1[:3800000], target1[3800000:]
del data1


### Neural Network Model using Keras

In [0]:
from keras.models import Sequential
from keras.layers import Dense,Activation,Dropout
from keras import regularizers
from keras.wrappers.scikit_learn import KerasRegressor
from keras.layers.normalization import BatchNormalization


model = Sequential()
# Input layer: ReLU activation with he_normal weight initialisation. The input width
# is taken from the training matrix so it always matches the encoded feature count.
model.add(Dense(130, input_dim=data_train.shape[1], kernel_initializer='he_normal', activation='relu'))
# Hidden layer: batch normalisation for better training convergence and dropout
# for generalisation; L2 penalty on the weights.
model.add(Dense(65,kernel_regularizer=regularizers.l2(0.01)))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.5))

# Single linear output unit: the predicted trip_seconds.
model.add(Dense(1, kernel_initializer='he_normal',kernel_regularizer=regularizers.l2(0.01)))

print('fitting model')
# Adam optimiser; the training loss is the mean absolute percentage error.
model.compile(loss='mean_absolute_percentage_error', optimizer='adam')
model.fit(data_train, target_train, epochs=50, batch_size=2048, verbose=1, shuffle=True)

In [0]:
#Save Model for prediction
# Two files are written: the model via save() and its weights alone via save_weights().
model.save('trip_output_cluster_traffic.h5')
model.save_weights('trip_weights_new_cluster_traffic.h5')

### Compute MAPE

In [0]:
#Predict Validation dataset 
predicted_validation = model.predict(data_validation)

# Keep the first element of every prediction row as a flat Python list.
predicted = predicted_validation
predicted_new = [row[0] for row in predicted]

'''Compute MAPE'''
def mean_absolute_percentage_error(y_true, y_pred):
    """Mean absolute percentage error, in percent.

    Both inputs are flattened first, so a column vector of predictions with
    shape ``(n, 1)`` is compared element by element with ``n`` targets instead
    of being broadcast into an ``n x n`` matrix.

    Args:
        y_true: actual values (array-like). Zeros lead to a division by zero
            and therefore to an ``inf``/``nan`` result.
        y_pred: predicted values (array-like) with as many elements as
            ``y_true``, or a single value.

    Returns:
        ``mean(|y_true - y_pred| / |y_true|) * 100`` as a float.

    Raises:
        ValueError: if the inputs hold different numbers of elements and
            neither of them is a single value.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.size != y_pred.size and 1 not in (y_true.size, y_pred.size):
        raise ValueError("y_true has %d elements but y_pred has %d"
                         % (y_true.size, y_pred.size))
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
# Validation score in percent, computed on the rows held out from training.
result = mean_absolute_percentage_error(target_validation, predicted_new)
print(result)
