# Exploring Dataset and Feature Engineering

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectKBest, chi2, f_regression
import zipfile
from pyproj import Proj, transform
import geopandas as gpd
import shapefile
# a nice way of filtering out deprecated warnings
import warnings
import geopandas as gpd
warnings.filterwarnings("ignore")

## Open Taxi and FHV Datasets

In [120]:
df1 = pd.DataFrame({'a': [1,2,3,4], 'b': [1,2, 3, np.nan], 'e': ['a', 1, 2,'b']})
df2 = pd.DataFrame({'a': [1,2,3,4], 'b': [np.nan, 4, 5, 6]})
df1

Unnamed: 0,a,b,e
0,1,1.0,a
1,2,2.0,1
2,3,3.0,2
3,4,,b


In [121]:
df2

Unnamed: 0,a,b
0,1,
1,2,4.0
2,3,5.0
3,4,6.0


In [135]:
df1.combine_first(df1[['a', 'e']].merge(df2))
df1

Unnamed: 0,a,b,e
0,1,1.0,a
1,2,4.0,1
2,3,5.0,2
3,4,6.0,b


In [94]:
df_taxi= pd.read_feather("../preprocessed_data/feather/yellow_tripdata_2019.feather")
print("df_taxi read")
df_fhv= pd.read_feather("../preprocessed_data/feather/fhv_tripdata_2019.feather")
print("df_fhv read")

df_taxi read
df_fhv read


Since feather file doesn't support datetime datatype, convert it again into datetime

In [3]:
df_taxi['pickup_datetime']= pd.to_datetime(df_taxi['pickup_datetime'])
print("converted pickup to ", type(df_taxi['pickup_datetime'][0]))
df_fhv['pickup_datetime']= pd.to_datetime(df_fhv['pickup_datetime'])
print("converted dropoff to ", type(df_taxi['dropoff_datetime'][0]))

converted pickup to  <class 'pandas._libs.tslibs.timestamps.Timestamp'>
converted dropoff to  <class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [5]:
train_taxi= df_taxi.loc[df_taxi['pickup_datetime']< pd.Timestamp(datetime(2019, 5, 1))]
print("train_taxi made")
train_fhv= df_fhv.loc[df_fhv['pickup_datetime']< pd.Timestamp(datetime(2019, 5, 1))]
print("train_fhv made")
test_taxi= df_taxi.loc[df_taxi['pickup_datetime']> pd.Timestamp(datetime(2019, 4, 30))]
print("test_taxi made")
test_fhv= df_fhv.loc[df_fhv['pickup_datetime']> pd.Timestamp(datetime(2019, 4, 30))]
print("test_fhv made")

train_taxi made
train_fhv made
test_taxi made
test_fhv made


In [9]:
del df_taxi
del df_fhv
print("original taxi and fhv df deleted from memory")

original df deleted from memory


In [7]:
train_df = train_taxi.append(train_fhv, ignore_index = True)
del train_taxi
del train_fhv
print("train_df made and train_taxi and train_fhv deleted")
test_df = test_taxi.append(test_fhv, ignore_index = True)
del test_taxi
del test_fhv
print("test_df made and test_taxi and test_fhv deleted")

train_df made and train_taxi and train_fhv deleted
test_df made and test_taxi and test_fhv deleted


In [8]:
# open location zone information
dfzone = pd.read_csv("../raw_data/taxi+_zone_lookup.csv")
# open location shapefile
with zipfile.ZipFile(open(r'../data/large/taxi_zones.zip', 'rb')) as zip_ref:
    zip_ref.extractall('../data/large/')
sf = gpd.read_file("../data/large/taxi_zones.shp")

## Save data to df

In [14]:
print("Null values in training df:")
print(train_df.isnull().sum())
print("Null values in testing df:")
print(test_df.isnull().sum())

Null values in training df:
index                           0
pickup_datetime                 0
dropoff_datetime                0
passenger_count          45469918
trip_distance            45469918
PULocationID                    0
DOLocationID                    0
fare_amount              45469918
extra                    45469918
mta_tax                  45469918
tip_amount               45469918
tolls_amount             45469918
improvement_surcharge    45469918
total_amount             45469918
congestion_surcharge     45469918
total_trip_duration             0
avespeed_mileshr         45469918
time session                    0
date                            0
hour                            0
dtype: int64
Null values in testing df:
index                           0
pickup_datetime                 0
dropoff_datetime                0
passenger_count          22151715
trip_distance            22151715
PULocationID                    0
DOLocationID                    0
fare_amount   

In [24]:
del train_df["index"]
del test_df["index"]

In [29]:
train_df.columns, test_df.columns

(Index(['pickup_datetime', 'dropoff_datetime', 'passenger_count',
        'trip_distance', 'PULocationID', 'DOLocationID', 'fare_amount', 'extra',
        'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
        'total_amount', 'congestion_surcharge', 'total_trip_duration',
        'avespeed_mileshr', 'time session', 'date', 'hour'],
       dtype='object'),
 Index(['pickup_datetime', 'dropoff_datetime', 'passenger_count',
        'trip_distance', 'PULocationID', 'DOLocationID', 'fare_amount', 'extra',
        'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
        'total_amount', 'congestion_surcharge', 'total_trip_duration',
        'avespeed_mileshr', 'time session', 'date', 'hour'],
       dtype='object'))

In [67]:
#train_df.isnull().sum()

pickup_datetime                 0
dropoff_datetime                0
passenger_count          45469918
trip_distance            45469918
PULocationID                    0
DOLocationID                    0
fare_amount              45469918
extra                    45469918
mta_tax                  45469918
tip_amount               45469918
tolls_amount             45469918
improvement_surcharge    45469918
total_amount             45469918
congestion_surcharge     45469918
total_trip_duration             0
avespeed_mileshr         45469918
time session                    0
date                            0
hour                            0
dtype: int64

In [199]:
#median of everything
X_train= train_df.groupby(['date', 'time session','PULocationID'], as_index= False)['passenger_count'].median()
X_train_2= train_df.dropna().groupby(['date', 'time session','PULocationID'], as_index= False).median()
X_train= pd.merge(X_train, X_train_2, on=['date', 'time session', 'PULocationID'] )
del X_train_2
X_train.columns



Index(['date', 'time session', 'PULocationID', 'passenger_count_x',
       'passenger_count_y', 'trip_distance', 'DOLocationID', 'fare_amount',
       'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
       'improvement_surcharge', 'total_amount', 'congestion_surcharge',
       'total_trip_duration', 'avespeed_mileshr', 'hour'],
      dtype='object')

In [196]:
df_taxi["PULocationID"].nunique()

252

In [197]:
X_train["PULocationID"].nunique()

244

In [167]:
X_train_2= train_df.dropna()

KeyboardInterrupt: 

In [None]:
X_train_2= X_train_2.groupby(['date', 'time session','PULocationID'], as_index= False).median().dropna()
X_train_2.head(10)

In [None]:
X_train_2.combine_first(df1[['a', 'e']].merge(df2))

In [93]:
X_train.isnull().sum()

date                         0
time session                 0
PULocationID                 0
DOLocationID             34795
total_trip_duration      34795
hour                     34795
passenger_count          34795
trip_distance            34795
fare_amount              34795
extra                    34795
mta_tax                  34795
tolls_amount             34795
improvement_surcharge    34795
total_amount             34795
congestion_surcharge     34795
avespeed_mileshr         34795
dtype: int64

In [84]:
X_train.loc[X_train['DOLocationID'].isnull()]

Unnamed: 0,date,time session,PULocationID,DOLocationID,total_trip_duration,hour,passenger_count,trip_distance,fare_amount,extra,mta_tax,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,avespeed_mileshr
0,2019-03-01,1,1,,,,,,,,,,,,,
3,2019-03-01,1,5,,,,,,,,,,,,,
4,2019-03-01,1,6,,,,,,,,,,,,,
6,2019-03-01,1,8,,,,,,,,,,,,,
7,2019-03-01,1,9,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59525,2019-04-30,4,252,,,,,,,,,,,,,
59526,2019-04-30,4,254,,,,,,,,,,,,,
59529,2019-04-30,4,257,,,,,,,,,,,,,
59530,2019-04-30,4,258,,,,,,,,,,,,,


In [179]:
ex= df_taxi.loc[train_df['date']==train_df.iloc[0,17]]
ex= ex.loc[ex['time session']==1 ]
#ex= ex.loc[ex['PULocationID']==1]
print(ex['PULocationID'].nunique)
del ex

<bound method IndexOpsMixin.nunique of 16747       246
16876         7
17424       186
17435       186
17476       262
           ... 
14011961    158
14011962     79
14016339     88
14016340    148
14016561     43
Name: PULocationID, Length: 353880, dtype: int16>


In [174]:
#Upon further inspection,
#it seems as if the Null values are values with only fhv values
#Hence, remove it
train_df.shape

(58652320, 19)

In [None]:
X_test= test_df.groupby(['date', 'time session','PULocationID'], as_index= False)['passenger_count'].sum()
y_test= X_test[["date","time session", "PULocationID", "DOLocationID", "total_trip_duration", "hour"]]

print("X_test PARTIALLY created")
X_train_2= train_df.dropna().groupby(['date', 'time session','PULocationID'], as_index= False).median()
X_train_2= X_train_2[["passenger_count", "trip_distance", "fare_amount", "extra", "mta_tax", "tolls_amount",
                     "improvement_surcharge","total_amount","congestion_surcharge","avespeed_mileshr"]]
X_train= pd.merge(X_train, X_train_2, on=['date', 'time session', 'PULocationID'] )
del X_train_2
print("X_train FULLY created")
print(X_train.head())
print(X_train.isnull().sum())


In [None]:
del train_df

In [None]:
X_test= test_df.groupby(['date', 'time session','PULocationID'], as_index= False).median()
X_test= X_test[["date","time session", "PULocationID", "pickup_datetime","dropoff_datetime",
                 "PULocationID","DOLocationID", "total_trip_duration", "hour"]]
print("X_test PARTIALLY created")
X_test_2= test_df.groupby(['date', 'time session','PULocationID'], as_index= False, dropna=True).median()
X_test_2= X_test_2[["passenger_count","trip_distance","fare_amount", "extra","mta_tax", "tolls_amount",
                     "improvement_surcharge","total_amount","congestion_surcharge","avespeed_mileshr"]]
X_test= X_test.join(X_test_2, lsuffix='_median', rsuffix='_drop_na_median')
del X_test_2
print("X_test FULLY created")
print(X_test.head())
print(X_test.isnull().sum())

In [None]:
y_test= test_df.groupby(['date', 'time session','PULocationID'], as_index= False)['passenger_count'].sum()

In [None]:
del test_df

## Feature Selection

In [None]:
bestfeatures = SelectKBest(score_func=f_regression, k=5).fit_transform(X_train,y_train)
fit = bestfeatures.fit(X_train,y_train)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X_train.columns)
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
print(featureScores.nlargest(10,'Score'))  #print 10 best features

## Finding the most popular location zones from both Datasets
First, we have to inspect the geographic data. We are dealing with dfzone, df_taxi and df_fhv as numeric data using .describe() only to find if the range of locations match with the classification data given

In [None]:
dfzone.describe()

In [None]:
geo_data= ['PULocationID', 'DOLocationID']
df_taxi[geo_data].describe().round()

In [None]:
df_fhv[geo_data].describe().round()

Let's graph the bargraphs of these data

In [None]:
sns.barplot(df_taxi['PULocationID'].sort_values().value_counts().index, df_taxi['PULocationID'].value_counts())
plt.title('Taxi Dataset PULocationID')
plt.show()
sns.barplot(df_fhv['PULocationID'].sort_values().value_counts().index, df_fhv['PULocationID'].value_counts())
plt.title('FHV Dataset PULocationID')
plt.show()

In order to determine the number of most popular zones to inspect, it is good to visualise the zones against the count of zones for both data again.

In [None]:
sns.barplot(df_taxi['PULocationID'].sort_values().value_counts().index, df_taxi['PULocationID'].value_counts())
plt.title('Taxi Dataset PULocationID')
plt.show()
sns.barplot(df_fhv['PULocationID'].sort_values().value_counts().index, df_fhv['PULocationID'].value_counts())
plt.title('FHV Dataset PULocationID')
plt.show()

Under the assumption that the uber dataset is a sample that accurately reflects the true distribution of PULocationID in the real world, the tot number of locationID will be adjusted to range in between 0-0.5. 

In [None]:
taxi_values_adj= df_taxi['PULocationID'].sort_values().value_counts()/df_taxi['PULocationID'].value_counts().sum()
fhv_values_adj = df_fhv['PULocationID'].sort_values().value_counts()/df_fhv['PULocationID'].value_counts().sum()
total_values_adj= taxi_values_adj.add(fhv_values_adj, fill_value=0)

In [None]:
sns.barplot(total_values_adj.index, total_values_adj)
plt.title('Taxi Dataset PULocationID')
plt.show()

Retrieve the top 25 PULocation ID 

In [None]:
#create df with index (LocationID) as a column
df = pd.DataFrame({
     'LocationID': total_values_adj.index,
     'Count': total_values_adj
 })
df= df[0:-1] #remove invalid last entry

# sort by highest percentage to lowest and get the top 25 LocationID
df= df.sort_values(by='Count', ascending=False)
top25= df['LocationID'].head(25)
#top50= df['LocationID'].head(50)
#delete for memory
del df

In [None]:
df_taxi.head()

## Visualising Geographic data

In [None]:
# Convert the geometry shaape to to latitude and longitude
# Please attribute this if you are using it
sf['geometry'] = sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")

In [None]:
sf.head()

In [None]:
gdf = gpd.GeoDataFrame(pd.merge(df, sf, left_on='PULocationID', right_on='LocationID')).drop('PULocationID',axis=1)

In [None]:
gdf.sample(2)

In [None]:
geoJSON = gdf[['LocationID','geometry']].drop_duplicates('LocationID').to_json()

In [None]:
m = folium.Map(location=[40.66, -73.94], tiles="Stamen Terrain", zoom_start=10)

# refer to the folium documentations on how to plot aggregated data.
m.add_child(folium.Choropleth(
    geo_data=geoJSON,
    name='choropleth',
))

m.save('../plots/foliumChoroplethMap.html')

In [None]:
import json

# an example of what the geoJSON looks like
json.loads(geoJSON)

In [None]:
gdf[['LocationID','total_amount']].groupby('LocationID').sum().reset_index()

In [None]:
m_trip_distance = folium.Map(location=[40.66, -73.94], tiles="Stamen Terrain", zoom_start=10)

# refer to the folium documentations on more information on how to plot aggregated data.
folium.Choropleth(
    geo_data=geoJSON, # geoJSON 
    name='choropleth', # name of plot
    data=gdf, # data source
    columns=['LocationID','total_amount'], # the columns required
    key_on='properties.LocationID', # this is from the geoJSON's properties
    fill_color='OrRd', # color scheme
    fill_opacity=0.9,
    line_opacity=0.5,
    legend_name='Trips' # legend title
).add_to(m_trip_distance)

m_trip_distance.save('../plots/foliumChoroplethMapTrips.html')

In [None]:
df_fhv.head()

In [None]:
def latitude_to_mercator(coords):
    """
    Function which converts an array of latitude coordinates 
    into its mercator coordinate representation
    """
    k = 6378137
    converted = list()
    for lat in coords:
        converted.append(np.log(np.tan((90 + lat) * np.pi/360.0)) * k)
    return converted

def longitude_to_mercator(coords):
    """
    Function which converts an array of longitude coordinates 
    into its mercator coordinate representation
    """
    k = 6378137
    converted = list()
    for lon in coords:
        converted.append(lon * (k * np.pi/180.0))
    return converted

In [None]:
# mcoords = the middle coordinates for the map
pickup_geo_data= ['pickup_latitude', 'pickup_longitude']
mcoords = df_tot[pickup_geo_data].describe().loc[["50%"]].values[0]

# axis ranges
xRange = [df_tot['pickup_longitude'].min(), df_tot['pickup_longitude'].max()]
yRange = [df_tot['pickup_latitude'].min(), df_tot['pickup_latitude'].max()]



In [None]:
from bokeh.plotting import figure, show
from bokeh.tile_providers import get_provider, Vendors

# to display bokeh plots inside jupyter, we need to use output_notebook
from bokeh.io import reset_output, output_notebook

reset_output()
output_notebook()
# note below that it says "BokehJS 1.4.0 successfully loaded."

In [None]:
TILE = get_provider("STAMEN_TERRAIN_RETINA")

pickup_m = figure(x_range=longitude_to_mercator(xRange), y_range=latitude_to_mercator(yRange),
       x_axis_type="mercator", y_axis_type="mercator")
pickup_m.add_tile(TILE)
pickup_m.title.text = "Pickups in NYC"

In [None]:
#convert to merccer
df_tot['pickupX'] = df_tot['pickup_longitude'].apply(lambda x: longitude_to_mercator([x])[0])
df_tot['pickupY'] = df_tot['pickup_latitude'].apply(lambda x: latitude_to_mercator([x])[0])
df_tot[['pickupX','pickupY']]

In [None]:
# for every source value, draw a small circle denoting a pickup
pickup_m.circle(x='pickupX', y='pickupY', 
         size=5, fill_color="blue", fill_alpha=0.5, 
         source=df_tot[['pickupX','pickupY']])
show(pickup_m)

In [None]:
#for drop offs
# create map
dropoff = figure(x_range=longitude_to_mercator(xRange), y_range=latitude_to_mercator(yRange),
       x_axis_type="mercator", y_axis_type="mercator")
dropoff.add_tile(TILE)
dropoff.title.text = "Dropoff in NYC"

# convert to mercer
df_tot['dropoffX'] = df_tot['dropoff_longitude'].apply(lambda x: longitude_to_mercator([x])[0])
df_tot['dropoffY'] = df_tot['dropoff_latitude'].apply(lambda x: latitude_to_mercator([x])[0])

# plot circles (source = data source)
dropoff.circle(x='dropoffX', y='dropoffY', 
         size=5, color="pink", fill_color="red", fill_alpha=0.5, 
         source=df_tot[['dropoffX','dropoffY']])

show(dropoff)

In [None]:
show(dropoff)

In [None]:
df_tot['tpep_trip_totaltime']= df_tot['tpep_dropoff_datetime'] - df_tot['tpep_pickup_datetime']

In [None]:
df_tot.describe().round()

In [None]:
num_clusters = 20
km = KMeans(n_clusters=num_clusters)
km.fit(data)

centers = km.cluster_centers_

km_loc_pickup= figure(x_range=longitude_to_mercator(xRange), y_range=latitude_to_mercator(yRange),
       x_axis_type="mercator", y_axis_type="mercator")
km_loc_pickup.add_tile(TILE)
km_loc_pickup.title.text = "Pickups in NYC"

# plot centroid / cluster center / group mean for each group
clus_xs = []
clus_ys = []

#we get the  cluster x / y values from the k-means algorithm
for entry in centers:
    clus_xs.append(entry[0])
    clus_ys.append(entry[1])

# the cluster center is marked by a circle, with a cross in it
km_loc_pickup.circle_cross(x=clus_xs, y=clus_ys, size=40, fill_alpha=0, line_width=2, color= "red")


# plot circles (source = data source)
km_loc_pickup.circle(x='pickupX', y='pickupY', 
         size=5, color="pink", fill_color="red", fill_alpha=0.5, 
         source=df_tot[['pickupX','pickupY']])

show(km_loc_pickup.circle)

In [None]:
#Location Data