In [None]:
import math
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

In [None]:
import folium
from folium.plugins import FastMarkerCluster

# names of the pickup coordinate columns, ordered (latitude, longitude)
COORDS = ['pickup_latitude', 'pickup_longitude']

# base map for an interactive geospatial plot; it is built here but not displayed
pickups_cluster = folium.Map(
    location=[40.66, -73.94],
    tiles="Stamen Terrain",
    zoom_start=10,
)

In [None]:
# load the preprocessed trips and discard the stored 'index' column
df = (
    pd.read_feather("../preprocessed_data/cleaned_yellow_19.feather")
    .drop('index', axis=1)
)

# peek at the last rows
df.tail()

In [None]:
# count of non-null final_amount values per pickup zone, largest first
(
    df.groupby('PULocationID')['final_amount']
    .count()
    .reset_index()
    .sort_values('final_amount', ascending=False)
)

In [None]:
# display the trips DataFrame loaded above
df

In [None]:
import geopandas as gpd

# sf stands for shape file
sf = gpd.read_file("../raw_data/taxi_zones/taxi_zones.shp")
zone = pd.read_csv("../raw_data/taxi_zones/taxi+_zone_lookup.csv")

# Convert the geometry shapes to latitude and longitude
# Please attribute this if you are using it
longlat_wgs84 = "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
sf['geometry'] = sf['geometry'].to_crs(longlat_wgs84)

In [None]:
# display the taxi-zone GeoDataFrame read from the shapefile
sf

In [None]:
# first rows of the taxi-zone GeoDataFrame
sf.head()

In [None]:
# join every trip to the shapefile row of its pickup zone, then drop the
# now-redundant PULocationID key (LocationID carries the same value)
gdf = gpd.GeoDataFrame(
    pd.merge(df, sf, left_on='PULocationID', right_on='LocationID')
).drop('PULocationID', axis=1)

# two random rows as a sanity check
gdf.sample(2)

In [None]:
# keep one (LocationID, geometry) row per zone and serialise it to a GeoJSON string
zone_shapes = gdf[['LocationID', 'geometry']].drop_duplicates('LocationID')
geoJSON = zone_shapes.to_json()

In [None]:
m = folium.Map(
    location=[40.66, -73.94],
    tiles="Stamen Terrain",
    zoom_start=10,
)

# refer to the folium documentation on how to plot aggregated data.
# No data is bound here: this layer only draws the zone shapes from geoJSON.
zone_outlines = folium.Choropleth(geo_data=geoJSON, name='choropleth')
m.add_child(zone_outlines)

# write the map to disk, then display it as the cell output
m.save('../plots/foliumChoroplethMap.html')
m

In [None]:
import json

# an example of what the geoJSON looks like: parse the GeoJSON string back
# into Python objects and display the result
json.loads(geoJSON)

In [None]:
# trips whose total_amount is negative
is_negative_total = gdf['total_amount'] < 0
gdf.loc[is_negative_total]

In [None]:
# total_amount summed per pickup zone, with LocationID kept as a regular column
zone_amounts = gdf[['LocationID', 'total_amount']]
zone_amounts.groupby('LocationID').sum().reset_index()

In [None]:
m_trip_distance = folium.Map(location=[40.66, -73.94], tiles="Stamen Terrain", zoom_start=10)

# A choropleth shades each zone by ONE value, so the trip-level rows have to be
# collapsed to a single row per LocationID first. Passing gdf directly would
# hand folium a key column that repeats once per trip, and the colour of a zone
# would then not represent any aggregate of its trips.
zone_totals = (
    gdf[['LocationID', 'total_amount']]
    .groupby('LocationID')
    .sum()
    .reset_index()
)

# refer to the folium documentations on more information on how to plot aggregated data.
folium.Choropleth(
    geo_data=geoJSON, # geoJSON
    name='choropleth', # name of plot
    data=zone_totals, # one aggregated row per zone
    columns=['LocationID','total_amount'], # [key column, value column]
    key_on='properties.LocationID', # this is from the geoJSON's properties
    fill_color='OrRd', # color scheme
    fill_opacity=0.9,
    line_opacity=0.5,
    legend_name='Total amount per pickup zone' # legend title: describes the plotted value
).add_to(m_trip_distance)

# write the map to disk, then display it as the cell output
m_trip_distance.save('../plots/foliumChoroplethMapTrips.html')
m_trip_distance

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# reload the preprocessed trips, discarding the stored 'index' column
df = (
    pd.read_feather("../preprocessed_data/cleaned_yellow_19.feather")
    .drop('index', axis=1)
)
# display the frame
df


In [None]:
# display the trips DataFrame again
df

In [None]:
# fare_amount against trip_distance, one point per trip
distance_and_fare = df[['trip_distance', 'fare_amount']]
distance_and_fare.plot.scatter(x='trip_distance', y='fare_amount')
plt.show()

In [None]:
# distribution of fare_amount on its original scale, 30 bins
fare_amount = df['fare_amount']
sns.distplot(fare_amount, bins=30)
plt.show()

In [None]:
from numpy import log, sqrt

def logify(x):
    """Return the natural log of ``x``, or 0 when ``x`` is falsy (e.g. zero)."""
    if not x:
        return 0
    return log(x)

# distribution of the log-transformed fare_amount, 50 bins
log_fare_amount = df['fare_amount'].apply(logify)
sns.distplot(log_fare_amount, bins=50)
plt.show()

In [None]:
# same plot as with logify, with the transformation written inline:
# log of every non-zero fare, 0 otherwise
log_fare_amount = df['fare_amount'].apply(lambda fare: log(fare) if fare else 0)
sns.distplot(log_fare_amount, bins=50)
plt.show()

In [None]:
# distribution of trip_distance restricted to trips with trip_distance <= 15
within_limit = df['trip_distance'] <= 15
data = df.loc[within_limit, 'trip_distance']

sns.distplot(data, bins=30)
plt.show()

In [None]:
# Pearson correlation table for distance and fare amount
distance_fare_cols = ['trip_distance', 'fare_amount']
df[distance_fare_cols].corr(method='pearson')

In [None]:
# Correlation heatmap over every numeric column of the trips frame.
# Non-numeric columns are filtered out explicitly: recent pandas releases no
# longer drop them silently inside DataFrame.corr() and raise instead, while
# select_dtypes behaves the same on old and new versions.
numeric_trips = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_trips.corr())

plt.show()

In [None]:
# columns included in the focused correlation table
CORR_COLS = [
    "trip_distance",
    "fare_amount",
    "tip_amount",
    "total_amount",
    "time_used",
    "avg_speed",
    "final_amount",
    "path",
]

# pairwise correlation between the selected columns
df[CORR_COLS].corr()

In [None]:
# heatmap of the correlation table restricted to CORR_COLS
selected_corr = df[CORR_COLS].corr()
sns.heatmap(selected_corr)
plt.show()

In [None]:
# summary statistics of final_amount that feed the bin-count rules
final_amount = df['final_amount']
MAX = final_amount.max()
MIN = final_amount.min()
SD = final_amount.std()
# Series.quantile() called without an argument returns the 0.5 quantile (the
# median), so the interquartile range must be computed explicitly as Q3 - Q1.
IQR = final_amount.quantile(0.75) - final_amount.quantile(0.25)
N = len(df)

In [None]:
from numpy import log, log2

def sturges(x):
    """Number of histogram bins by Sturges' rule: ceil(log2(x)) + 1.

    Parameters
    ----------
    x : int
        Number of observations; must be positive.

    Returns
    -------
    int
        Bin count. The logarithm is rounded up (not truncated) so sample sizes
        that are not powers of two do not lose a bin.

    Raises
    ------
    ValueError
        If ``x`` is not positive (the logarithm is undefined).
    """
    return math.ceil(math.log2(x)) + 1

def rice(x):
    """Number of histogram bins by the Rice rule: ceil(2 * x^(1/3)).

    Parameters
    ----------
    x : int
        Number of observations.

    Returns
    -------
    int
        Bin count, rounded up. Rounding up also absorbs the floating-point
        error of the cube root: 64 ** (1/3) evaluates to 3.9999999999999996,
        which truncation would turn into 7 bins instead of 8.
    """
    return math.ceil(2 * x ** (1 / 3))

def scott(large, small, sd, x):
    """Number of histogram bins by Scott's rule.

    The bin width is ``3.5 * sd / x^(1/3)`` and the bin count is the data
    range divided by that width, rounded up.

    Parameters
    ----------
    large, small : float
        Maximum and minimum of the data.
    sd : float
        Standard deviation of the data.
    x : int
        Number of observations.

    Returns
    -------
    int
        Bin count, at least 1. A zero bin width (no spread) or a zero range
        yields a single bin instead of a division error or an unusable 0.
    """
    width = 3.5 * (sd / x ** (1 / 3))
    if width == 0:
        # all observations identical: one bin holds everything
        return 1
    return max(1, math.ceil((large - small) / width))

def freedman(large, small, iqr, x):
    """Number of histogram bins by the Freedman-Diaconis rule.

    The bin width is ``2 * iqr / x^(1/3)`` and the bin count is the data
    range divided by that width, rounded up.

    Parameters
    ----------
    large, small : float
        Maximum and minimum of the data.
    iqr : float
        Interquartile range (Q3 - Q1) of the data.
    x : int
        Number of observations.

    Returns
    -------
    int
        Bin count, at least 1. A zero bin width (zero IQR) or a zero range
        yields a single bin instead of a division error or an unusable 0.
    """
    width = 2 * (iqr / x ** (1 / 3))
    if width == 0:
        # no spread between the quartiles: fall back to a single bin
        return 1
    return max(1, math.ceil((large - small) / width))
    
def square(x):
    """Number of histogram bins by the square-root choice: ceil(sqrt(x)).

    Parameters
    ----------
    x : int
        Number of observations; must be non-negative.

    Returns
    -------
    int
        Bin count, rounded up rather than truncated.

    Raises
    ------
    ValueError
        If ``x`` is negative.
    """
    # math.sqrt keeps this helper independent of names imported in other cells
    return math.ceil(math.sqrt(x))

def logify(x):
    """Natural log of ``x`` for truthy ``x``; 0 for falsy input such as zero."""
    if not x:
        return 0
    return log(x)

In [None]:
# histograms of final_amount, one per bin-count rule
final_amount = df['final_amount']

sturges_bins = sturges(N)
fig1 = sns.distplot(final_amount, bins=sturges_bins)
plt.title("Sturges Binnings")
plt.show()

rice_bins = rice(N)
fig2 = sns.distplot(final_amount, bins=rice_bins)
plt.title("Rice Binnings")
plt.show()

scott_bins = scott(MAX, MIN, SD, N)
fig3 = sns.distplot(final_amount, bins=scott_bins)
plt.title("Scott Binnings")
plt.show()

freedman_bins = freedman(MAX, MIN, IQR, N)
fig4 = sns.distplot(final_amount, bins=freedman_bins)
plt.title("Freedman Binnings")
plt.show()

square_bins = square(N)
fig5 = sns.distplot(final_amount, bins=square_bins)
plt.title("Square Binnings")
# plt.show() is intentionally not called for this last figure

In [None]:
# histograms of the log-transformed final_amount, one per bin-count rule

# transform once and reuse the result for every plot
log_final_amount = df['final_amount'].apply(logify)

# Scott's and Freedman's rules derive the bin count from the range and spread
# of the data being plotted, so they need the statistics of the log-scale
# values rather than the raw-scale MAX / MIN / SD / IQR.
log_max = log_final_amount.max()
log_min = log_final_amount.min()
log_sd = log_final_amount.std()
log_iqr = log_final_amount.quantile(0.75) - log_final_amount.quantile(0.25)

fig1 = sns.distplot(log_final_amount, bins=sturges(N))
plt.title("log Sturges Binnings")
plt.show()

fig2 = sns.distplot(log_final_amount, bins=rice(N))
plt.title("log Rice Binnings")
plt.show()

fig3 = sns.distplot(log_final_amount, bins=scott(log_max, log_min, log_sd, N))
plt.title("log Scott Binnings")
plt.show()

fig4 = sns.distplot(log_final_amount, bins=freedman(log_max, log_min, log_iqr, N))
plt.title("log Freedman Binnings")
plt.show()

fig5 = sns.distplot(log_final_amount, bins=square(N))
plt.title("log Square Binnings")
plt.show()