# Exercise 2.5 Task

## 01 - 02. Import Libraries and Data

In [None]:
# Import Libraries

import pandas as pd
import os
from keplergl import KeplerGl
from pyproj import CRS
import numpy as np
from matplotlib import pyplot as plt

In [None]:
# Load the trip data; the first CSV column is used as the row index.

df = pd.read_csv(
    'newyork_data.csv',
    index_col=0,
)

## 03. Data Preprocessing

In [None]:
df.head(10)

In [None]:
# Tag every trip with a constant 1, then count the trips for each
# (start station, end station) pair. reset_index() turns the two grouping
# keys back into ordinary columns next to the 'value' count.
# Note: with pandas' default groupby(dropna=True), rows whose start or end
# station name is NaN are not represented in the grouped result.

df['value'] = 1
df_group = (
    df.groupby(['start_station_name', 'end_station_name'])['value']
    .count()
    .reset_index()
)

In [None]:
df_group

In [None]:
# Print the total of the grouped counts and, on the next line, the shape of
# the trip-level frame, so the two can be compared by eye.
print(df_group['value'].sum(), df.shape, sep='\n')

In [None]:
df_group['value'].describe()

In [None]:
# Rename the count column from 'value' to 'trips' (modifies df_group in place).

df_group.rename(
    columns={'value': 'trips'},
    inplace=True,
)

In [None]:
df_group

In [None]:
df_group['trips'].describe()

### Exploratory Analysis for Trips

In [None]:
# Histogram of the per-station-pair trip counts, using 100 bins.
df_group['trips'].hist(bins = 100)
plt.show()

In [None]:
# Tabulate how many station pairs share each trip count, ordered by trip count.
# Sorting the index *before* value_counts() had no effect on the output,
# because value_counts() re-orders its result by frequency; sort afterwards.
df_group['trips'].value_counts().sort_index()

In [None]:
# Save the per-pair trip counts to CSV. With to_csv's defaults the DataFrame
# index is written as the first (unnamed) column.
df_group.to_csv('newyork_stationgroupby_final.csv')

In [None]:
df_group.head()

In [None]:
df.head()

### Merge Trip DataFrame with Main DataFrame

In [None]:
# Attach each station pair's trip total ('trips') to the trip-level rows.
# A left join keeps every row of df, matched on the start/end station names.
df_m = df.merge(df_group, on=['start_station_name', 'end_station_name'], how='left')

In [None]:
df_m

In [None]:
# Print the sum of the merged 'trips' column and, on the next line, the shape
# of the merged frame. Each trip row carries its pair's total, so this sum
# counts a pair's total once per trip in that pair.
print(df_m['trips'].sum(), df_m.shape, sep='\n')

In [None]:
df_m.head()

### Create smaller subset of df 

In [None]:
# Reduce the merged frame to one row per (start, end) station pair, keeping
# only the columns used for mapping. drop_duplicates keeps the first
# occurrence of each pair, so the coordinates come from that first row.
df_final = df_m.drop_duplicates(
    subset=['start_station_name', 'end_station_name']
)[[
    'start_station_name', 'end_station_name',
    'start_lat', 'start_lng',
    'end_lat', 'end_lng',
    'trips',
]]

In [None]:
df_final.head()

## 04. Kepler.gl map

In [None]:
# Render a default IntSlider widget.
# NOTE(review): presumably a smoke test that ipywidgets renders in this
# environment before the Kepler.gl map is built - confirm.
from ipywidgets import IntSlider
IntSlider()

In [None]:
df_final.dtypes

In [None]:
df_final.head()

In [None]:
# Looking for missing values

df_final.isnull().sum()

#### Dataframe clean up

In [None]:
# Replace NaNs in the start station name with the placeholder 'Unknown Start Station'

df_final['start_station_name'] = df_final['start_station_name'].fillna('Unknown Start Station')

In [None]:
# Replace NaNs in the end station name with the placeholder 'Unknown End Station'

df_final['end_station_name'] = df_final['end_station_name'].fillna('Unknown End Station')

In [None]:
df_final.columns

In [None]:
# Looking for missing values

df_final.isnull().sum()

In [None]:
# Handling missing values in latitude and longitude. Since we need those to properly map, I'm creating a new df without these.
# This step converts +inf / -inf to NaN in a new frame; df_final itself is left unchanged.
# NOTE(review): this only has an effect if the later cleanup steps start from df_clean_final rather than df_final.

df_clean_final = df_final.replace([np.inf, -np.inf], np.nan)

In [None]:
# Drop rows missing end coordinates.
# Continue from df_clean_final (not df_final) so the inf -> NaN replacement
# applied when df_clean_final was created is not thrown away.

df_clean_final = df_clean_final.dropna(subset=['end_lat', 'end_lng'])

In [None]:
# Looking for missing values

df_clean_final.isnull().sum()

In [None]:
# Drop rows missing trips.
# Continue from df_clean_final (not df_final): restarting from df_final would
# bring back the rows already removed for missing coordinates, and those rows
# would then be passed to the map.

df_clean_final = df_clean_final.dropna(subset=['trips'])

In [None]:
# Looking for missing values

df_clean_final.isnull().sum()

In [None]:
# Print the total of 'trips' over the cleaned station-pair table and, on the
# next line, the table's shape.
print(df_clean_final['trips'].sum(), df_clean_final.shape, sep='\n')

#### Plotting the map

In [None]:
df_clean_final['trips'].value_counts()

In [None]:
df_clean_final.head()

In [None]:
# Build the Kepler.gl widget (700 px tall) with the cleaned station-pair table
# registered as dataset "data_1", then display it as the cell output.

m = KeplerGl(
    height=700,
    data={"data_1": df_clean_final},
)
m

### Updates made to map:
#### For the Start Filter: I removed color based on "Trips" filter. I also changed the start color to purple to differentiate from the stop color, which I changed to teal. 
#### For the Stop Filter: I unhid the end filter and removed color based on trips. 
#### I changed to show the arc of start to end and changed the colors to purple and teal
#### I added filter on trips and changed colors to purple and teal

### Most popular trips made: 
#### The most popular trip starts at Hoboken Terminal - Hudson St and Hudson Pl and ends at Hoboken Ave at Monmouth St, with a total of 5,565 trips.

### Additional Impressions: 
#### There appear to be a lot of single-ride trips all over the city. A large share of the trips start on one side of the Hudson River and end on the other, which suggests that many riders use Citi Bikes to get across the river. The data indicates that a majority of the rides start in the Jersey City/Hoboken, New Jersey area and cross the George Washington Bridge via the dedicated bike path to New York City/Manhattan.

## 07. Creating a config

In [None]:
# Capture the map widget's configuration so it can be saved and reused.
# NOTE(review): per the keplergl docs, m.config reflects the map's state at the
# time this cell runs, so run it after customizing the map - confirm.
config = m.config

In [None]:
config

In [None]:
# Persist the captured map configuration to config.json.
import json

with open("config.json", mode="w") as outfile:
    json.dump(config, outfile)

In [None]:
# Export the map to an HTML file, passing the captured config explicitly.
# NOTE(review): read_only = False presumably keeps the exported map's side
# panel editable - confirm against the keplergl docs.
m.save_to_html(file_name = 'New York Citi Bikes Bike Trips Aggregated.html', read_only = False, config = config)