#### Citi Bike Analytics Workbook

In [None]:
import pandas as pd

In [None]:
# Paths to the monthly trip-data CSV exports (2021-01 through 2021-03, going by
# the file names).
jan = "../Tableau-CitiBike-Visualization/Data/JC-202101-citibike-tripdata.csv"
feb = "../Tableau-CitiBike-Visualization/Data/JC-202102-citibike-tripdata.csv"
mar = "../Tableau-CitiBike-Visualization/Data/JC-202103-citibike-tripdata.csv"

# Load each month with pandas' default parsing. No `parse_dates` is passed, so
# the timestamp columns are not converted to datetimes at read time.
jan_data, feb_data, mar_data = map(pd.read_csv, (jan, feb, mar))

In [None]:
def _add_date_time_columns(frame, start_col, end_col):
    """Add separate date and time-of-day columns for a trip's start and end.

    Args:
        frame: DataFrame to extend in place.
        start_col: Name of the column holding the trip start timestamp.
        end_col: Name of the column holding the trip end timestamp.

    The columns `start_date`, `start_time`, `end_date` and `end_time` are
    added to `frame`, in that order. Nothing is returned.
    """
    # Parse each timestamp column once and reuse the result for both the date
    # and the time part, instead of re-parsing the same strings per column.
    started = pd.to_datetime(frame[start_col])
    ended = pd.to_datetime(frame[end_col])

    frame["start_date"] = started.dt.date
    frame["start_time"] = started.dt.time
    frame["end_date"] = ended.dt.date
    frame["end_time"] = ended.dt.time


# January names its timestamp columns 'starttime'/'stoptime'; February and
# March use 'started_at'/'ended_at'.
_add_date_time_columns(jan_data, "starttime", "stoptime")
_add_date_time_columns(feb_data, "started_at", "ended_at")
_add_date_time_columns(mar_data, "started_at", "ended_at")

In [None]:
# Preview the first row of the February data to check its columns.
feb_data.head(1)

In [None]:
# Show the dtype pandas assigned to each February column.
feb_data.dtypes

In [None]:
# Rename the January columns to the snake_case names the rest of the notebook
# uses (started_at, start_station_id, start_lat, ...), so January can be
# stacked with the February/March data.
#
# `rename` is called without inplace=True so that `new_jan_df` is a separate
# frame and `jan_data` keeps its original columns; this lets the rename/drop
# cells be re-run without reloading the CSV. Each source column is listed
# exactly once.
new_jan_df = jan_data.rename(columns={
    "tripduration": "trip_duration",
    "starttime": "started_at",
    "stoptime": "ended_at",
    "start station id": "start_station_id",
    "end station id": "end_station_id",
    "start station name": "start_station_name",
    "end station name": "end_station_name",
    "start station latitude": "start_lat",
    "end station latitude": "end_lat",
    "start station longitude": "start_lng",
    "end station longitude": "end_lng",
})

In [None]:
# Remove the fields that are dropped from January so its columns line up with
# the February/March data.
new_jan_df.drop(
    columns=["trip_duration", "usertype", "bikeid", "birth year", "gender"],
    inplace=True,
)

In [None]:
# February and March have the same column headers, so stack them row-wise into
# one frame.
new_feb_mar_df = pd.concat((feb_data, mar_data), axis=0)

In [None]:
# Remove the fields that are dropped from February/March so their columns line
# up with the January data.
new_feb_mar_df.drop(
    columns=["ride_id", "rideable_type", "member_casual"],
    inplace=True,
)

In [None]:
# Stack January on top of February/March to get one frame with all trips.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it every
# month keeps its own row numbers, so the combined frame (and the index column
# of any CSV written from it) contains duplicate labels.
# sort=False keeps the column order as encountered instead of sorting names.
merge_df = pd.concat(
    [new_jan_df, new_feb_mar_df],
    axis=0,
    sort=False,
    ignore_index=True,
)

In [None]:
# Build a "<start id>_<end id>" key from the two station ids.
# NOTE: despite the column name, this value is not unique per ride; every trip
# between the same pair of stations gets the same key.
start_ids = merge_df["start_station_id"].map(str)
end_ids = merge_df["end_station_id"].map(str)
merge_df["rideid"] = start_ids + "_" + end_ids
merge_df.head(1)

In [None]:
# Store start_date as text so it can be split with the .str accessor.
merge_df["start_date"] = merge_df["start_date"].astype(str)

# Split on "-" and keep the first piece as the year.
start_date_parts = merge_df["start_date"].str.split("-", expand=True)
merge_df["trip_year"] = start_date_parts[0]
merge_df.head(1)

In [None]:
# Show the dtype of each column in the combined frame.
merge_df.dtypes

In [None]:
# start_date is expected to already hold text here (the .str accessor below
# requires it), so no cast is done in this cell.
# Split on "-" and keep the second piece as the month.
start_date_pieces = merge_df["start_date"].str.split("-", expand=True)
merge_df["trip_month"] = start_date_pieces[1]

In [None]:
# merge_df['end_time'] = pd.to_datetime(merge_df['start_time'], errors='coerce')
# merge_df['start_time'] = pd.to_datetime(merge_df['start_time'], errors='coerce')

In [None]:
# Travel time between bike stations is NOT computed: the draft below is disabled.
# NOTE(review): subtracting the `.dt.minute` components would not give a trip
# duration. It ignores the hour and date parts and goes negative whenever a trip
# crosses an hour boundary; a difference of the full end/start timestamps would
# be needed instead.
# merge_df['travel_time (min)'] = merge_df['end_time'].dt.minute - merge_df['start_time'].dt.minute

# Preview the first two rows of the combined frame.
merge_df.head(2)

In [None]:
# Replace the zero-padded month numbers with month names.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on merge_df["trip_month"]: that form mutates a
# temporary selection, which raises a chained-assignment warning on recent
# pandas and leaves merge_df unchanged when Copy-on-Write is enabled.
month_names = {"01": "January", "02": "February", "03": "March"}
merge_df["trip_month"] = merge_df["trip_month"].replace(month_names)
merge_df.head(1)

In [None]:
# Drop trips whose end location is missing (no end coordinates or station name).
merge_df.dropna(subset=["end_lat", "end_lng", "end_station_name"], inplace=True)

# Drop trips that start and end at the same station. The working assumption is
# that defective bikes are likely to be returned to the station they came from.
# .copy() makes the filtered result an independent frame, so columns can be
# added to it later without pandas raising SettingWithCopyWarning.
different_stations = merge_df["start_station_name"] != merge_df["end_station_name"]
merge_df = merge_df.loc[different_stations].copy()

###### Calculate the distance of each ride from point A to point B.

In [None]:
from math import sin, cos, sqrt, atan2, radians

df = merge_df


def _column_in_radians(column):
    """Return `df[column]` converted from degrees to radians as a new Series.

    The Series is built from a plain list, so it carries a fresh 0..n-1 index
    rather than the index of `df`.
    """
    return pd.Series([radians(degrees) for degrees in df[column]])


# Start (1) and end (2) coordinates of every trip, in radians
lat1 = _column_in_radians("start_lat")
lon1 = _column_in_radians("start_lng")
lat2 = _column_in_radians("end_lat")
lon2 = _column_in_radians("end_lng")

# Per-trip change in latitude and longitude
dlat = lat2 - lat1
dlon = lon2 - lon1

In [None]:
import numpy as np

# Approximate radius of Earth in km
R = 6373.0

# Haversine formula, evaluated for all trips at once instead of looping over
# row positions. lat1/lat2/dlat/dlon are the per-trip values in radians.
a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2

# Floating-point rounding can push `a` marginally outside [0, 1]; clip it so
# the square roots below never receive a negative argument.
a = np.clip(a, 0.0, 1.0)

# Central angle between start and end point, in radians
c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

# Great-circle distance in km, converted to miles. Kept as a plain list with
# one entry per trip, in row order, so it can be assigned positionally.
distance = np.asarray((R * c) * .6214).tolist()

In [None]:
# Store the per-trip distances in a new "trip_distance (mi)" column.
df["trip_distance (mi)"] = distance

# Display the resulting frame
df

In [None]:
# Write the trip-level table to the CSV used for the Tableau upload.
# index=True is pandas' default: the frame's index goes out as the first,
# unnamed column.
df.to_csv(
    "../Tableau-CitiBike-Visualization/Output/upload_to_tableau.csv",
    index=True,
)

In [None]:
# Count the trips for every (start station, end station) pair.
route_sizes = df.groupby(["start_station_name", "end_station_name"]).size()

# Turn the grouped counts back into a flat table with a named count column.
trips_df = route_sizes.reset_index(name="Number_of_Trips")

trips_df.to_csv("../Tableau-CitiBike-Visualization/Output/start_end_rides.csv")