In [90]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
%matplotlib inline

# Using Divvy data

## Load and concat the Divvy datasets

In [None]:
# divvy1 = pd.read_csv('data/Divvy_Trips_2017_Q1Q2/Divvy_Trips_2017_Q1.csv', parse_dates=['start_time', 'end_time'])
# divvy2 = pd.read_csv('data/Divvy_Trips_2017_Q1Q2/Divvy_Trips_2017_Q2.csv', parse_dates=['start_time', 'end_time'])
# divvy3 = pd.read_csv('data/Divvy_Trips_2017_Q3Q4/Divvy_Trips_2017_Q3.csv', parse_dates=['start_time', 'end_time'])
# divvy4 = pd.read_csv('data/Divvy_Trips_2017_Q3Q4/Divvy_Trips_2017_Q4.csv', parse_dates=['start_time', 'end_time'])

# divvy = pd.concat([divvy1, divvy2, divvy3, divvy4], ignore_index=True)

# divvy.to_csv('data/divvy_2017.csv')

In [None]:
divvy = pd.read_csv('data/divvy_2017.csv')

In [None]:
divvy

In [None]:
station_list = set(list(divvy['from_station_name'].unique()) + list(divvy['to_station_name'].unique()))
station_list

In [None]:
len(station_list)

## Load Divvy station info for GPS coordinates

In [None]:
with open('data/stations.json') as json_data:
    station_data = json.load(json_data)

In [None]:
station_data['stationBeanList']

In [None]:
stations = [station['stationName'] for station in station_data['stationBeanList']]
latitude = [station['latitude'] for station in station_data['stationBeanList']]
longitude = [station['longitude'] for station in station_data['stationBeanList']]

In [None]:
len(stations)

In [None]:
unknown = []
for station in station_list:
    if station not in stations:
        unknown.append(station)
unknown

In [None]:
station_gps = pd.DataFrame({'station_name': stations, 'latitude': latitude, 'longitude': longitude})

In [None]:
station_gps.head()

In [None]:
def gps_lookup(location):
    match = (station_gps['station_name'] == location)
    coord = station_gps['latitude'][match]
    if len(coord) > 0:
        return pd.Series([coord.values[0], station_gps['longitude'][match].values[0]])
    else:
        return pd.Series([np.nan, np.nan])

In [None]:
divvy[['from_station_latitude', 'from_station_longitude']] = divvy['from_station_name'].apply(gps_lookup)
divvy[['to_station_latitude', 'to_station_longitude']] = divvy['to_station_name'].apply(gps_lookup)

In [None]:
divvy

In [None]:
divvy.to_csv('data/divvy_2017.csv')

In [None]:
divvy = pd.read_csv('data/divvy_2017.csv')

### 1) Top 5 stations with the most starts (showing # of starts)

In [None]:
station_starts = divvy.groupby(['from_station_name'])['from_station_name'].count().sort_values(ascending=False)
station_starts.head()

In [None]:
ax = station_starts.head(5).plot(kind='bar', figsize=(15, 10), title='Top 5 Stations with Most Starts')
for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() * 1.005 + .15, p.get_height() * 1.005))

### 2) Trip duration by user type

In [None]:
trip_duration = divvy.groupby(['usertype'])['tripduration'].mean().sort_values(ascending=False)
trip_duration

In [None]:
ax = divvy.boxplot(column='tripduration', by='usertype', figsize=(15,10))

In [None]:
ax = divvy.boxplot(column='tripduration', by='usertype', figsize=(15,10), showfliers=False)

### 3) Most popular trips based on start station and stop station

In [None]:
divvy['trip_stations'] = divvy['from_station_name'] + ' TO ' + divvy['to_station_name']

In [None]:
trip_stations = divvy.groupby(['trip_stations'])['trip_stations'].count().sort_values(ascending=False)
trip_stations.head()

In [None]:
ax = trip_stations.head(10).plot(kind='bar', figsize=(15, 10), title='Top 10 Most Popular Trips')
for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() * 1.005 + .05, p.get_height() * 1.005))

### Create dataframe of paths for Tableau chart

In [None]:
trip_dict = {'path': list(trip_stations.index), 'frequency': list(trip_stations.values)}
trip_dict['origin'] = [x.split(' TO ')[0] for x in trip_dict['path']]
trip_dict['destination'] = [x.split(' TO ')[1] for x in trip_dict['path']]

In [None]:
from collections import defaultdict

trips = defaultdict(list)

for idx in range(len(trip_dict['path'])):
    trips['path'].append(trip_dict['path'][idx])
    trips['frequency'].append(trip_dict['frequency'][idx])
    trips['origin-destination'].append('origin')
    trips['station'].append(trip_dict['origin'][idx])
    
    trips['path'].append(trip_dict['path'][idx])
    trips['frequency'].append(trip_dict['frequency'][idx])
    trips['origin-destination'].append('destination')
    trips['station'].append(trip_dict['destination'][idx])

In [None]:
stations_gps = (divvy[divvy['from_station_name'].duplicated()]
                [['from_station_name', 'from_station_latitude', 'from_station_longitude']].reset_index(drop=True))

In [None]:
trips_df = pd.DataFrame(trips).merge(stations_gps.drop_duplicates(subset=['from_station_name']),
                                     how='left',
                                     left_on='station',
                                     right_on='from_station_name')

In [None]:
trips_df.drop(['from_station_name'], axis=1, inplace=True)

In [None]:
trips_df.to_csv('data/trips.csv')

### 4) Rider performance by Gender and Age based on avg trip distance (station to station), median speed (distance traveled / trip duration)

Multiply geodesic distance by 1.25. Routes follow roads but the calculated route is direct (geodesic). A route straight down a road would be the same as the direct route; a route diagnoal to roads would be multiplied by 1.414 (thanks, Pythagoras!); assuming routes are evenly split between diagonal and direct, with some wiggle room, I'm splitting the difference at 1.25.

I looked at using the Google Maps api to calculate the actual, along-the-road distance, but they've removed the free api key option. I also looked at Bing Maps, but it's rate limited and I have more than 98,000 routes in this dataset (and once I saw how big that number was, I realized that using api calls would take more than a few days!). So I opted for this *x1.25* method which is less accurate but far quicker and cheaper.

In [None]:
from math import isnan

def find_distance(row):
    if (not isnan(row['from_station_latitude']) and
        not isnan(row['from_station_longitude']) and
        not isnan(row['to_station_latitude']) and
        not isnan(row['to_station_longitude'])):
        return (1.25 * (geopy.distance.distance((row['from_station_latitude'], row['from_station_longitude']),
                                                (row['to_station_latitude'], row['to_station_longitude'])).m))
    return np.nan

In [None]:
from math import isnan

def find_distance(row):
    if (not isnan(row['from_station_latitude']) and
        not isnan(row['from_station_longitude']) and
        not isnan(row['to_station_latitude']) and
        not isnan(row['to_station_longitude'])):
        distance = (1.25 * (geopy.distance.distance((row['from_station_latitude'], row['from_station_longitude']),
                                                (row['to_station_latitude'], row['to_station_longitude'])).m))
        print(distance)
        return distance
    else:
        print(np.nan)
    return np.nan

In [None]:
divvy['trip_distance'] = divvy.apply(find_distance, axis=1)
divvy['speed'] = divvy['trip_distance'] / divvy['tripduration']

In [None]:
divvy.to_csv('data/divvy_2017.csv')

In [None]:
divvy = pd.read_csv('data/divvy_2017.csv')

# Yelp data

In [19]:
business = pd.read_json('data/yelp_dataset/yelp_academic_dataset_business.json', lines=True)

checkin = pd.read_json('data/yelp_dataset/yelp_academic_dataset_checkin.json', lines=True)

max_records = 1e5
df = pd.read_json('data/yelp_dataset/yelp_academic_dataset_review.json', lines=True, chunksize=max_records)
review = pd.DataFrame() # Initialize the dataframe
try:
    for df_chunk in df:
        review = pd.concat([review, df_chunk])
except ValueError:
    print ('\nSome messages in the file cannot be parsed')

In [3]:
business

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
0,1314 44 Avenue NE,"{'BikeParking': 'False', 'BusinessAcceptsCredi...",Apn5Q_b6Nz61Tq4XzPdf9A,"Tours, Breweries, Pizza, Restaurants, Food, Ho...",Calgary,"{'Monday': '8:30-17:0', 'Tuesday': '11:0-21:0'...",1,51.091813,-114.031675,Minhas Micro Brewery,,T2E 6L6,24,4.0,AB
1,,"{'Alcohol': 'none', 'BikeParking': 'False', 'B...",AjEbIBw6ZFfln7ePHha9PA,"Chicken Wings, Burgers, Caterers, Street Vendo...",Henderson,"{'Friday': '17:0-23:0', 'Saturday': '17:0-23:0...",0,35.960734,-114.939821,CK'S BBQ & Catering,,89002,3,4.5,NV
2,1335 rue Beaubien E,"{'Alcohol': 'beer_and_wine', 'Ambience': '{'ro...",O8S5hYJ1SMc8fA4QBtVujA,"Breakfast & Brunch, Restaurants, French, Sandw...",Montréal,"{'Monday': '10:0-22:0', 'Tuesday': '10:0-22:0'...",0,45.540503,-73.599300,La Bastringue,Rosemont-La Petite-Patrie,H2G 1K7,5,4.0,QC
3,211 W Monroe St,,bFzdJJ3wp3PZssNEsyU23g,"Insurance, Financial Services",Phoenix,,1,33.449999,-112.076979,Geico Insurance,,85003,8,1.5,AZ
4,2005 Alyth Place SE,{'BusinessAcceptsCreditCards': 'True'},8USyCYqpScwiNEb58Bt6CA,"Home & Garden, Nurseries & Gardening, Shopping...",Calgary,"{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ...",1,51.035591,-114.027366,Action Engine,,T2H 0N5,4,2.0,AB
5,"20235 N Cave Creek Rd, Ste 1115","{'BikeParking': 'True', 'BusinessAcceptsCredit...",45bWSZtniwPRiqlivpS8Og,"Coffee & Tea, Food",Phoenix,"{'Monday': '5:30-20:0', 'Tuesday': '5:30-20:0'...",1,33.671375,-112.030017,The Coffee Bean & Tea Leaf,,85024,63,4.0,AZ
6,631 Bloor St W,"{'BusinessParking': '{'garage': False, 'street...",9A2quhZLyWk0akUetBd8hQ,"Food, Bakeries",Toronto,,0,43.664378,-79.414424,Bnc Cake House,Koreatown,M6G 1K8,7,4.0,ON
7,"3417 Derry Road E, Unit 103","{'Alcohol': 'none', 'BusinessAcceptsCreditCard...",6OuOZAok8ikONMS_T3EzXg,"Restaurants, Thai",Mississauga,,1,43.712946,-79.632763,Thai One On,Ridgewood,L4T 1A8,7,2.0,ON
8,1440 N. Dysart Ave,"{'Alcohol': 'none', 'Ambience': '{'romantic': ...",8-NRKkPY1UiFXW20WXKiXg,"Mexican, Restaurants",Avondale,"{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",1,33.448106,-112.341302,Filiberto's Mexican Food,,85323,40,2.5,AZ
9,209 Oakland Ave,"{'BikeParking': 'True', 'BusinessAcceptsCredit...",UTm5QZThPQlT35mkAcGOjg,"Flowers & Gifts, Gift Shops, Shopping",Pittsburgh,"{'Monday': '9:0-18:0', 'Tuesday': '9:0-18:0', ...",1,40.441421,-79.956457,Maggie & Stella's Gifts,Oakland,15213,3,3.5,PA


In [7]:
checkin

Unnamed: 0,business_id,time
0,7KPBkxAOEtb3QeIL9PEErg,"{'Fri-0': 2, 'Sat-0': 1, 'Sun-0': 1, 'Wed-0': ..."
1,kREVIrSBbtqBhIYkTccQUg,"{'Mon-13': 1, 'Thu-13': 1, 'Sat-16': 1, 'Wed-1..."
2,tJRDll5yqpZwehenzE2cSg,"{'Thu-0': 1, 'Mon-1': 1, 'Mon-12': 1, 'Sat-16'..."
3,tZccfdl6JNw-j5BKnCTIQQ,"{'Sun-14': 1, 'Fri-18': 1, 'Mon-20': 1}"
4,r1p7RAMzCV_6NPF0dNoR3g,"{'Sat-3': 1, 'Sun-18': 1, 'Sat-21': 1, 'Sat-23..."
5,OVkBMWxUBXGoO4K3_CeJ7g,"{'Sat-17': 1, 'Sat-18': 1}"
6,fW1SKSunVnlaq-fxZ-gSUQ,"{'Sun-0': 1, 'Thu-0': 1, 'Sun-1': 1, 'Sat-15':..."
7,mDdqgfrvROGAumcQdZ3HIg,"{'Fri-2': 1, 'Fri-11': 1, 'Thu-11': 3, 'Mon-12..."
8,nhZ1HGWD8lMErdn3FuWuTQ,"{'Fri-0': 1, 'Sat-0': 1, 'Sun-0': 1, 'Thu-0': ..."
9,vDoXZGE7p6xAkKQ0XQPvoA,"{'Thu-15': 1, 'Sat-23': 1}"


In [331]:
review

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,iCQpiavjjPzJ5_3gPD5Ebg,0,2011-02-25,0.0,x7mDIiDB3jEiPGPHOmDzyw,2.0,The pizza was okay. Not the best I've had. I p...,0.0,msQe1u7Z_XuqjGoqhB0J5g
1,pomGBqfbxcqPv14c3XH-ZQ,0,2012-11-13,0.0,dDl8zu1vWPdKGihJrwQbpw,5.0,I love this place! My fiance And I go here atl...,0.0,msQe1u7Z_XuqjGoqhB0J5g
2,jtQARsP6P-LbkyjbO1qNGg,1,2014-10-23,1.0,LZp4UX5zK3e-c5ZGSeo3kA,1.0,Terrible. Dry corn bread. Rib tips were all fa...,3.0,msQe1u7Z_XuqjGoqhB0J5g
3,elqbBhBfElMNSrjFqW3now,0,2011-02-25,0.0,Er4NBWCmCD4nM8_p1GRdow,2.0,Back in 2005-2007 this place was my FAVORITE t...,2.0,msQe1u7Z_XuqjGoqhB0J5g
4,Ums3gaP2qM3W1XcA5r6SsQ,0,2014-09-05,0.0,jsDu6QEJHbwP2Blom1PLCA,5.0,Delicious healthy food. The steak is amazing. ...,0.0,msQe1u7Z_XuqjGoqhB0J5g
5,vgfcTvK81oD4r50NMjU2Ag,0,2011-02-25,0.0,pfavA0hr3nyqO61oupj-lA,1.0,This place sucks. The customer service is horr...,2.0,msQe1u7Z_XuqjGoqhB0J5g
6,AxeQEz3-s9_1TyIo-G7UQw,0,2011-10-10,0.0,brokEno2n7s4vrwmmUdr9w,5.0,"If you like Thai food, you have to try the ori...",1.0,msQe1u7Z_XuqjGoqhB0J5g
7,zdE82PiD6wquvjYLyhOJNA,0,2012-04-18,1.0,kUZWBVZvhWuC8TWUg5AYyA,5.0,AMAZING!!!\n\n I was referred here by a friend...,0.0,msQe1u7Z_XuqjGoqhB0J5g
8,EAwh1OmG6t6p3nRaZOW_AA,0,2011-02-25,0.0,wcqt0III88LEcm19IxFFyA,4.0,Ribs = amazing\n2 hour wait time= not so amazi...,0.0,msQe1u7Z_XuqjGoqhB0J5g
9,atVh8viqTj-sqDJ35tAYVg,1,2012-11-09,2.0,LWUtqzNthMM3vpWZIFBlPw,2.0,"Food is pretty good, not gonna lie. BUT you ha...",1.0,msQe1u7Z_XuqjGoqhB0J5g


In [46]:
checkin['checkins'] = checkin['time'].apply(lambda x : sum(x.values()))
yelp = business.merge(checkin, on='business_id', how='outer')

In [278]:
business.to_csv('data/yelp_business.csv', index=False)
checkin.to_csv('data/yelp_checkin.csv', index=False)
review.to_csv('data/yelp_review.csv', index=False)
yelp.to_csv('data/yelp.csv', index=False)

### Find the top 10 and bottom 10 restaurants in Illinois having most and least checkins respectively.

In [113]:
yelp['checkins'] = yelp['time'].apply(lambda x : sum(x.values()))

AttributeError: 'float' object has no attribute 'values'

In [114]:
checkin['checkins'] = checkin['time'].apply(lambda x : sum(x.values()))

In [49]:
yelp.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state,time,checkins
0,1314 44 Avenue NE,"{'BikeParking': 'False', 'BusinessAcceptsCredi...",Apn5Q_b6Nz61Tq4XzPdf9A,"Tours, Breweries, Pizza, Restaurants, Food, Ho...",Calgary,"{'Monday': '8:30-17:0', 'Tuesday': '11:0-21:0'...",1,51.091813,-114.031675,Minhas Micro Brewery,,T2E 6L6,24,4.0,AB,"{'Fri-0': 1, 'Sun-0': 3, 'Thu-0': 1, 'Fri-1': ...",29.0
1,,"{'Alcohol': 'none', 'BikeParking': 'False', 'B...",AjEbIBw6ZFfln7ePHha9PA,"Chicken Wings, Burgers, Caterers, Street Vendo...",Henderson,"{'Friday': '17:0-23:0', 'Saturday': '17:0-23:0...",0,35.960734,-114.939821,CK'S BBQ & Catering,,89002,3,4.5,NV,{'Sun-1': 1},1.0
2,1335 rue Beaubien E,"{'Alcohol': 'beer_and_wine', 'Ambience': '{'ro...",O8S5hYJ1SMc8fA4QBtVujA,"Breakfast & Brunch, Restaurants, French, Sandw...",Montréal,"{'Monday': '10:0-22:0', 'Tuesday': '10:0-22:0'...",0,45.540503,-73.5993,La Bastringue,Rosemont-La Petite-Patrie,H2G 1K7,5,4.0,QC,"{'Mon-0': 1, 'Sun-0': 1, 'Fri-1': 1, 'Sun-1': ...",7.0
3,211 W Monroe St,,bFzdJJ3wp3PZssNEsyU23g,"Insurance, Financial Services",Phoenix,,1,33.449999,-112.076979,Geico Insurance,,85003,8,1.5,AZ,{'Wed-18': 1},1.0
4,2005 Alyth Place SE,{'BusinessAcceptsCreditCards': 'True'},8USyCYqpScwiNEb58Bt6CA,"Home & Garden, Nurseries & Gardening, Shopping...",Calgary,"{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ...",1,51.035591,-114.027366,Action Engine,,T2H 0N5,4,2.0,AB,"{'Thu-14': 1, 'Thu-20': 1}",2.0


In [88]:
len(yelp[(yelp['state'] == 'IL') &
         (yelp['categories'].str.contains('Restaurants')) &
         (yelp['checkins'].isnull()) &
         (yelp['is_open'] == 1)])

11

In [86]:
len(yelp[(yelp['state'] == 'IL') &
         (yelp['categories'].str.contains('Restaurants')) &
         (yelp['checkins'] == 1) &
         (yelp['is_open'] == 1)])

15

In [345]:
len(yelp[(yelp['state'] == 'IL') &
         (yelp['categories'].str.contains('Restaurants')) &
         (yelp['checkins'] == 2) &
         (yelp['is_open'] == 1)])

12

In [346]:
len(yelp[(yelp['state'] == 'IL') &
         (yelp['categories'].str.contains('Restaurants')) &
         (yelp['checkins'] == 3) &
         (yelp['is_open'] == 1)])

10

In [342]:
top10 = yelp[(yelp['state'] == 'IL') &
             (yelp['categories'].str.contains('Restaurants') &
              (yelp['is_open'] == 1))
            ].sort_values('checkins', ascending=False).head(10)

bottom10 = yelp[(yelp['state'] == 'IL') &
                (yelp['categories'].str.contains('Restaurants') &
                (yelp['is_open'] == 1))
               ].sort_values('checkins', ascending=False).tail(10)

In [343]:
top10

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state,time,checkins
121269,403 E Green St,"{'Alcohol': 'beer_and_wine', 'Ambience': '{'ro...",-zEpEmDfFQL-ph0N3BDlXA,"Sushi Bars, Restaurants, Japanese",Champaign,"{'Monday': '17:0-21:30', 'Tuesday': '17:0-21:3...",1,40.110167,-88.232925,Sakanaya,,61820,358,4.5,IL,"{'Fri-0': 45, 'Mon-0': 41, 'Sat-0': 68, 'Sun-0...",2526.0
64052,201 N Broadway Ave,"{'Alcohol': 'full_bar', 'Ambience': '{'romanti...",9MnbQg7kfb_WgxoV0hXKSQ,"Restaurants, Barbeque",Urbana,"{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",1,40.113818,-88.207689,Black Dog Smoke & Ale House,,61801,765,4.5,IL,"{'Fri-0': 18, 'Mon-0': 19, 'Sat-0': 29, 'Sun-0...",1357.0
108968,301 N Neil St,"{'Alcohol': 'full_bar', 'Ambience': '{'romanti...",-fiUXzkxRfbHY9TKWwuptw,"Food, Restaurants, Breakfast & Brunch, Gluten-...",Champaign,"{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",1,40.118337,-88.243777,DESTIHL Restaurant & Brew Works,,61820,556,4.0,IL,"{'Fri-0': 21, 'Mon-0': 18, 'Sat-0': 29, 'Sun-0...",883.0
88115,32 E Chester St,"{'Alcohol': 'full_bar', 'Ambience': '{'romanti...",L2c-qKZWumCmOCR-dqBLrg,"Restaurants, Burgers, American (Traditional), ...",Champaign,"{'Monday': '11:0-0:0', 'Tuesday': '11:0-2:0', ...",1,40.116556,-88.242013,Seven Saints,,61820,313,4.0,IL,"{'Fri-0': 11, 'Mon-0': 15, 'Sat-0': 16, 'Sun-0...",742.0
872,505 S Neil St,"{'Alcohol': 'none', 'Ambience': '{'romantic': ...",t_yiQnxUDdPPCN2z4QyezA,"Taiwanese, Chinese, Restaurants",Champaign,"{'Tuesday': '16:30-21:0', 'Wednesday': '16:30-...",1,40.111053,-88.244141,Golden Harbor Authentic Chinese Cuisine,,61820,386,4.5,IL,"{'Fri-0': 20, 'Mon-0': 19, 'Sat-0': 24, 'Sun-0...",689.0
874,60 E Green Street,"{'Alcohol': 'none', 'Ambience': '{'romantic': ...",VIJ2KiDKhUVhhpNylEIfog,"Mexican, Restaurants",Champaign,"{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...",1,40.110409,-88.238955,Maize Mexican Grill,,61820,496,4.5,IL,"{'Fri-0': 10, 'Mon-0': 11, 'Sat-0': 12, 'Sun-0...",658.0
110111,2401 N Prospect Ave,"{'Ambience': '{'romantic': False, 'intimate': ...",XbHxWOciYlBhJOjKRQbo9g,"Grocery, Health & Medical, Restaurants, Food, ...",Champaign,"{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",1,40.14512,-88.260689,Meijer,,61821,51,3.5,IL,"{'Fri-0': 1, 'Mon-0': 4, 'Sat-0': 7, 'Sun-0': ...",651.0
145863,109 N Walnut St,"{'Alcohol': 'none', 'Ambience': '{'romantic': ...",e0prCZXtHGQIKeQ_wTW3uw,"Cafes, Coffee & Tea, Desserts, Food, Salad, Re...",Champaign,"{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ...",1,40.116813,-88.242753,Cafe Kopi,,61820,162,4.0,IL,"{'Fri-0': 5, 'Mon-0': 2, 'Sat-0': 3, 'Sun-0': ...",569.0
5478,111 N Race St,"{'Alcohol': 'beer_and_wine', 'Ambience': '{'ro...",o13eH93qmWVNFZogkjhd9w,"Burgers, Cafes, Restaurants, Food, American (T...",Urbana,"{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ...",1,40.112823,-88.209023,Courier Cafe,,61801,334,4.0,IL,"{'Fri-0': 2, 'Mon-0': 2, 'Sat-0': 9, 'Sun-0': ...",535.0
103050,"1 E Main St, Ste 101","{'Alcohol': 'full_bar', 'Ambience': '{'romanti...",dn9lwYUxmhs_mLKPu7L25Q,"Restaurants, American (New), Cocktail Bars, Ba...",Champaign,"{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",1,40.118381,-88.242996,Big Grove Tavern,,61820,315,3.5,IL,"{'Fri-0': 17, 'Mon-0': 5, 'Sat-0': 19, 'Sun-0'...",529.0


In [344]:
bottom10

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state,time,checkins
16807,1401 W Green St,"{'Alcohol': 'none', 'Ambience': '{'romantic': ...",IYASJOu_TXz8PpPbt-Clbg,Restaurants,Urbana,,1,40.109413,-88.227169,Sushi San,,61801,4,2.0,IL,,
34087,225 E Congress Ave,"{'BusinessAcceptsCreditCards': 'True', 'GoodFo...",83_gVj7cnJd0-J5ZoYN9qA,"Restaurants, Pizza",Rantoul,,1,40.309568,-88.156167,Papa John's Pizza,,61866,3,1.5,IL,,
42023,520 S Tanner St,"{'BusinessAcceptsCreditCards': 'True', 'Caters...",3XUH9aFxt4vU9Pzi5aBtow,"Chicken Wings, Sandwiches, Restaurants, Pizza",Rantoul,"{'Monday': '10:0-0:0', 'Tuesday': '10:0-0:0', ...",1,40.304083,-88.159115,Domino's Pizza,,61866,3,2.0,IL,,
56310,402 W Center St,,W7KtVjq4R_5F5EwbCBENQw,"Chinese, Restaurants",Monticello,"{'Monday': '11:0-21:30', 'Wednesday': '11:0-21...",1,40.028317,-88.574491,Golden Kitchen,,61856,4,4.5,IL,,
65257,109 South 3rd St,,21UO0mP1EgEDZyFUFFI9Mg,"Bars, Nightlife, American (Traditional), Pubs,...",Fisher,,1,40.317181,-88.349906,The Wild Hare,,61843,5,4.5,IL,,
67898,300 S Broadway Ave,"{'Alcohol': 'none', 'BusinessAcceptsCreditCard...",-yZ78Hd2DKDqvxJKbCyELg,"Caribbean, American (New), African, Restaurants",Urbana,"{'Tuesday': '11:0-20:0', 'Wednesday': '11:0-20...",1,40.110654,-88.207464,Stango Cuisine,,61801,3,5.0,IL,,
138245,408 W Green St,"{'Alcohol': 'full_bar', 'Ambience': '{'romanti...",Pq_dAtQ4i1wkyWsEiFYrEA,"Mexican, Restaurants",Champaign,"{'Monday': '16:30-3:0', 'Tuesday': '16:30-3:0'...",1,40.110347,-88.249405,Tortica's Grill,,61820,12,3.0,IL,,
156146,115 S 1st St,{'BusinessAcceptsCreditCards': 'True'},gWZOW4-8N5dLixQAlp8iRg,"Event Planning & Services, Restaurants, Venues...",Champaign,"{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...",1,40.115816,-88.239006,Manzellas Italian Restaurant,,61820,31,3.0,IL,,
178569,710 W Champaign Ave,,tkbRjBlZm7ngEVSu22n0vg,"Restaurants, Sandwiches",Rantoul,"{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",1,40.308798,-88.173072,Jimmy John's,,61866,3,3.5,IL,,
179289,201 N Lombard St,{'OutdoorSeating': 'False'},hjo4IYguwprIJ16A8lV75A,"Chicken Wings, Pizza, Restaurants, Sandwiches",Mahomet,"{'Monday': '10:30-23:0', 'Tuesday': '10:30-23:...",1,40.195866,-88.399785,Domino's Pizza,,61853,4,4.0,IL,,


In [332]:
top10.to_csv('data/yelp_top10_checkins.csv', index=False)
bottom10.to_csv('data/yelp_bottom10_checkins.csv', index=False)

In [255]:
top10_names = top10['name'].tolist()
top10_names

['Sakanaya',
 'Black Dog Smoke & Ale House',
 'DESTIHL Restaurant & Brew Works',
 'Seven Saints',
 'Golden Harbor Authentic Chinese Cuisine',
 'Maize Mexican Grill',
 'Meijer',
 'Cafe Kopi',
 'Courier Cafe',
 'Big Grove Tavern']

In [248]:
bottom10_names = bottom10['name'].tolist()
bottom10_names

['Sushi San',
 "Papa John's Pizza",
 "Domino's Pizza",
 'Golden Kitchen',
 'The Wild Hare',
 'Stango Cuisine',
 "Tortica's Grill",
 'Manzellas Italian Restaurant',
 "Jimmy John's",
 "Domino's Pizza"]

### For the top 10 and bottom 10 restaurants calculated in step 6, calculate the average star rating and average sentiment score of the reviews

In [263]:
top_bottom = top10['business_id'].tolist() + bottom10['business_id'].tolist()
top_bottom

['-zEpEmDfFQL-ph0N3BDlXA',
 '9MnbQg7kfb_WgxoV0hXKSQ',
 '-fiUXzkxRfbHY9TKWwuptw',
 'L2c-qKZWumCmOCR-dqBLrg',
 't_yiQnxUDdPPCN2z4QyezA',
 'VIJ2KiDKhUVhhpNylEIfog',
 'XbHxWOciYlBhJOjKRQbo9g',
 'e0prCZXtHGQIKeQ_wTW3uw',
 'o13eH93qmWVNFZogkjhd9w',
 'dn9lwYUxmhs_mLKPu7L25Q',
 'IYASJOu_TXz8PpPbt-Clbg',
 '83_gVj7cnJd0-J5ZoYN9qA',
 '3XUH9aFxt4vU9Pzi5aBtow',
 'W7KtVjq4R_5F5EwbCBENQw',
 '21UO0mP1EgEDZyFUFFI9Mg',
 '-yZ78Hd2DKDqvxJKbCyELg',
 'Pq_dAtQ4i1wkyWsEiFYrEA',
 'gWZOW4-8N5dLixQAlp8iRg',
 'tkbRjBlZm7ngEVSu22n0vg',
 'hjo4IYguwprIJ16A8lV75A']

In [265]:
sentiment = review[review['business_id'].isin(top_bottom)]

In [91]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def get_vader_scores(text):
    '''
    Takes a string of text and outputs four values for Vader's negative,
    neutral, positive, and compound (normalized) sentiment scores
    INPUT: a string
    OUTPUT: a dictionary of four sentiment scores
    '''

    analyser = SentimentIntensityAnalyzer()
    return analyser.polarity_scores(text)


def apply_vader(df, column):
    '''
    Takes a DataFrame with a specified column of text and adds four new columns
    to the DataFrame, corresponding to the Vader sentiment scores
    INPUT: DataFrame, string
    OUTPUT: the original DataFrame with four additional columns
    '''

    sentiment = pd.DataFrame(df[column].apply(get_vader_scores))
    unpacked = pd.DataFrame([d for idx, d in sentiment['text'].iteritems()],
                            index=sentiment.index)
    unpacked['compound'] += 1
    columns = {'neu': 'v_neutral', 'pos': 'v_positive', 'neg': 'v_negative'}
    unpacked.rename(columns=columns, inplace=True)
    return pd.concat([df, unpacked], axis=1)

In [271]:
sentiment = apply_vader(sentiment, 'text')

In [272]:
sentiment

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id,compound,v_negative,v_neutral,v_positive
3885,9MnbQg7kfb_WgxoV0hXKSQ,0,2013-05-09,0,T-8c5_PK0g3t7Oaw3UpZwg,4,Great little spot! Cute and appropriately gru...,0,OxE0QBa6m1399S8BXigIsg,1.8967,0.000,0.704,0.296
4925,-fiUXzkxRfbHY9TKWwuptw,0,2017-01-31,0,Oty5ckJPff_XXe_hEsRNug,4,I love this place! Everything on the menu is s...,0,2ML0xKfrGxSWbGXxMtzBew,1.9498,0.038,0.544,0.418
4957,t_yiQnxUDdPPCN2z4QyezA,0,2010-07-13,0,yqNykqUA2eJ92ARhPfg7Tw,5,I can only comment on the vegetarian food. Fro...,2,sNUD7INzDJYtuj6jAuIMRg,1.9690,0.031,0.769,0.200
6278,9MnbQg7kfb_WgxoV0hXKSQ,0,2014-02-28,0,0UDtZGWxJTqxiT-dXqT5Kw,3,My biggest problem here was expectations every...,3,AeONlsZSC8fmOumdAnMiUA,1.9625,0.098,0.650,0.252
6918,9MnbQg7kfb_WgxoV0hXKSQ,0,2015-07-08,0,XueS2RqbcpMKmWY8Btj7Ow,4,"full of people, but did not wait for a lot tim...",0,WQtebIc8sOpC6Zr__AgQXQ,1.6474,0.000,0.816,0.184
12554,t_yiQnxUDdPPCN2z4QyezA,0,2014-08-24,0,-wtGaRE4ymGm7Ly7Am7oEw,1,This place is getting much worse than before a...,3,G-huUpDESJmQh2BLsPK0iA,0.0363,0.199,0.782,0.019
14142,e0prCZXtHGQIKeQ_wTW3uw,0,2011-04-29,0,Z1ciL1y41nFhgc9Bxdt-rg,4,"I love the atmosphere here, it's very chill. T...",0,uh2DXG80jJ7gxDTWi-2Q3Q,1.9489,0.000,0.665,0.335
19397,9MnbQg7kfb_WgxoV0hXKSQ,0,2012-10-01,0,F0BSmqEJzXHjvO05TG5P3g,5,"Burnt ends are excellent, and they always run ...",0,x51Ae_KL241qwBwimanagQ,1.9550,0.000,0.700,0.300
19400,t_yiQnxUDdPPCN2z4QyezA,2,2012-10-01,0,7_IPU144jeMAIp51FkEENw,5,Best Chinese food in driving range of UIUC! Be...,2,x51Ae_KL241qwBwimanagQ,1.9362,0.024,0.827,0.149
21072,L2c-qKZWumCmOCR-dqBLrg,0,2008-05-10,0,IFRTurfgkLPWMnOcF6cE1Q,3,My first few trips to Seven Saints were excell...,2,zNDWvPyyByDk8CmQ6jAK9w,1.9890,0.030,0.844,0.126


In [322]:
top10_sentiment = {}
for biz_id in top10['business_id'].tolist():
    top10_sentiment[yelp.loc[yelp['business_id'] == biz_id, 'name'].iloc[0]] = sentiment[sentiment['business_id'] == biz_id].groupby(['business_id']).mean()

In [323]:
top10_scores = pd.DataFrame()
for restaurant in top10_sentiment:
    top10_scores = top10_scores.append(pd.DataFrame(top10_sentiment[restaurant]))
top10_scores.reset_index(level=0, inplace=True)

In [324]:
top10_scores['name'] = top10_scores['business_id'].apply(lambda business_id: yelp['name']
                                                         [(yelp['business_id'] == business_id)].values[0])

In [325]:
top10_scores

Unnamed: 0,business_id,cool,funny,stars,useful,compound,v_negative,v_neutral,v_positive,name
0,-zEpEmDfFQL-ph0N3BDlXA,0.368852,0.210383,4.368852,0.751366,1.842996,0.023959,0.711115,0.264899,Sakanaya
1,9MnbQg7kfb_WgxoV0hXKSQ,0.3,0.308974,4.410256,0.711538,1.791388,0.036321,0.741422,0.222265,Black Dog Smoke & Ale House
2,-fiUXzkxRfbHY9TKWwuptw,0.231858,0.152212,3.861947,0.60531,1.78237,0.036811,0.733887,0.22932,DESTIHL Restaurant & Brew Works
3,L2c-qKZWumCmOCR-dqBLrg,0.386792,0.333333,3.993711,1.056604,1.822612,0.034164,0.731796,0.234063,Seven Saints
4,t_yiQnxUDdPPCN2z4QyezA,0.559796,0.384224,4.35369,1.709924,1.744223,0.040496,0.761692,0.197796,Golden Harbor Authentic Chinese Cuisine
5,VIJ2KiDKhUVhhpNylEIfog,0.312749,0.304781,4.416335,0.856574,1.776016,0.034815,0.733263,0.23193,Maize Mexican Grill
6,XbHxWOciYlBhJOjKRQbo9g,0.576923,0.673077,3.326923,1.153846,1.467821,0.048712,0.817731,0.133538,Meijer
7,e0prCZXtHGQIKeQ_wTW3uw,0.490909,0.490909,3.981818,1.054545,1.744593,0.043909,0.719648,0.236467,Cafe Kopi
8,o13eH93qmWVNFZogkjhd9w,0.317784,0.241983,3.874636,0.673469,1.746205,0.034915,0.752376,0.212714,Courier Cafe
9,dn9lwYUxmhs_mLKPu7L25Q,0.299383,0.330247,3.450617,1.537037,1.705092,0.047735,0.74246,0.209809,Big Grove Tavern


In [326]:
bottom10_sentiment = {}
for biz_id in bottom10['business_id'].tolist():
    bottom10_sentiment[yelp.loc[yelp['business_id'] == biz_id, 'name'].iloc[0]] = sentiment[sentiment['business_id'] == biz_id].groupby(['business_id']).mean()

In [327]:
bottom10_scores = pd.DataFrame()
for restaurant in bottom10_sentiment:
    bottom10_scores = bottom10_scores.append(pd.DataFrame(bottom10_sentiment[restaurant]))
bottom10_scores.reset_index(level=0, inplace=True)

In [328]:
bottom10_scores['name'] = bottom10_scores['business_id'].apply(lambda business_id: yelp['name']
                                                               [(yelp['business_id'] == business_id)].values[0])

In [329]:
bottom10_scores

Unnamed: 0,business_id,cool,funny,stars,useful,compound,v_negative,v_neutral,v_positive,name
0,IYASJOu_TXz8PpPbt-Clbg,0.0,0.0,2.0,0.75,0.9924,0.124,0.7795,0.09675,Sushi San
1,83_gVj7cnJd0-J5ZoYN9qA,0.0,0.0,1.333333,0.333333,0.743,0.122667,0.801667,0.075667,Papa John's Pizza
2,hjo4IYguwprIJ16A8lV75A,0.2,0.2,4.0,0.8,1.42484,0.0802,0.6736,0.2462,Domino's Pizza
3,W7KtVjq4R_5F5EwbCBENQw,0.0,0.0,4.5,0.75,1.9237,0.01075,0.652,0.33725,Golden Kitchen
4,21UO0mP1EgEDZyFUFFI9Mg,0.0,0.0,4.4,0.0,1.90032,0.0,0.6768,0.3232,The Wild Hare
5,-yZ78Hd2DKDqvxJKbCyELg,1.0,1.0,5.0,1.333333,1.9692,0.002667,0.671667,0.325667,Stango Cuisine
6,Pq_dAtQ4i1wkyWsEiFYrEA,0.166667,0.75,3.25,0.75,1.756233,0.046667,0.728,0.22525,Tortica's Grill
7,gWZOW4-8N5dLixQAlp8iRg,0.28125,0.3125,3.0625,0.90625,1.546503,0.03825,0.775531,0.186188,Manzellas Italian Restaurant
8,tkbRjBlZm7ngEVSu22n0vg,0.333333,0.0,3.666667,0.0,1.618467,0.039667,0.699333,0.261,Jimmy John's


In [330]:
top10_scores.to_csv('data/yelp_top_scores.csv', index=False)
bottom10_scores.to_csv('data/yelp_bottom_scores.csv', index=False)

### What are the top 10 Cuisine types (Mexican, American, Thai, etc) based on the number of restaurants and number of check ins

In [160]:
restaurants = yelp[yelp['categories'].str.contains('Restaurants', na=False)]

In [161]:
pd.options.mode.chained_assignment = None  # default='warn'
restaurants['categories'] = restaurants['categories'].apply(lambda x: x.split(', '))

In [162]:
restaurants = restaurants.join(pd.get_dummies(pd.DataFrame(restaurants['categories'].tolist()).stack()).astype(int).sum(level=0))

In [189]:
columns = [item for item in restaurants.columns.tolist() if item not in yelp.columns.tolist()]
columns.remove('Restaurants')

In [335]:
num_restaurants = restaurants[columns].sum(numeric_only=True).sort_values(ascending=False)
num_restaurants

Food                                3607.0
Nightlife                           2399.0
Bars                                2313.0
Sandwiches                          2166.0
Pizza                               2139.0
Fast Food                           2089.0
American (Traditional)              2083.0
Breakfast & Brunch                  1566.0
Burgers                             1558.0
Italian                             1384.0
Mexican                             1335.0
American (New)                      1308.0
Chinese                             1265.0
Coffee & Tea                         992.0
Cafes                                955.0
Chicken Wings                        809.0
Japanese                             794.0
Salad                                736.0
Seafood                              714.0
Event Planning & Services            655.0
Sushi Bars                           641.0
Canadian (New)                       614.0
Delis                                591.0
Mediterrane

In [233]:
checkin_dict ={}
for item in columns:
    cnt = restaurants[restaurants[item] == 1].groupby([item])['checkins'].sum()
    if cnt.empty:
        checkin_dict[item] = 0
    else:
        checkin_dict[item] = restaurants[restaurants[item] == 1].groupby([item])['checkins'].sum().values[0]

In [236]:
checkin_dict = pd.Series(checkin_dict).sort_values(ascending=False)

In [237]:
checkin_dict

Food                          586066.0
Nightlife                     425034.0
Bars                          412979.0
Sandwiches                    340887.0
Pizza                         324931.0
Fast Food                     323460.0
American (Traditional)        301669.0
Breakfast & Brunch            240952.0
Mexican                       234984.0
Burgers                       232747.0
Chinese                       199874.0
Italian                       194569.0
American (New)                192660.0
Coffee & Tea                  155729.0
Chicken Wings                 139867.0
Japanese                      130470.0
Cafes                         129573.0
Event Planning & Services     120586.0
Salad                         115175.0
Thai                          115110.0
Seafood                       107557.0
Specialty Food                100308.0
Canadian (New)                 99497.0
Sports Bars                    96801.0
Sushi Bars                     96322.0
Caterers                 

In [338]:
for item in checkin_dict.keys():
    print(item)

Food
Nightlife
Bars
Sandwiches
Pizza
Fast Food
American (Traditional)
Breakfast & Brunch
Mexican
Burgers
Chinese
Italian
American (New)
Coffee & Tea
Chicken Wings
Japanese
Cafes
Event Planning & Services
Salad
Thai
Seafood
Specialty Food
Canadian (New)
Sports Bars
Sushi Bars
Caterers
Steakhouses
Pubs
Barbeque
Delis
Desserts
Mediterranean
Asian Fusion
Bakeries
Indian
Beer
Wine & Spirits
Diners
Buffets
Korean
Lounges
Greek
Wine Bars
Cocktail Bars
Vegetarian
Middle Eastern
Gluten-Free
Vietnamese
Soup
French
Ethnic Food
Juice Bars & Smoothies
Comfort Food
Food Trucks
Hot Dogs
Tex-Mex
Gastropubs
Caribbean
Arts & Entertainment
Grocery
Vegan
Food Delivery Services
Ice Cream & Frozen Yogurt
Latin American
Noodles
Food Stands
Chicken Shop
Shopping
Venues & Event Spaces
Tapas Bars
Irish
Bagels
Tapas/Small Plates
Southern
Halal
Breweries
Pakistani
Hawaiian
Ramen
Creperies
Music Venues
Poutineries
Hotels & Travel
Fish & Chips
Health Markets
Party & Event Planning
Dance Clubs
Imported Food
Cajun/Cr

In [337]:
pd.DataFrame(checkin_dict).to_csv('data/yelp_cuisine_checkin.csv')
pd.DataFrame(num_restaurants).to_csv('data/yelp_cuisine_restaurants.csv',)

### What are the most popular keywords or adjectives that reviewers use for the above list of cuisines (calculated in step 8)

# Using Kaggle data

In [None]:
# divvy = pd.read_csv('data/chicago-divvy-bicycle-sharing-data/data_raw.csv', parse_dates=['starttime', 'stoptime'])

In [None]:
# divvy = divvy[divvy['starttime'].dt.year == 2017].reset_index(drop=True)

In [None]:
# divvy.to_csv('data/divvy_2017_kaggle.csv')

In [None]:
divvy = pd.read_csv('data/divvy_2017_kaggle.csv')

In [None]:
divvy

In [None]:
divvy.head()

In [None]:
divvy.columns

### 1) Top 5 stations with the most starts (showing # of starts)

In [None]:
station_starts = divvy.groupby(['from_station_name'])['from_station_name'].count().sort_values(ascending=False)
station_starts.head()

In [None]:
ax = station_starts.head(5).plot(kind='bar', figsize=(15, 10), title='Top 5 Stations with Most Starts')
for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() * 1.005 + .15, p.get_height() * 1.005))

In [None]:
station_coord = 

In [None]:
map = Basemap(width=10000000,height=6000000,projection='lcc',
            resolution=None,lat_1=45.,lat_2=55,lat_0=50,lon_0=-107.)
plt.figure(figsize=(19,20))
map.bluemarble()

In [None]:
for station in station_starts[5]:
        loc = geolocator.geocode(city)
        if not loc:
            print("Could not locate {}".format(city))
            continue
        x, y = map(loc.longitude, loc.latitude)
        map.plot(x,y,marker='o',color='Red',markersize=int(math.sqrt(count))*scale)
        plt.annotate(city, xy = (x,y), xytext=(-20,20)) 

### 2) Trip duration by user type

In [None]:
trip_duration = divvy.groupby(['usertype'])['tripduration'].mean().sort_values(ascending=False)
trip_duration

In [None]:
ax = divvy.boxplot(column='tripduration', by='usertype', figsize=(15,10))

In [None]:
ax = divvy.boxplot(column='tripduration', by='usertype', figsize=(15,10), showfliers=False)

### 3) Most popular trips based on start station and stop station

In [None]:
divvy['trip_stations'] = divvy['from_station_name'] + ' TO ' + divvy['to_station_name']

In [None]:
trip_stations = divvy.groupby(['trip_stations'])['trip_stations'].count().sort_values(ascending=False)
trip_stations.head(10)

In [None]:
ax = trip_stations.head(10).plot(kind='bar', figsize=(15, 10), title='Top 10 Most Popular Trips')
for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() * 1.005 + .05, p.get_height() * 1.005))

In [None]:
trip_dict = {'path': list(trip_stations.index), 'frequency': list(trip_stations.values)}
trip_dict['origin'] = [x.split(' TO ')[0] for x in trip_dict['path']]
trip_dict['destination'] = [x.split(' TO ')[1] for x in trip_dict['path']]

In [None]:
from collections import defaultdict

trips = defaultdict(list)

for idx in range(len(trip_dict['path'])):
    trips['path'].append(trip_dict['path'][idx])
    trips['origin-destination'].append('origin')
    trips['station'].append(trip_dict['origin'][idx])
    
    trips['path'].append(trip_dict['path'][idx])
    trips['origin-destination'].append('destination')
    trips['station'].append(trip_dict['destination'][idx])

In [None]:
stations_gps = (divvy[divvy['from_station_name'].duplicated()]
                [['from_station_name', 'latitude_start', 'longitude_start']].reset_index(drop=True))

In [None]:
trips_df = pd.DataFrame(trips).merge(stations_gps, how='left', left_on='station', right_on='from_station_name')

In [None]:
trips_df.to_csv('data/trips.csv')

### 4) Rider performance by Gender and Age based on avg trip distance (station to station), median speed (distance traveled / trip duration)

Multiply geodesic distance by 1.25. Routes follow roads but the calculated route is direct (geodesic). A route straight down a road would be the same as the direct route; a route diagnoal to roads would be multiplied by 1.414 (thanks, Pythagoras!); assuming routes are evenly split between diagonal and direct, with some wiggle room, I'm splitting the difference at 1.25.

I looked at using the Google Maps api to calculate the actual, along-the-road distance, but they've removed the free api key option. I also looked at Bing Maps, but it's rate limited and I have more than 98,000 routes in this dataset (and once I saw how big that number was, I realized that using api calls would take more than a few days!). So I opted for this *x1.25* method which is less accurate but far quicker and cheaper.

In [None]:
def find_distance(row):
    return (1.25 * (geopy.distance.distance((row['latitude_start'], row['longitude_start']),
                                            (row['latitude_end'], row['longitude_end'])).m))

In [None]:
divvy['trip_distance'] = divvy.apply(find_distance, axis=1)

In [None]:
divvy['speed'] = divvy['trip_distance'] / divvy['tripduration']

In [None]:
divvy[['starttime', 'stoptime', 'tripduration', 'latitude_start', 'longitude_start', 'latitude_end', 'longitude_end', 'trip_distance', 'speed']]