In [1]:
import requests
import urllib3
import json
import csv
import os
import datetime

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy

from typing import Tuple

# Suppress urllib3's InsecureRequestWarning so it is not printed for HTTPS
# requests that are made without TLS certificate verification.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [2]:
# Notebook-wide display defaults for pandas output and seaborn figures.
pd.options.display.max_columns = 83
pd.options.display.max_colwidth = 25
pd.options.display.max_rows = None   # None: never truncate the rows of a displayed frame
pd.options.display.precision = 2
sns.set(rc={'figure.figsize': (12.7, 8.27)})

In [3]:
# Type aliases
# A pair of DataFrames; used as the return annotation of split_into_run_walk.
DataFrameTuple = Tuple[pd.DataFrame, pd.DataFrame]

In [4]:
# Base directory for every path used in this notebook: the kernel's working
# directory at the moment this cell runs.
cd = os.path.abspath(os.getcwd())

# `data` holds the parsed personal configuration; retrieve_activities() reads
# data['payload'] from it. JSON text is UTF-8, so decode it explicitly instead
# of relying on the platform's locale encoding.
with open(f'{cd}/not_needed/config_personal.json', encoding='utf-8') as f:
    data = json.load(f)

In [5]:
def retrieve_activities() -> pd.DataFrame:
    """Download all of the authenticated athlete's activities from Strava.

    The OAuth payload stored under ``data['payload']`` is posted to the token
    endpoint to obtain an access token. The athlete-activities endpoint is then
    read page by page until an empty page is returned, so the result is not
    capped at a fixed number of pages.

    Returns:
        A DataFrame with one row per activity, flattened by
        ``pd.json_normalize``. Empty when the API returns no activities.

    Raises:
        requests.HTTPError: if the token request or any page request answers
            with an error status (for example bad credentials or rate limit).
        requests.RequestException: on connection problems or timeouts.
    """
    auth_url = "https://www.strava.com/oauth/token"
    activities_url = "https://www.strava.com/api/v3/athlete/activities"
    per_page = 200
    timeout_s = 30

    # Certificate verification is left enabled: this request carries the
    # OAuth credentials, and the page requests below already verify the same
    # host, so verification is known to work for it.
    token_response = requests.post(auth_url, data=data['payload'], timeout=timeout_s)
    token_response.raise_for_status()
    header = {'Authorization': 'Bearer ' + token_response.json()['access_token']}

    # Collect the raw records first and build the frame once at the end,
    # rather than growing a DataFrame with concat on every page.
    records = []
    page = 1
    while True:
        response = requests.get(
            activities_url,
            headers=header,
            params={'per_page': per_page, 'page': page},
            timeout=timeout_s,
        )
        # Fail loudly instead of normalising an error payload into a data row.
        response.raise_for_status()
        batch = response.json()
        if not batch:
            # An empty page means every activity has been fetched.
            break
        records.extend(batch)
        page += 1

    return pd.json_normalize(records)

In [6]:
def save_raw_activities(my_activities: pd.DataFrame) -> None:
    """Write the unprocessed activities frame to the raw CSV under ``cd``.

    The frame's index is written too (``to_csv`` default), so the file gains a
    leading unnamed column.
    """
    raw_csv_path = f'{cd}/csvs/raw/all_activities_raw.csv'
    my_activities.to_csv(raw_csv_path)

In [7]:
def structure_activities() -> pd.DataFrame:
        """Load the raw activities CSV and derive the columns used downstream.

        Reads ``{cd}/csvs/raw/all_activities_raw.csv`` (it takes no arguments and
        does not use an in-memory frame), lower-cases the column names, drops
        the ``unnamed: 0`` and ``resource_state`` columns, renames the speed and
        time columns with unit suffixes and adds the derived columns below.

        Derived / converted columns:
            start_date_local: datetime with the time-of-day removed.
            start_time: time-of-day taken from the original start_date_local.
            timezone: last space-separated token of the original value.
            start_day_name: weekday name of start_date_local.
            moving_time_min: despite the name, an ``HH:MM:SS`` string built
                from ``moving_time_s``; only the time-of-day part is formatted,
                so durations of 24 hours or more wrap around.
            average_speed_kmh / max_speed_kmh: the ``*_mps`` columns * 18/5.
            distance_km: ``distance`` / 1000, for 'Run' and 'Walk' rows only.
            timezone, start_day_name, visibility, name, type: categoricals.

        Returns:
            The structured activities frame.
        """
        my_activities = (pd.read_csv(f'{cd}/csvs/raw/all_activities_raw.csv')
                        .rename(columns=str.lower)
                        # 'unnamed: 0' is presumably the index column that to_csv
                        # wrote into the raw file - verify against the raw CSV.
                        .drop(['unnamed: 0', 'resource_state'], axis = 1)
                        # Make the units explicit in the column names.
                        .rename(columns={
                                'average_speed' : 'average_speed_mps', 
                                'max_speed' : 'max_speed_mps',
                                'moving_time' : 'moving_time_s',
                                'elapsed_time' : 'elapsed_time_s'})
                        # Step 1: parse the timestamp, then take the time-of-day
                        # from the parsed value (callables in one assign are
                        # evaluated in order and see the earlier results).
                        .assign(start_date_local= lambda x: pd.to_datetime(x['start_date_local']),
                                start_time= lambda x: x['start_date_local'].dt.time)
                        # Step 2: reduce the timestamp to a date-only string;
                        # keep only the last space-separated token of timezone.
                        .assign(start_date_local = lambda x: x['start_date_local'].dt.strftime("%Y/%m/%d"),
                                timezone = lambda x: pd.Categorical(x['timezone'].str.split(' ').str[-1],))
                        # Step 3: re-parse the date-only string back to datetime
                        # and derive the remaining columns from it.
                        .assign(start_date_local = lambda x: pd.to_datetime(x['start_date_local']),
                                start_day_name = lambda x: pd.Categorical(x['start_date_local'].dt.strftime("%A")),
                                # Seconds rendered as an 'HH:MM:SS' string, not a number of minutes.
                                moving_time_min = lambda x: pd.to_datetime(x['moving_time_s'], unit='s').dt.strftime('%H:%M:%S'),
                                # 18/5 == 3.6: metres per second -> kilometres per hour.
                                average_speed_kmh = lambda x: x['average_speed_mps'] * (18/5),
                                max_speed_kmh = lambda x: x['max_speed_mps'] * (18/5),
                                # Computed for 'Run'/'Walk' rows only; the assignment aligns on
                                # the index, so rows of any other type get NaN.
                                # Assumes 'distance' is in metres - TODO confirm.
                                distance_km = lambda x: x[(x['type'] == 'Run') | (x['type'] == 'Walk')]['distance']/1000,
                                visibility = lambda x: pd.Categorical(x['visibility']),
                                name = lambda x: pd.Categorical(x['name']),
                                type = lambda x: pd.Categorical(x['type'])))
        return my_activities

In [8]:
def split_into_run_walk(my_activities: pd.DataFrame) -> DataFrameTuple:
    """Extract the route polylines of runs and of walks.

    Args:
        my_activities: activities frame with at least the columns ``type``,
            ``id`` and ``map.summary_polyline``.

    Returns:
        ``(runs_maps, walks_maps)``: for each activity type, the ``id`` and
        ``map.summary_polyline`` columns with rows containing missing values
        removed.
    """
    map_columns = ['id', 'map.summary_polyline']

    def polylines_for(activity_type: str) -> pd.DataFrame:
        # Rows of one activity type, reduced to the map columns, NaNs dropped.
        of_type = my_activities[my_activities['type'] == activity_type]
        return of_type[map_columns].dropna()

    return polylines_for('Run'), polylines_for('Walk')

In [9]:
def save_runs_walks(runs_maps: pd.DataFrame, walks_maps: pd.DataFrame) -> None:
    """Write the run and walk polyline frames to their CSV files under ``cd``."""
    destinations = {
        f'{cd}/csvs/clean/runs.csv': runs_maps,
        f'{cd}/csvs/clean/walks.csv': walks_maps,
    }
    # Runs are written first, then walks.
    for csv_path, frame in destinations.items():
        frame.to_csv(csv_path)

In [10]:
def clean_up_activities(my_activities: pd.DataFrame) -> pd.DataFrame:
    """Reduce the structured activities to the columns used for analysis.

    Args:
        my_activities: structured activities frame; it must contain every
            column listed below.

    Returns:
        A frame with exactly the listed columns, in the listed order.

    Raises:
        KeyError: if any listed column is missing from ``my_activities``.
    """
    kept_columns = [
        'upload_id',
        'name',
        'type',
        'distance_km',
        'moving_time_min',
        'start_time',
        'start_date_local',
        'start_day_name',
        'timezone',
        'average_speed_kmh',
        'max_speed_kmh',
        'total_elevation_gain',
        'average_heartrate',
        'max_heartrate',
        'achievement_count',
        'kudos_count',
        'visibility',
    ]
    return my_activities[kept_columns]

In [11]:
def save_cleaned_activities(my_acts: pd.DataFrame) -> None:
    """Write the reduced activities frame to the clean CSV under ``cd``.

    The frame's index is written as the first column (``to_csv`` default).
    """
    clean_csv_path = f'{cd}/csvs/clean/all_activities.csv'
    my_acts.to_csv(clean_csv_path)

In [12]:
# Download the activities from the Strava API and persist them untouched;
# structure_activities() reads its input from this raw CSV.
my_activities_raw = retrieve_activities()
save_raw_activities(my_activities_raw)

In [13]:
# structure_activities() loads the raw CSV from disk (it does not take
# my_activities_raw), so the raw file must exist before this cell runs.
my_activities = structure_activities()
# Keep the route polylines of runs and walks in their own CSV files.
runs_maps, walks_maps = split_into_run_walk(my_activities)
save_runs_walks(runs_maps, walks_maps)

In [14]:
# Reduce the structured frame to the analysis columns and save it; the
# plotting section loads its data from this clean CSV.
activities_lite = clean_up_activities(my_activities)
save_cleaned_activities(activities_lite)

In [19]:
# Reload the cleaned activities from disk for the plots below; the first CSV
# column is the saved index.
# The file is produced by DataFrame.to_csv, which writes UTF-8 by default, so
# it has to be decoded as UTF-8 - reading it as iso-8859-1 garbles any
# non-ASCII characters in the activity names.
activities = pd.read_csv(f'{cd}/csvs/clean/all_activities.csv', delimiter=',', encoding='utf-8', index_col=[0])

In [20]:
# Per-type subsets used by the plots: all runs, all walks, and the runs that
# are strictly longer than 5 km.
run = activities.loc[activities['type'].eq('Run')]
walk = activities.loc[activities['type'].eq('Walk')]
run_5_more = run.loc[run['distance_km'].gt(5)]

## PLOTTING

#### Countplots

In [None]:
# Number of recorded activities per activity type.
sns.countplot(data=activities, x='type')

In [None]:
# Number of runs per weekday.
sns.countplot(data=run, x='start_day_name')

In [None]:
# Number of walks per weekday.
sns.countplot(data=walk, x='start_day_name')

In [None]:
# Bar chart of the four most frequent run names.
# Series.value_counts() sorts by frequency in descending order, so the first
# four index entries are the most common names. The Series method replaces the
# deprecated top-level pd.value_counts().
top_run_names = run['name'].value_counts().iloc[:4].index
sns.countplot(x='name',
              data=run,
              order=top_run_names)
locs, labels = plt.xticks(rotation=45, fontsize=8)
plt.title("Different named runs and their counts")
plt.xlabel("Name of run")
plt.ylabel("Count")

#### Relplots

In [None]:
# Distance per weekday for the runs longer than 5 km, coloured by run name.
sns.relplot(data=run_5_more, x='start_day_name', y='distance_km', hue='name')
locs, labels = plt.xticks(rotation=45, fontsize=8)
plt.title("Different days of week and distances")
plt.xlabel("Days")
plt.ylabel("Distance")

In [None]:
# Find max for each day
sns.relplot(x='start_day_name', y='distance_km', 
            kind='line', ci=None,
            data=run)
locs, labels = plt.xticks(rotation=45, fontsize=15)
plt.title("Different days of week and distances")
plt.xlabel("Days")
plt.ylabel("Distance")

In [None]:
# Moving time against distance for runs, one panel per weekday.
# The cleaned data has no 'moving_time(min)' column: the duration is stored in
# 'moving_time_min' as an 'HH:MM:SS' string. Convert it to a numeric number of
# minutes so it can be drawn on a continuous axis.
run_with_minutes = run.assign(
    moving_time_minutes=pd.to_timedelta(run['moving_time_min']).dt.total_seconds() / 60
)
sns.relplot(x='distance_km', y='moving_time_minutes', hue='start_day_name',
            col='start_day_name', col_wrap=3, data=run_with_minutes)

#### Pointplots

In [None]:
# Point estimate of distance per weekday for the runs longer than 5 km.
sns.pointplot(data=run_5_more, x='start_day_name', y='distance_km')
plt.title("Different days of week and distances 5km and more")
plt.xlabel("Days")
plt.ylabel("Distance")

#### Violinplots

In [None]:
# Distribution of run distance for each weekday.
sns.violinplot(data=run, x='start_day_name', y='distance_km', palette='muted')
plt.title("Different days of week and distances")
plt.xlabel("Days")
plt.ylabel("Distance")

#### Scatterplots + FacetGrid

In [None]:
# Moving time against distance for the runs longer than 5 km.
# Two fixes: a stray `b` token made this cell a SyntaxError, and the column
# 'moving_time(min)' does not exist. The duration is stored in
# 'moving_time_min' as an 'HH:MM:SS' string, so it is converted to numeric
# minutes before plotting.
run_5_more_minutes = run_5_more.assign(
    moving_time_minutes=pd.to_timedelta(run_5_more['moving_time_min']).dt.total_seconds() / 60
)
sns.scatterplot(x='distance_km',
                y='moving_time_minutes',
                data=run_5_more_minutes)
locs,label = plt.xticks(rotation=90, fontsize=3)

In [None]:
# One scatter panel per weekday: maximum speed against distance for runs.
g = sns.FacetGrid(data=run, col='start_day_name', col_wrap=2).map(
    sns.scatterplot, 'distance_km', 'max_speed_kmh'
)


#### Regplots

In [None]:
# Linear fit of average speed against distance for runs.
sns.set(style="ticks", context="talk")
ax = sns.regplot(data=run, x="distance_km", y="average_speed_kmh")
ax.set_title("Average Speed vs Distance")

In [None]:
# Linear fit of maximum speed against distance for runs.
sns.set(style="ticks", context="talk")
ax = sns.regplot(data=run, x='distance_km', y='max_speed_kmh')
ax.set_title("Max Speed vs Distance")

In [None]:
# Distance spread per run name for the runs longer than 5 km.
sns.boxplot(data=run_5_more, x='name', y='distance_km')
locs, labels = plt.xticks(rotation=45, fontsize=12)

#### Distplots

In [None]:
# Histogram of run distances.
run_distances = run['distance_km']
sns.displot(run_distances)