In [1]:
import requests
import urllib3
import json
import csv
import os
import datetime

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy

# Silence urllib3's InsecureRequestWarning for the whole kernel session.
# NOTE(review): this hides the warning for every request made from this notebook — confirm that is intended.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [2]:
# Notebook-wide pandas display settings, applied one by one from a single table.
display_options = {
    'display.max_columns': 83,
    'display.max_colwidth': 25,
    'display.max_rows': None,   # None: never truncate rows when a frame is displayed
    "display.precision": 2,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Default figure size (in inches) for every seaborn/matplotlib figure below.
sns.set(rc={'figure.figsize': (12.7, 8.27)})

In [5]:
# All file paths in this notebook are built relative to the current working directory.
cd = os.path.abspath(os.getcwd())

# Personal configuration (JSON); later cells read the 'payload' entry from it.
config_path = f'{cd}/not_needed/config_personal.json'
with open(config_path) as config_file:
    data = json.load(config_file)

### Getting activities from the Strava API

In [6]:
# Strava endpoints: OAuth token exchange and the authenticated athlete's activity list.
auth_url = "https://www.strava.com/oauth/token"
activites_url = "https://www.strava.com/api/v3/athlete/activities"

# Body of the token request, taken from the config file loaded earlier.
payload = data['payload']

# Exchange the payload for an access token.
# Certificate verification stays at the requests default (enabled): this request carries
# the secrets from the config, so it must not be sent to an unverified peer.
# raise_for_status() surfaces a rejected request here rather than as a KeyError below.
res = requests.post(auth_url, data=payload, timeout=30)
res.raise_for_status()
header = {'Authorization': 'Bearer ' + res.json()['access_token']}

# Page through the activity list until the API returns an empty page, so the result is
# not capped at a fixed number of pages. Records are collected in a plain list and
# normalized once, instead of re-concatenating a growing DataFrame on every page.
activity_records = []
page = 1
while True:
    page_res = requests.get(
        activites_url,
        headers=header,
        params={'per_page': 200, 'page': page},
        timeout=30,
    )
    # Fail loudly on HTTP errors (e.g. rate limiting) instead of normalizing an
    # error payload into the activity frame.
    page_res.raise_for_status()
    my_dataset = page_res.json()
    if not my_dataset:
        break
    activity_records.extend(my_dataset)
    page += 1

# Flatten nested JSON objects into dotted column names (e.g. 'map.summary_polyline').
my_activities = pd.json_normalize(activity_records)

In [7]:
# Sanity check: (rows, columns) of the combined activity frame.
my_activities.shape

(1149, 57)

##### Write the raw activities to disk, read them back, and convert some columns
#### Select runs and walks, drop NaNs, and write them to disk as well
#### Finally, write the cleaned activities to disk.

In [8]:
# Persist the untouched API result so later cells can start from disk.
# newline='' is what pandas asks for when to_csv is given a text-mode handle; without it
# the platform's newline translation can double the line endings (e.g. on Windows).
# The index is written on purpose: the read-back cell drops it as 'unnamed: 0'.
with open(f'{cd}/csvs/raw/all_activities_raw.csv', 'w', newline='') as f:
    my_activities.to_csv(f)

In [9]:
# Original column names mapped to names that carry an explicit unit suffix.
unit_renames = {
    'average_speed': 'average_speed_mps',
    'max_speed': 'max_speed_mps',
    'moving_time': 'moving_time_s',
    'elapsed_time': 'elapsed_time_s',
}

# Read the raw dump back from disk.
with open(f'{cd}/csvs/raw/all_activities_raw.csv') as f:
    activities_raw = pd.read_csv(f)

# Lower-case all column names, drop the saved index column and 'resource_state',
# then apply the unit-suffixed names.
activities_base = (
    activities_raw
    .rename(columns=str.lower)
    .drop(columns=['unnamed: 0', 'resource_state'])
    .rename(columns=unit_renames)
)

# Full local start timestamp; the time of day is taken from it before the date is truncated.
local_start = pd.to_datetime(activities_base['start_date_local'])
# Date-only version: format as Y/m/d text and parse again, which discards the time part.
start_day = pd.to_datetime(local_start.dt.strftime("%Y/%m/%d"))
# Rows whose activity type is Run or Walk; only these receive a distance_km value.
on_foot = activities_base['type'].isin(['Run', 'Walk'])

my_activities = activities_base.assign(
    start_date_local=start_day,
    start_time=local_start.dt.time,
    # Keep only the last space-separated token of the timezone string.
    timezone=pd.Categorical(activities_base['timezone'].str.split(' ').str[-1]),
    start_day_name=pd.Categorical(start_day.dt.strftime("%A")),
    # Despite the name, this column holds an 'HH:MM:SS' string built from moving_time_s.
    moving_time_min=pd.to_datetime(activities_base['moving_time_s'], unit='s').dt.strftime('%H:%M:%S'),
    # m/s -> km/h (factor 18/5 = 3.6).
    average_speed_kmh=activities_base['average_speed_mps'] * (18/5),
    max_speed_kmh=activities_base['max_speed_mps'] * (18/5),
    # Index alignment leaves NaN for every activity that is not a Run or Walk.
    distance_km=activities_base.loc[on_foot, 'distance'] / 1000,
    visibility=pd.Categorical(activities_base['visibility']),
    name=pd.Categorical(activities_base['name']),
    type=pd.Categorical(activities_base['type']),
)

In [10]:
# Inspect column dtypes and non-null counts after the conversions above.
my_activities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149 entries, 0 to 1148
Data columns (total 62 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   name                           1149 non-null   category      
 1   distance                       1149 non-null   float64       
 2   moving_time_s                  1149 non-null   int64         
 3   elapsed_time_s                 1149 non-null   int64         
 4   total_elevation_gain           1149 non-null   float64       
 5   type                           1149 non-null   category      
 6   id                             1149 non-null   int64         
 7   external_id                    1149 non-null   object        
 8   upload_id                      1149 non-null   int64         
 9   start_date                     1149 non-null   object        
 10  start_date_local               1149 non-null   datetime64[ns]
 11  timezone         

In [11]:
# Activity id plus route polyline, split by activity type; rows missing either value are dropped.
map_columns = ['id', 'map.summary_polyline']
runs_maps = my_activities.loc[my_activities['type'] == 'Run', map_columns].dropna()
walks_maps = my_activities.loc[my_activities['type'] == 'Walk', map_columns].dropna()

In [12]:
# Write the run and walk map frames to their own CSV files.
# newline='' is what pandas asks for when to_csv is given a text-mode handle; without it
# the platform's newline translation can double the line endings (e.g. on Windows).
with open(f'{cd}/csvs/clean/runs.csv', 'w', newline='') as runs_file, \
        open(f'{cd}/csvs/clean/walks.csv', 'w', newline='') as walks_file:
    runs_maps.to_csv(runs_file)
    walks_maps.to_csv(walks_file)

In [13]:
# Columns kept for the cleaned data set, in output order.
cols = [
    'upload_id',
    'name',
    'type',
    'distance_km',
    'moving_time_min',
    'start_time',
    'start_date_local',
    'start_day_name',
    'timezone',
    'average_speed_kmh',
    'max_speed_kmh',
    'total_elevation_gain',
    'average_heartrate',
    'max_heartrate',
    'achievement_count',
    'kudos_count',
    'visibility',
]
my_acts = my_activities.loc[:, cols]

In [14]:
# Inspect dtypes and non-null counts of the reduced column set.
my_acts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149 entries, 0 to 1148
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   upload_id             1149 non-null   int64         
 1   name                  1149 non-null   category      
 2   type                  1149 non-null   category      
 3   distance_km           536 non-null    float64       
 4   moving_time_min       1149 non-null   object        
 5   start_time            1149 non-null   object        
 6   start_date_local      1149 non-null   datetime64[ns]
 7   start_day_name        1149 non-null   category      
 8   timezone              1149 non-null   category      
 9   average_speed_kmh     1149 non-null   float64       
 10  max_speed_kmh         1149 non-null   float64       
 11  total_elevation_gain  1149 non-null   float64       
 12  average_heartrate     1078 non-null   float64       
 13  max_heartrate     

In [15]:
# Persist the cleaned activity table.
# newline='' is what pandas asks for when to_csv is given a text-mode handle; without it
# the platform's newline translation can double the line endings (e.g. on Windows).
# The index is written on purpose: the cell that reads this file uses the first column as index.
with open(f'{cd}/csvs/clean/all_activities.csv', 'w', newline='') as activs_f:
    my_acts.to_csv(activs_f)

##### Load the saved, cleaned activities

In [None]:
# Reload the cleaned activities; index_col=[0] uses the first CSV column as the row index.
# NOTE(review): the handle is opened in text mode, so the encoding argument passed to
# read_csv is probably not what decodes the file — confirm before relying on it.
cleaned_csv_path = f'{cd}/csvs/clean/all_activities.csv'
with open(cleaned_csv_path, 'r') as activs_f:
    activities = pd.read_csv(
        activs_f,
        sep=',',
        encoding='iso-8859-1',
        index_col=[0],
    )

In [None]:
# Subsets used by the plots below: all runs, all walks, and runs longer than 5 km.
is_run = activities['type'] == 'Run'
is_walk = activities['type'] == 'Walk'
run = activities.loc[is_run]
walk = activities.loc[is_walk]
run_5_more = run.loc[run['distance_km'] > 5]

#### Countplots

In [None]:
# Number of activities per activity type.
sns.countplot(data=activities, x='type')

In [None]:
# Number of runs per weekday.
sns.countplot(data=run, x='start_day_name')

In [None]:
# Number of walks per weekday.
sns.countplot(data=walk, x='start_day_name')

In [None]:
# Bar chart of the four most frequent run names.
# Series.value_counts() replaces the deprecated top-level pd.value_counts(); it returns
# counts sorted in descending order, so the first four index entries are the top names.
top_run_names = run['name'].value_counts().iloc[:4].index
sns.countplot(x='name',
              data=run,
              order=top_run_names)
locs, labels = plt.xticks(rotation=45, fontsize=8)
plt.title("Different named runs and their counts")
plt.xlabel("Name of run")
plt.ylabel("Count")

#### Relplots

In [None]:
# Distance of each run longer than 5 km by weekday, coloured by run name.
sns.relplot(data=run_5_more, x='start_day_name', y='distance_km', hue='name')
locs, labels = plt.xticks(rotation=45, fontsize=8)
plt.title("Different days of week and distances")
plt.xlabel("Days")
plt.ylabel("Distance")

In [None]:
# Line plot of run distance per weekday; ci=None is passed so no confidence band is drawn.
# NOTE(review): no estimator is passed, so seaborn's default aggregation applies
# (not necessarily the per-day maximum) — confirm which statistic is intended.
sns.relplot(data=run, kind='line', x='start_day_name', y='distance_km', ci=None)
locs, labels = plt.xticks(rotation=45, fontsize=15)
plt.title("Different days of week and distances")
plt.xlabel("Days")
plt.ylabel("Distance")

In [None]:
# Moving time against distance, one panel per weekday.
# The frame has no 'moving_time(min)' column; the column is 'moving_time_min', and it
# holds 'HH:MM:SS' strings. Convert it to numeric minutes for this plot so the y axis is
# continuous instead of one category per distinct string.
run_minutes = run.assign(
    moving_time_min=pd.to_timedelta(run['moving_time_min']).dt.total_seconds() / 60
)
sns.relplot(x='distance_km', y='moving_time_min', hue='start_day_name',
            col='start_day_name', col_wrap=3, data=run_minutes)

#### Pointplots

In [None]:
# Point plot of distance per weekday, restricted to runs longer than 5 km.
sns.pointplot(data=run_5_more, x='start_day_name', y='distance_km')
plt.title("Different days of week and distances 5km and more")
plt.xlabel("Days")
plt.ylabel("Distance")

#### Violinplots

In [None]:
# Distribution of run distance for each weekday.
sns.violinplot(data=run, x='start_day_name', y='distance_km', palette='muted')
plt.title("Different days of week and distances")
plt.xlabel("Days")
plt.ylabel("Distance")

#### Scatterplots + FacetGrid

In [None]:
# Moving time against distance for runs longer than 5 km.
# Two fixes: a stray token inside the call made this cell a SyntaxError, and the y column
# was named 'moving_time(min)', which does not exist (the column is 'moving_time_min').
# That column holds 'HH:MM:SS' strings, so convert it to numeric minutes for a continuous axis.
run_5_more_minutes = run_5_more.assign(
    moving_time_min=pd.to_timedelta(run_5_more['moving_time_min']).dt.total_seconds() / 60
)
sns.scatterplot(x='distance_km',
                y='moving_time_min',
                data=run_5_more_minutes)
locs,label = plt.xticks(rotation=90, fontsize=3)

In [None]:
# One scatter panel per weekday (two panels per row): max speed against distance for runs.
weekday_grid = sns.FacetGrid(run, col='start_day_name', col_wrap=2)
g = weekday_grid.map(sns.scatterplot, 'distance_km', 'max_speed_kmh')


#### Regplots

In [None]:
# Switch the seaborn theme for the regression plots, then fit average speed against distance.
sns.set(style="ticks", context="talk")
avg_speed_ax = sns.regplot(data=run, x="distance_km", y="average_speed_kmh")
avg_speed_ax.set_title("Average Speed vs Distance")

In [None]:
# Same theme as the previous regression plot; fit max speed against distance.
sns.set(style="ticks", context="talk")
max_speed_ax = sns.regplot(data=run, x='distance_km', y='max_speed_kmh')
max_speed_ax.set_title("Max Speed vs Distance")

In [None]:
# Distance spread per run name, restricted to runs longer than 5 km.
sns.boxplot(data=run_5_more, x='name', y='distance_km')
locs, labels = plt.xticks(rotation=45, fontsize=12)

#### Distplots

In [None]:
# Distribution of run distances (distance_km column of the run subset).
sns.displot(run['distance_km'])