In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the CSV with pandas defaults (first line becomes the column header)
# and display the resulting DataFrame.
df = pd.read_csv("c2_file.csv")
df

Unnamed: 0,a,b,c,d
0,yellow,10,2,3.2
1,green,2,3,8.1
2,blue,7,1,0.4


In [3]:
# header=None: no line is used for column names, so the columns are numbered
# and the file's own header line shows up as the first data row.
pd.read_csv("c2_file.csv", header=None)

Unnamed: 0,0,1,2,3
0,a,b,c,d
1,yellow,10,2,3.2
2,green,2,3,8.1
3,blue,7,1,0.4


In [4]:
# names= supplies custom column labels. The file's own header line is still
# read as data (row 0 in the output); pass header=0 as well to replace it.
pd.read_csv("c2_file.csv", names=["column 1", "column 2", "column 3", "column 4"])

Unnamed: 0,column 1,column 2,column 3,column 4
0,a,b,c,d
1,yellow,10,2,3.2
2,green,2,3,8.1
3,blue,7,1,0.4


In [5]:
# Column dtypes as inferred by read_csv.
df.dtypes

a     object
b      int64
c      int64
d    float64
dtype: object

In [6]:
# Force data types when importing: column "b" is read as float64 here
# instead of the int64 that read_csv infers by default.

df2 = pd.read_csv("c2_file.csv", dtype={"b": np.float64})
df2.dtypes

a     object
b    float64
c      int64
d    float64
dtype: object

In [7]:
# Import partial data: usecols restricts the read to the listed columns.

pd.read_csv("c2_file.csv", usecols=["a", "b"])

Unnamed: 0,a,b
0,yellow,10
1,green,2
2,blue,7


In [8]:
# Read an Excel workbook; with no sheet_name given, the first sheet is loaded.
pd.read_excel("c2_data.xls")

Unnamed: 0,varA,varB,varC
0,0.391723,-0.155122,0.381104
1,0.575125,-0.105817,0.232245
2,0.672305,0.424688,-0.694795
3,0.766115,-0.79135,-0.028739
4,0.677259,-0.817543,-0.537088
5,-0.029702,-0.891848,-0.682719
6,-0.161366,-0.6596,-0.727898
7,0.031672,0.016607,-0.940479
8,0.833212,-0.503236,-0.88721
9,0.907753,0.265177,-0.390762


In [9]:
# Import from a different sheet by passing its name via sheet_name.

pd.read_excel("c2_data.xls", sheet_name="Sheet2")

Unnamed: 0,varD,varE,varF
0,0.907753,0.265177,-0.390762
1,0.755019,-0.768056,-0.528307
2,0.850692,-0.537159,-0.601387
3,0.131663,0.941327,0.240073
4,0.5744,0.091735,-0.395277
5,0.81663,0.875612,-0.880044
6,0.536732,0.175428,-0.473053
7,-0.084641,-0.042827,0.053344
8,0.268271,-0.010628,-0.090952
9,0.166792,-0.872579,-0.556899


In [10]:
# Read a JSON file that is already laid out as a table (row labels x columns).
pd.read_json("c2_frame.json")

Unnamed: 0,col1,col2,col3,col4
row1,0,1,2,3
row2,4,5,6,7
row3,8,9,10,11
row4,12,13,14,15


In [11]:
# import nested JSON data (json module + pandas.json_normalize)

import json
from pandas import json_normalize

In [12]:
# Read the raw JSON text. The encoding is given explicitly because JSON files
# are UTF-8 and the platform default encoding is not UTF-8 everywhere.
with open("c2_books.json", "r", encoding="utf-8") as f:
    json_string = f.read()

# Parse once the file is closed; `dictionary` holds the decoded JSON document.
dictionary = json.loads(json_string)

In [13]:
# Flatten the list of records stored under the 'books' key into a table,
# one row per record.
json_normalize(dictionary, 'books')

Unnamed: 0,isbn,title,subtitle,author,published,publisher,pages,description,website
0,9781593275846,"Eloquent JavaScript, Second Edition",A Modern Introduction to Programming,Marijn Haverbeke,2014-12-14T00:00:00.000Z,No Starch Press,472,JavaScript lies at the heart of almost every m...,http://eloquentjavascript.net/
1,9781449331818,Learning JavaScript Design Patterns,A JavaScript and jQuery Developer's Guide,Addy Osmani,2012-07-01T00:00:00.000Z,O'Reilly Media,254,"With Learning JavaScript Design Patterns, you'...",http://www.addyosmani.com/resources/essentialj...
2,9781449365035,Speaking JavaScript,An In-Depth Guide for Programmers,Axel Rauschmayer,2014-02-01T00:00:00.000Z,O'Reilly Media,460,"Like it or not, JavaScript is everywhere these...",http://speakingjs.com/


In [14]:
# html / web scraping

import requests

In [15]:
# Download the archived page. timeout= stops the cell from hanging
# indefinitely if the server never responds (requests has no default timeout).
page = requests.get(
    "https://web.archive.org/web/20180908144902/en.proverbia.net/shortfamousquotes.asp",
    timeout=30,
)

In [16]:
# Peek at the first 100 characters of the response body.
page.text[0:100]

'\n<!DOCTYPE html>\n\n<html lang="en" xml:lang="en">\n<head><script src="//archive.org/includes/analytics'

In [17]:
# HTTP status code of the response (200 in the recorded run).
page.status_code

200

In [18]:
from bs4 import BeautifulSoup

In [19]:
# Parse the downloaded HTML using the "html.parser" backend.
soup = BeautifulSoup(page.text, "html.parser")

In [20]:
# Identify the HTML pattern that wraps the data we need; here every quote
# is selected by its <blockquote> element.

quotes = soup.find_all("blockquote")

In [21]:
# Show the matched <blockquote> elements.
quotes

[<blockquote>There is a natural aristocracy among men. The grounds of this are virtue and talents. </blockquote>,
 <blockquote>All our words from loose using have lost their edge. </blockquote>,
 <blockquote>God couldn't be everywhere, so he created mothers </blockquote>,
 <blockquote>Be not afraid of going slowly, be afraid only of standing still. </blockquote>,
 <blockquote>Learn from yesterday, live for today, hope for tomorrow. </blockquote>,
 <blockquote>Do not confine your children to your own learning, for they were born in another time. </blockquote>,
 <blockquote>I hear and I forget, I see and I remember. I do and I understand. </blockquote>,
 <blockquote>In teaching others we teach ourselves. </blockquote>,
 <blockquote>Happiness will never come to those who fail to appreciate what they already have. </blockquote>,
 <blockquote>Without His love I can do nothing, with His love there is nothing I cannot do. </blockquote>]

In [22]:
# The find_all result behaves like a Python list, so it can be indexed and iterated.

quotes[0].text

# The .text attribute extracts the element's text and returns it as a string.

'There is a natural aristocracy among men. The grounds of this are virtue and talents. '

In [23]:
# Extract the text of every quote element in one pass.

quote_list = [element.text for element in quotes]

In [24]:
# Build a one-column DataFrame from the extracted quote strings.
# NOTE(review): this rebinds `df`, which earlier held the CSV data.

df = pd.DataFrame(quote_list, columns=["Quote"])
df

Unnamed: 0,Quote
0,There is a natural aristocracy among men. The ...
1,All our words from loose using have lost their...
2,"God couldn't be everywhere, so he created moth..."
3,"Be not afraid of going slowly, be afraid only ..."
4,"Learn from yesterday, live for today, hope for..."
5,Do not confine your children to your own learn...
6,"I hear and I forget, I see and I remember. I d..."
7,In teaching others we teach ourselves.
8,Happiness will never come to those who fail to...
9,"Without His love I can do nothing, with His lo..."


In [25]:
# Select the <p class="a"> elements, which carry the author lines.
authors = soup.find_all("p", class_="a")

In [26]:
# Raw text of the first author element (note the surrounding newlines).
authors[0].text

'\nThomas Jefferson (1743-1826) Third president of the United States.\n'

In [27]:
# Drop the first and last character to remove the surrounding newlines.
authors[0].text[1:-1]

'Thomas Jefferson (1743-1826) Third president of the United States.'

In [28]:
# Collect the author line for every quote. strip() drops surrounding
# whitespace/newlines whatever their count, so an entry without the
# leading/trailing newline is never truncated the way a fixed [1:-1] slice
# would truncate it.
author_list = [author.text.strip() for author in authors]

# Adds the column to the quotes DataFrame; pandas raises if the number of
# authors does not match the number of rows.
df["Author"] = author_list
df

Unnamed: 0,Quote,Author
0,There is a natural aristocracy among men. The ...,Thomas Jefferson (1743-1826) Third president o...
1,All our words from loose using have lost their...,Ernest Hemingway (1898-1961) American Writer.
2,"God couldn't be everywhere, so he created moth...",Jewish proverb
3,"Be not afraid of going slowly, be afraid only ...",Chinese proverb
4,"Learn from yesterday, live for today, hope for...",Unknown Source
5,Do not confine your children to your own learn...,Chinese proverb
6,"I hear and I forget, I see and I remember. I d...",Chinese proverb
7,In teaching others we teach ourselves.,Proverb
8,Happiness will never come to those who fail to...,Unknown Source
9,"Without His love I can do nothing, with His lo...",Unknown Source


In [29]:
# Scrape HTML tables directly: read_html returns a list of DataFrames,
# one per table it finds on the page.

tables = pd.read_html("https://world.openfoodfacts.org/additives")
print(len(tables)) # 1
print(tables[0].head())

1
                   Additive  Products    * Risk
0        E330 - Citric acid    140296  NaN  NaN
1          E322 - Lecithins     96492  NaN  NaN
2          E322i - Lecithin     87325  NaN  NaN
3  E500 - Sodium carbonates     59691  NaN  NaN
4        E415 - Xanthan gum     52513  NaN  NaN


In [30]:
# A page can hold several tables; count how many read_html found.
tables = pd.read_html(
    "https://en.wikipedia.org/wiki/World_record_progression_50_metres_freestyle"
)
print(len(tables))

9


In [31]:
# Inspect the first rows of the fifth table (index 4).
print(tables[4].head())

   Pos   Time                   Swimmer              Date          Venue   Ref
0    1  20.91         Cesar Cielo (BRA)  17 December 2009         Brazil   NaN
1    2  20.94  Frederick Bousquet (FRA)     22 April 2009         France   NaN
2    3  21.04      Caeleb Dressel (USA)      27 July 2019    South Korea   NaN
3    3  21.04      Caeleb Dressel (USA)      20 June 2021          Omaha  [19]
4    4  21.11      Benjamin Proud (GBR)     3 August 2018  Great Britain   NaN


In [32]:
# match= keeps only the tables whose text matches the given string/regex,
# here the ones mentioning "Switzerland".

tables = pd.read_html(
    "https://en.wikipedia.org/wiki/World_record_progression_50_metres_freestyle",
    match="Switzerland",
)
print(len(tables))  # 1
# Rows 10-14 of the single matching table, restricted to three columns.
print(tables[0][10:15][["Time", "Name", "Nationality"]])

1
     Time          Name    Nationality
10  22.54   Robin Leamy  United States
11  22.52  Dano Halsall    Switzerland
12  22.40     Tom Jager  United States
13  22.33   Matt Biondi  United States
14  22.33   Matt Biondi  United States


In [33]:
# Load the OAuth client credentials from a local JSON file.
with open('client-credentials.json') as fh:
    client_credentials = json.load(fh)

# Print only the key names, not the credential values.
print('Credentials:', list(client_credentials))

Credentials: ['client_id', 'client_secret']


In [34]:
# Show the client id that identifies this application.
print(client_credentials['client_id'])

79974


In [35]:
# Query parameters for the authorization link built from this dict.
oauth_params = {
    'client_id': client_credentials['client_id'],
    'scope': 'read_all,profile:read_all,activity:read_all',  # comma-separated list of requested scopes
    'redirect_uri': 'https://localhost',  # where the browser is sent after authorization
    'response_type': 'code'  # request an authorization code
}

In [36]:
from urllib.parse import urlencode

# Build the authorization link (base URL + URL-encoded parameters) that users
# copy/paste into their browser to authorize our app.
print(f"https://www.strava.com/oauth/authorize?{urlencode(oauth_params)}")

https://www.strava.com/oauth/authorize?client_id=79974&scope=read_all%2Cprofile%3Aread_all%2Cactivity%3Aread_all&redirect_uri=https%3A%2F%2Flocalhost&response_type=code


In [37]:
from getpass import getpass

# After authorizing the app, the user is redirected to the redirect_uri with
# the authorization code in the query string; paste that full URL here.
# getpass masks the pasted value instead of echoing it into the notebook.
authorization_response = getpass(prompt='Full callback URL')

Full callback URL ······························································································································


In [38]:
from urllib.parse import urlparse, parse_qs

# Extract the Authorization Code from the callback URL's query string.
# parse_qs maps every key to a list of values, hence the trailing [0].
# NOTE(review): raises KeyError when the URL carries no `code` parameter
# (presumably the case when authorization was refused) — consider handling it.
authorization_code = parse_qs(urlparse(authorization_response).query)['code'][0]

In [39]:
# Raw query string of the callback URL.
# NOTE(review): this displays the authorization code; clear the output before sharing.
urlparse(authorization_response).query

'state=&code=731b268522215a3d49483a64b815413797a543e8&scope=read,activity:read_all,profile:read_all,read_all'

In [40]:
# Parsed query string: a dict mapping each parameter name to a list of values.
parse_qs(urlparse(authorization_response).query)
# Returns
# {'code': ['...'],
#  'scope': ['read,activity:read_all,profile:read_all,read_all']}

{'code': ['731b268522215a3d49483a64b815413797a543e8'],
 'scope': ['read,activity:read_all,profile:read_all,read_all']}

In [41]:
# NOTE(review): prints the authorization code in clear text — clear this
# cell's output before sharing or committing the notebook.
print(authorization_code)

731b268522215a3d49483a64b815413797a543e8


GET queries to list the entries
POST queries to create new entries
PUT queries to update existing entries

In [42]:
import requests

# Exchange the Authorization Code for an Access Token.
# timeout= stops the cell from hanging indefinitely if the server never
# responds (requests applies no timeout by default).
r = requests.post('https://www.strava.com/oauth/token', data={
    'client_id': client_credentials['client_id'],
    'client_secret': client_credentials['client_secret'],
    'code': authorization_code,
    'grant_type': 'authorization_code'
}, timeout=30)
r.status_code # 200

200

In [43]:
# NOTE(review): the raw response body contains the access and refresh tokens
# in clear text — clear this cell's output before sharing or committing.
print(r.text)

{"token_type":"Bearer","expires_at":1648140803,"expires_in":20773,"refresh_token":"02ab9f77ac33be4acb712d31da40c065dec427fc","access_token":"c3f8be9d3a98bce2e201d90f4f17be1d78806c8e","athlete":{"id":30808165,"username":null,"resource_state":2,"firstname":"Robert","lastname":"Whelan","bio":null,"city":null,"state":null,"country":null,"sex":"M","premium":false,"summit":false,"created_at":"2018-05-14T09:07:39Z","updated_at":"2022-03-22T10:30:17Z","badge_type_id":0,"weight":0.0,"profile_medium":"https://lh5.googleusercontent.com/-pUx9kgGUeT8/AAAAAAAAAAI/AAAAAAAAAAA/AMZuuckxBDA9djO_nteCfrsXBNLuuzkN9Q/photo.jpg","profile":"https://lh5.googleusercontent.com/-pUx9kgGUeT8/AAAAAAAAAAI/AAAAAAAAAAA/AMZuuckxBDA9djO_nteCfrsXBNLuuzkN9Q/photo.jpg","friend":null,"follower":null}}


In [44]:
# Decode the JSON body into a Python dict.
# NOTE(review): the displayed dict includes the access and refresh tokens —
# clear this cell's output before sharing or committing.
r.json()

{'token_type': 'Bearer',
 'expires_at': 1648140803,
 'expires_in': 20773,
 'refresh_token': '02ab9f77ac33be4acb712d31da40c065dec427fc',
 'access_token': 'c3f8be9d3a98bce2e201d90f4f17be1d78806c8e',
 'athlete': {'id': 30808165,
  'username': None,
  'resource_state': 2,
  'firstname': 'Robert',
  'lastname': 'Whelan',
  'bio': None,
  'city': None,
  'state': None,
  'country': None,
  'sex': 'M',
  'premium': False,
  'summit': False,
  'created_at': '2018-05-14T09:07:39Z',
  'updated_at': '2022-03-22T10:30:17Z',
  'badge_type_id': 0,
  'weight': 0.0,
  'profile_medium': 'https://lh5.googleusercontent.com/-pUx9kgGUeT8/AAAAAAAAAAI/AAAAAAAAAAA/AMZuuckxBDA9djO_nteCfrsXBNLuuzkN9Q/photo.jpg',
  'profile': 'https://lh5.googleusercontent.com/-pUx9kgGUeT8/AAAAAAAAAAI/AAAAAAAAAAA/AMZuuckxBDA9djO_nteCfrsXBNLuuzkN9Q/photo.jpg',
  'friend': None,
  'follower': None}}

In [45]:
# Token saver
def token_saver(token_obj, path='token.json'):
    """Write a token mapping to disk as indented JSON.

    Parameters
    ----------
    token_obj : dict-like
        JSON-serialisable token payload to store.
    path : str, optional
        Destination file. Defaults to 'token.json' in the current working
        directory. An existing file is overwritten.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If ``token_obj`` contains values that json cannot serialise.
    OSError
        If the file cannot be opened for writing.
    """
    with open(path, 'w', encoding='utf-8') as file:
        json.dump(token_obj, file, indent=4)

# Persist the decoded token response to disk.
token_saver(r.json())

In [46]:
# Token loader
def get_token(path='token.json'):
    """Load a previously saved token from a JSON file.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to 'token.json' in the current working
        directory.

    Returns
    -------
    The decoded JSON content of the file (a dict for a saved token).

    Raises
    ------
    FileNotFoundError
        If the file does not exist.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Reload the saved token from disk and list its fields.
token = get_token()
token.keys() # 'token_type', 'expires_at', 'expires_in', 'refresh_token', 'access_token', 'athlete'

dict_keys(['token_type', 'expires_at', 'expires_in', 'refresh_token', 'access_token', 'athlete'])

In [47]:
# Raw expiry fields of the token.
print('Expires in:', token['expires_in']) # lifetime in seconds; initially: 21600 (6 hours)
print('Expires at:', token['expires_at']) # Unix timestamp, in seconds

Expires in: 20773
Expires at: 1648140803


In [48]:
from datetime import datetime, timedelta

# Convert the raw numbers into human-readable values.
print('Expires at:', datetime.fromtimestamp(token['expires_at'])) # date, time (naive, local timezone)
print('Expires in:', timedelta(seconds=token['expires_in'])) # time delta

Expires at: 2022-03-24 16:53:23
Expires in: 5:46:13


In [49]:
# Refresh expired Access Tokens using the stored refresh token.
# timeout= stops the cell from hanging indefinitely if the server never responds.
r = requests.post('https://www.strava.com/oauth/token', data={
    'client_id': client_credentials['client_id'],
    'client_secret': client_credentials['client_secret'],
    'refresh_token': token['refresh_token'],
    'grant_type': 'refresh_token'
}, timeout=30)

# Fail loudly on an HTTP error so an error payload is never written over the
# stored token (which would lose the refresh token).
r.raise_for_status()

token_saver(r.json())
token = get_token()

In [50]:
# List activities.
# The access token is sent in the Authorization header rather than in the
# query string, so it does not end up in URLs, server logs or proxy history.
# timeout= stops the cell from hanging indefinitely if the server never responds.
r = requests.get(
    'https://www.strava.com/api/v3/athlete/activities',
    headers={'Authorization': 'Bearer ' + token['access_token']},
    timeout=30,
)
r.status_code # 200

200

In [51]:
# Write the decoded activities response to disk as indented JSON.
with open('activities.json', 'w') as fh:
    json.dump(r.json(), fh, indent=4)

In [52]:
# Load data into DataFrame.
# Passing a literal JSON string to read_json is deprecated in recent pandas
# (2.1+), so the response text is wrapped in a file-like StringIO buffer.
from io import StringIO

activities_df = pd.read_json(StringIO(r.text))
activities_df[['name', 'type', 'distance', 'elapsed_time', 'max_speed']]

Unnamed: 0,name,type,distance,elapsed_time,max_speed
0,Afternoon Run,Run,16995.0,5650,6.474
1,Afternoon Run,Run,11216.1,3618,7.866
2,Box hill loop,Ride,84846.7,14453,15.654
3,Geneva half,Run,20210.2,6924,5.774
4,Afternoon Run,Run,11243.5,3607,7.356
5,Cycling back from the theatre of disappointment,Ride,34752.3,6290,13.188
6,Cycling up to the theatre of dreams,Ride,31953.2,5819,13.056
7,Afternoon Run,Run,8418.7,2695,6.212
8,Afternoon Ride,Ride,22388.4,3593,11.614
9,Morning Ride,Ride,55184.6,8865,13.236


In [53]:
from requests_oauthlib import OAuth2Session

In [54]:
# Create a session for initialization (the authorization step only);
# it is given the client id, the redirect URI and the requested scopes.
init_session = OAuth2Session(
    client_credentials['client_id'],
    redirect_uri='https://localhost',
    scope='read_all,profile:read_all,activity:read_all'
)

In [55]:
# Get authorization link. authorization_url returns the link together with
# the `state` value that it embeds in the URL.
user_link, state = init_session.authorization_url('https://www.strava.com/oauth/authorize')
print('Visit link:', user_link)

Visit link: https://www.strava.com/oauth/authorize?response_type=code&client_id=79974&redirect_uri=https%3A%2F%2Flocalhost&scope=read_all%2Cprofile%3Aread_all%2Cactivity%3Aread_all&state=xOWs2kB1J03q1i9ktAtrEoIczRDIhF


In [56]:
# Paste the full redirect URL; getpass masks it instead of echoing it.
authorization_response = getpass(prompt='Full callback URL')

Full callback URL ····························································································································································


In [57]:
# Get Access Token: the session is handed the full callback URL and the
# client secret and performs the exchange at the token endpoint.
token = init_session.fetch_token(
    'https://www.strava.com/oauth/token',
    authorization_response=authorization_response,
    include_client_id=True,  # also send client_id in the request body
    client_secret=client_credentials['client_secret']
)

In [58]:
# Persist the freshly fetched token to disk.
token_saver(token)

In [59]:
# Create a session for reaching the API. It carries the token and is
# configured to refresh it and to save the refreshed token via token_saver.
api_session = OAuth2Session(
    client_credentials['client_id'],
    token=token, # pass Access Token
    
    # Automatically refresh expired token
    auto_refresh_url='https://www.strava.com/oauth/token',
    # Extra credentials sent along with the refresh request
    auto_refresh_kwargs={
        'client_id': client_credentials['client_id'],
        'client_secret': client_credentials['client_secret']
    },
    token_updater=token_saver # automatically saves new tokens
)

In [60]:
# List activities through the OAuth2 session (no token is passed explicitly).
r = api_session.get('https://www.strava.com/api/v3/athlete/activities')
r.status_code # 200

200

In [61]:
# Load the activities into a DataFrame.
# Passing a literal JSON string to read_json is deprecated in recent pandas
# (2.1+), so the response text is wrapped in a file-like StringIO buffer.
from io import StringIO

activities_df = pd.read_json(StringIO(r.text))
activities_df[['name', 'type', 'distance', 'elapsed_time', 'max_speed']]

Unnamed: 0,name,type,distance,elapsed_time,max_speed
0,Afternoon Run,Run,16995.0,5650,6.474
1,Afternoon Run,Run,11216.1,3618,7.866
2,Box hill loop,Ride,84846.7,14453,15.654
3,Geneva half,Run,20210.2,6924,5.774
4,Afternoon Run,Run,11243.5,3607,7.356
5,Cycling back from the theatre of disappointment,Ride,34752.3,6290,13.188
6,Cycling up to the theatre of dreams,Ride,31953.2,5819,13.056
7,Afternoon Run,Run,8418.7,2695,6.212
8,Afternoon Ride,Ride,22388.4,3593,11.614
9,Morning Ride,Ride,55184.6,8865,13.236


In [62]:
from stravalib import Client

# Create client
client = Client()

# Get Authorization URL (scopes are passed as a list here, not as a
# comma-separated string)
user_link = client.authorization_url(
    client_id=client_credentials['client_id'],
    redirect_uri='https://localhost',
    scope=['read_all', 'profile:read_all', 'activity:read_all']
)
print('Visit link:', user_link)

Visit link: https://www.strava.com/oauth/authorize?client_id=79974&redirect_uri=https%3A%2F%2Flocalhost&approval_prompt=auto&response_type=code&scope=read_all%2Cprofile%3Aread_all%2Cactivity%3Aread_all


In [63]:
# Paste the full redirect URL (masked by getpass), then pull the
# authorization code out of its query string; parse_qs returns lists,
# hence the [0].
authorization_response = getpass(prompt='Full callback URL')
authorization_code = parse_qs(urlparse(authorization_response).query)['code'][0]

Full callback URL ······························································································································


In [64]:
# Get Access Token by exchanging the authorization code, then persist it.
token = client.exchange_code_for_token(
    client_id=client_credentials['client_id'],
    client_secret=client_credentials['client_secret'],
    code=authorization_code)
token_saver(token)

In [65]:
import time

# Refresh token if necessary: the token counts as expired once the current
# Unix time is past its 'expires_at' value. The new token is saved to disk.
if time.time() > token['expires_at']:
    token = client.refresh_access_token(
        client_id=client_credentials['client_id'],
        client_secret=client_credentials['client_secret'],
        refresh_token=token['refresh_token'])
    token_saver(token)

In [66]:
# Get activities (limit=5). The call returns an iterator object, not a list.
activities = client.get_activities(limit=5)
activities # <BatchedResultsIterator entity=Activity>

<BatchedResultsIterator entity=Activity>

In [67]:
# Iterate over the result and print each activity's repr.
for activity in activities:
    print(activity)

<Activity id=6871657838 name='Afternoon Run' resource_state=2>
<Activity id=6854579155 name='Afternoon Run' resource_state=2>
<Activity id=6848060758 name='Box hill loop' resource_state=2>
<Activity id=6811083130 name='Geneva half ' resource_state=2>
<Activity id=6793534331 name='Afternoon Run' resource_state=2>


In [68]:
# Materialise the iterator and take the first activity.
a = list(activities)[0]

# Read a few of its attributes.
print('Activity name:', a.name)
print('Distance:', a.distance)
print('Athlete name:', a.athlete.firstname)
print('Average heart rate:', a.average_heartrate)

Activity name: Afternoon Run
Distance: 16995.00 m
Athlete name: None
Average heart rate: 163.1
