# Análisis de los datos de Strava

In [1]:
%%capture
# Install necessary packages
!pip install fitdecode
!pip install Path
!pip install zipfile36
!pip install dateparser

Importing the necessary data

In [36]:
# Importing packages
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import dateparser

# Positions (0-based) of the columns kept from the Strava export
usecols = [0, 1, 2, 3, 4, 5, 7, 8, 12, 16, 17, 18, 20, 31]

# English column names, given in the same order as `usecols`
names = [
    "Activity ID",
    "Activity Date",
    "Activity Name",
    "Activity Type",
    "Activity Description",
    "Elapsed Time",
    "Max Heart Rate",
    "Relative Effort",
    "Filename",
    "Moving Time",
    "Distance",
    "Max Pace",
    "Elevation Gain",
    "Average Heart Rate",
]

# Reading the raw data for reference (only used for the row count below)
raw = pd.read_csv("activities.csv")

# Reading the selected columns; header=0 discards the original header row so
# that `names` replaces it, and the first kept column becomes the index.
df = pd.read_csv(
    "activities.csv",
    index_col=0,
    usecols=usecols,
    names=names,
    header=0,
)

# Parse the dates after loading: read_csv's `date_parser` argument is
# deprecated. dateparser handles the localized date strings element by element;
# missing values are skipped and unparseable ones become NaT.
df["Activity Date"] = pd.to_datetime(
    df["Activity Date"].map(dateparser.parse, na_action="ignore")
)

print(f"{raw.shape[0]} rows in raw file")

# Drop rows with missing values in Moving Time or Distance
df = df.dropna(axis=0, subset=["Moving Time", "Distance"])

print(f"{df.shape[0]} rows remaining after cleaning")

# Creating new columns:
# Add day, week, month, quarter, year columns (as strings).
# Each column is inserted at position 3, so they end up in reverse order:
# Year, Quarter, Month, Week, Day.
period_names = ["Day", "Week", "Month", "Quarter", "Year"]
periods = ["D", "W", "M", "Q", "Y"]
for col_name, freq in zip(period_names, periods):
    df.insert(3, col_name, df["Activity Date"].dt.to_period(freq).astype(str))

# Convert moving time from seconds to hours
df.insert(13, "Moving Time (hr)", df["Moving Time"] / 3600)
# Convert distance from meters to kilometers
df.insert(16, "Distance (km)", df["Distance"] / 1000)
# Calculate average speed
df.insert(17, "Average Speed (km/hr)", df["Distance (km)"] / df["Moving Time (hr)"])

# Calculate maximum speed
# NOTE(review): the factor 3.6 presumes "Max Pace" is a speed in m/s - confirm.
df.insert(18, "Max Speed (km/hr)", df["Max Pace"] * 3.6)

# Print date bounds of the data
print(f"Ranges from {df.Day.min()} to {df.Day.max()}")

# Preview the data
df.tail()


The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.



550 rows in raw file
550 rows remaining after cleaning
Ranges from 2018-09-09 to 2023-08-30


Unnamed: 0_level_0,Activity Date,Activity Name,Activity Type,Year,Quarter,Month,Week,Day,Activity Description,Elapsed Time,...,Filename,Moving Time (hr),Moving Time,Distance,Distance (km),Average Speed (km/hr),Max Speed (km/hr),Max Pace,Elevation Gain,Average Heart Rate
Activity ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9638976449,2023-08-13 09:34:26,Bicicleta a la hora del almuerzo,Bicicleta,2023,2023Q3,2023-08,2023-08-07/2023-08-13,2023-08-13,,9806,...,activities/10334897911.fit.gz,2.180833,7851.0,44452.949219,44.452949,20.383469,55.281095,15.35586,1087.0,
9644330136,2023-08-05 16:55:14,Carrera de noche,Carrera,2023,2023Q3,2023-08,2023-07-31/2023-08-06,2023-08-05,,1870,...,activities/10340562580.fit.gz,0.516111,1858.0,6679.859863,6.67986,12.942678,26.819999,7.45,0.0,185.839615
9683261634,2023-08-20 06:21:37,Bicicleta por la mañana,Bicicleta,2023,2023Q3,2023-08,2023-08-14/2023-08-20,2023-08-20,,4957,...,activities/10381618854.fit.gz,1.183333,4260.0,24721.830078,24.72183,20.891687,49.886248,13.857291,535.0,
9750863053,2023-08-21 17:16:47,Bicicleta al anochecer,Bicicleta,2023,2023Q3,2023-08,2023-08-21/2023-08-27,2023-08-21,,6049,...,activities/10453731608.fit.gz,1.405556,5060.0,25067.339844,25.06734,17.834471,36.490079,10.136133,62.0,
9750871220,2023-08-30 15:58:50,Bicicleta por la tarde,Bicicleta,2023,2023Q3,2023-08,2023-08-28/2023-09-03,2023-08-30,,5646,...,activities/10453739983.fit.gz,1.412778,5086.0,29676.060547,29.676061,21.00547,54.762891,15.211914,667.0,


Calculating the cumulative sums

In [7]:
# Define a time unit: "Year", "Quarter", "Month", "Week", or "Day"
time_unit_km = "Month"

# Group by time_unit_km and activity type
df_km = df.groupby(by=[time_unit_km, "Activity Type"], as_index=False).agg(
    count=("Distance (km)", "count"),
    total_distance_km=("Distance (km)", "sum"),
    avg_distance_km=("Distance (km)", "mean"),
)

# For each activity and time period, make sure there exists a row.
# This ensures there is a point on the plot for each combination.
acts = df_km["Activity Type"].unique()
times = df_km[time_unit_km].unique()

# (activity, period) pairs that already have data
existing_pairs = set(zip(df_km["Activity Type"], df_km[time_unit_km]))

# Zero-filled rows for every missing combination
new_rows = [
    {
        time_unit_km: t,
        "Activity Type": a,
        "count": 0,
        "total_distance_km": 0,
        "avg_distance_km": 0,
    }
    for a in acts
    for t in times
    if (a, t) not in existing_pairs
]

# Append the filler rows (skipped when nothing is missing, to avoid
# concatenating an empty frame)
if new_rows:
    df_km = pd.concat([df_km, pd.DataFrame(new_rows)], ignore_index=True)

# Find and exclude activities with <= 1 km total covered (e.g., weight training)
# You can increase or decrease this cutoff based on your data
kms = df_km.groupby(by=["Activity Type"], as_index=False)["total_distance_km"].sum()
kms = kms[kms["total_distance_km"] > 1]

# For each activity and time period, calculate the cumulative sum of kms.
# Work on a sorted copy so df_km is not modified and no chained-assignment
# warning is raised.
csum = (
    df_km.loc[df_km["Activity Type"].isin(kms["Activity Type"])]
    .sort_values(by=["Activity Type", time_unit_km])
    .copy()
)
csum["csum_km"] = csum.groupby("Activity Type")["total_distance_km"].cumsum()

csum

Unnamed: 0,Month,Activity Type,count,total_distance_km,avg_distance_km,csum_km
0,2018-09,Bicicleta,6,159.075098,26.512516,159.075098
100,2018-10,Bicicleta,0,0.000000,0.000000,159.075098
2,2018-11,Bicicleta,1,25.858400,25.858400,184.933499
4,2018-12,Bicicleta,2,25.319801,12.659900,210.253299
5,2019-01,Bicicleta,1,44.595602,44.595602,254.848901
...,...,...,...,...,...,...
91,2023-04,Carrera,14,126.872651,9.062332,2063.925490
93,2023-05,Carrera,14,147.999292,10.571378,2211.924782
95,2023-06,Carrera,16,137.116259,8.569766,2349.041041
97,2023-07,Carrera,9,91.196648,10.132961,2440.237690


## Análisis general

Gráfica de los kilómetros recorridos por tipo de actividad

In [11]:
# Total distance over all activity types, shown in the plot title
total_km = round(df_km["total_distance_km"].sum())

# Number format of every field listed in the hover box
km_hover_formats = {
    "csum_km": ":.1f",
    "count": ":f",
    "total_distance_km": ":.1f",
    "avg_distance_km": ":.1f",
}

# Display names for the aggregated columns
km_labels = {
    "count": "Number of activities",
    "avg_distance_km": "Average kms per activity",
    "total_distance_km": "Total kms covered",
    "csum_km": "Cumulative kms covered",
}

# Stacked area plot of the cumulative distance per activity type
fig_km = px.area(
    csum,
    x=time_unit_km,
    y="csum_km",
    color="Activity Type",
    title=f"My {total_km} Kilometers on Strava!",
    hover_data=km_hover_formats,
    labels=km_labels,
    color_discrete_sequence=px.colors.qualitative.Bold,
)

# Upper bound on the number of ticks per axis
fig_km.update_xaxes(nticks=20)
fig_km.update_yaxes(nticks=15)

# Fixed figure size, white template, horizontally centered title
# Other templates: "plotly", "plotly_dark", "ggplot2", "seaborn", "simple_white"
fig_km.update_layout(
    autosize=False,
    width=700,
    height=500,
    template="plotly_white",
    title=dict(y=0.9, x=0.5, xanchor="center", yanchor="top"),
)

fig_km.show()

* Las salidas en bicicleta acumulan la mayor parte de los kilómetros recorridos

Horas dedicadas a cada tipo de actividad

In [12]:
# Define a time unit: "Year", "Quarter", "Month", "Week", or "Day"
time_unit_bar = "Month"

# One row per (period, activity type): activity count, total and mean moving hours
hours_col = "Moving Time (hr)"
df_hr = df.groupby(by=[time_unit_bar, "Activity Type"], as_index=False).agg(
    count=(hours_col, "count"),
    total_hr_spent=(hours_col, "sum"),
    avg_hr_spent=(hours_col, "mean"),
)

# Total hours over all activity types, shown in the plot title
total_hr = round(df_hr["total_hr_spent"].sum())

# Number format of every field listed in the hover box
hr_hover_formats = {
    "count": ":f",
    "total_hr_spent": ":.1f",
    "avg_hr_spent": ":.1f",
}

# Display names for the aggregated columns
hr_labels = {
    "total_hr_spent": "Total hrs spent",
    "count": "Number of activities",
    "avg_hr_spent": "Average hrs spent per activity",
}

# Stacked bar plot of the hours per period, one color per activity type
fig_hr = px.bar(
    df_hr,
    x=time_unit_bar,
    y="total_hr_spent",
    color="Activity Type",
    title=f"My {total_hr} hours on Strava!",
    hover_data=hr_hover_formats,
    labels=hr_labels,
    color_discrete_sequence=px.colors.qualitative.Bold,
)

# Upper bound on the number of ticks per axis
fig_hr.update_xaxes(nticks=20)
fig_hr.update_yaxes(nticks=15)

# Horizontal legend below the plot, without a legend title
legend_below = {
    "orientation": "h",
    "yanchor": "bottom",
    "y": -0.6,
    "xanchor": "right",
    "x": 1,
    "title": None,
}

# Fixed figure size, white template, horizontally centered title
# Other templates: "plotly", "plotly_dark", "ggplot2", "seaborn", "simple_white"
fig_hr.update_layout(
    autosize=False,
    width=700,
    height=500,
    template="plotly_white",
    legend=legend_below,
    title=dict(y=0.9, x=0.5, xanchor="center", yanchor="top"),
)

fig_hr.show()

* Se observa una evolución histórica desde más horas de bicicleta a más horas de correr
* La estacionalidad muestra que en los meses de verano hay más horas dedicadas a la bicicleta

## Análisis de las actividades de carrera

In [14]:
# Rank the activity types by total distance, ascending (largest total last).
# Only the distance column is summed, so the period strings are not aggregated.
most_kms = (
    df_km.groupby(by=["Activity Type"], as_index=False)["total_distance_km"]
    .sum()
    .sort_values(by="total_distance_km")
)

# Define an activity here.
# NOTE(review): index 1 is the type with the second-smallest total distance;
# presumably that is the running type for this dataset - confirm, or set the
# activity name explicitly.
activity = most_kms["Activity Type"].values[1]

# Generating four equal-width bins based on the Distance column.
# .copy() makes `speed` an independent frame so a column can be added safely.
speed = df.loc[df["Activity Type"] == activity].copy()
cats, bins = pd.cut(speed["Distance (km)"], 4, precision=0, retbins=True)
bins = np.around(bins, 0).astype(int)
bin_labels = [
    f"{bins[i]}-{bins[i + 1]}km {activity.lower()}s" for i in range(4)
]
# Reuse the binning computed above; only the category names change
speed.insert(0, "distance_bin", cats.cat.rename_categories(bin_labels))

# Create a scatter plot with four subplots (one per distance bin)
fig_s = px.scatter(
    speed,
    x="Activity Date",
    y="Average Speed (km/hr)",
    facet_col="distance_bin",
    color="Distance (km)",
    trendline="ols",  # Add a black trend line
    trendline_color_override="black",
    color_continuous_scale="thermal",  # Define a color scale
    title="My average speed on " + activity.lower() + "s",  # Set title text
    category_orders={"distance_bin": bin_labels},  # Ascending order
    custom_data=["Activity Name", "Distance (km)", "Elevation Gain"],  # Variables for the hover text
)

# Customize the hover text
fig_s.update_traces(
    hovertemplate="Activity Name: %{customdata[0]}<br>"
    "Activity date: %{x|%Y-%m-%d}<br>"
    "Distance (km): %{customdata[1]:.1f}<br>"
    "Average speed (km/hr): %{y:.1f}<br>"
    "Elevation gain: %{customdata[2]:.1f}"
)

# Adjust the size and layout
fig_s.update_layout(
    autosize=False,
    width=700,
    height=600,
    template="seaborn",  # Others options: "plotly", "plotly_dark", "ggplot2", "seaborn", "simple_white"
)

# Keep only the bin label in each subplot annotation ("distance_bin=..." -> "...")
fig_s.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

# Hide the x-axis titles and rotate the xtick labels on every subplot
fig_s.update_xaxes(title_text="", tickangle=-45)

# Set size of bin labels
fig_s.update_annotations(font=dict(size=10))

# Make the color bar smaller
fig_s.update_coloraxes(
    colorbar_thickness=15,
    colorbar_title_text="km",
    colorbar_title_font_size=12,
    colorbar_tickfont_size=10,
    colorbar_ticklen=3,
)

fig_s.show()

In [24]:
# Rank the activity types by total distance, ascending (largest total last).
# Only the distance column is summed, so the period strings are not aggregated.
most_kms = (
    df_km.groupby(by=["Activity Type"], as_index=False)["total_distance_km"]
    .sum()
    .sort_values(by="total_distance_km")
)

# Define an activity here.
# NOTE(review): index 1 is the type with the second-smallest total distance;
# presumably that is the running type for this dataset - confirm, or set the
# activity name explicitly.
activity = most_kms["Activity Type"].values[1]

# All activities of the selected type (no distance bins: this plot is not faceted)
speed = df.loc[df["Activity Type"] == activity]

# Scatter plot of average speed against distance, colored by elevation gain
fig_s = px.scatter(
    speed,
    x="Distance (km)",
    y="Average Speed (km/hr)",
    color="Elevation Gain",
    trendline="ols",  # Add a black trend line
    trendline_color_override="black",
    color_continuous_scale="thermal",  # Define a color scale
    title="My average speed on " + activity.lower() + "s",  # Set title text
    custom_data=["Elevation Gain", "Activity Date"],  # Variables for the hover text
)

# Customize the hover text
fig_s.update_traces(
    hovertemplate=
    "Activity Date: %{customdata[1]|%Y-%m-%d}<br>"
    "Distance (km): %{x:.1f}<br>"
    "Average speed (km/hr): %{y:.1f}<br>"
    "Elevation gain: %{customdata[0]:.1f}"
)

# Adjust the size and layout
fig_s.update_layout(
    autosize=False,
    width=700,
    height=600,
    template="seaborn",  # Others options: "plotly", "plotly_dark", "ggplot2", "seaborn", "simple_white"
)

# Rotate xtick labels
fig_s.update_xaxes(tickangle=-45)

# Make the color bar smaller
fig_s.update_coloraxes(
    colorbar_thickness=15,
    colorbar_title_text="Elevation gain",
    colorbar_title_font_size=12,
    colorbar_tickfont_size=10,
    colorbar_ticklen=3,
)

fig_s.show()

In [30]:
# Define a time unit: "Year", "Quarter", "Month", "Week", or "Day"
time_unit_run = "Week"

# Only activities after this date are included
run_since = "2022-09-01"

# Running activities in the selected date range
df_carrera = df[(df["Activity Type"] == "Carrera") & (df["Activity Date"] > run_since)]

# Group by time unit and activity type. The aggregated quantity is distance,
# so the columns are named in kilometers.
df_run_km = df_carrera.groupby(by=[time_unit_run, "Activity Type"], as_index=False).agg(
    count=("Distance (km)", "count"),
    total_distance_km=("Distance (km)", "sum"),
    avg_distance_km=("Distance (km)", "mean"),
)

# For the plot title
total_run_km = round(df_run_km["total_distance_km"].sum())

# Bar plot of the distance per period; the bar color encodes the average
# distance per activity (a numeric column, so a continuous color bar is used
# and no discrete color sequence or legend applies).
fig_run_km = px.bar(
    df_run_km,
    x=time_unit_run,
    y="total_distance_km",
    color="avg_distance_km",
    title=f"My {total_run_km} running kilometers on Strava since {run_since}!",  # Set title text
    hover_data={  # Define variables for hover text
        "count": ":f",
        "total_distance_km": ":.1f",
        "avg_distance_km": ":.1f",
    },
    labels=dict(  # Define labels for variables
        total_distance_km="Total kms covered",
        count="Number of activities",
        avg_distance_km="Average kms per activity",
    ),
)

# Set max allowed of ticks on x and y axes
fig_run_km.update_xaxes(nticks=20)
fig_run_km.update_yaxes(nticks=15)

# Adjust the size and layout
fig_run_km.update_layout(
    autosize=False,
    width=700,
    height=500,
    template="plotly_white",  # Others options: "plotly", "plotly_dark", "ggplot2", "seaborn", "simple_white"
    title={"y": 0.9, "x": 0.5, "xanchor": "center", "yanchor": "top"},  # Center title
)

fig_run_km.show()

## Análisis de las actividades de bicicleta

In [31]:
# Rank the activity types by total distance, ascending (largest total last).
# Only the distance column is summed, so the period strings are not aggregated.
most_kms = (
    df_km.groupby(by=["Activity Type"], as_index=False)["total_distance_km"]
    .sum()
    .sort_values(by="total_distance_km")
)

# Define an activity here: the last entry is the type with the most kms
activity = most_kms["Activity Type"].values[-1]

# Generating four equal-width bins based on the Distance column.
# .copy() makes `speed` an independent frame so a column can be added safely.
speed = df.loc[df["Activity Type"] == activity].copy()
cats, bins = pd.cut(speed["Distance (km)"], 4, precision=0, retbins=True)
bins = np.around(bins, 0).astype(int)
bin_labels = [
    f"{bins[i]}-{bins[i + 1]}km {activity.lower()}s" for i in range(4)
]
# Reuse the binning computed above; only the category names change
speed.insert(0, "distance_bin", cats.cat.rename_categories(bin_labels))

# Create a scatter plot with four subplots (one per distance bin)
fig_s = px.scatter(
    speed,
    x="Activity Date",
    y="Average Speed (km/hr)",
    facet_col="distance_bin",
    color="Distance (km)",
    trendline="ols",  # Add a black trend line
    trendline_color_override="black",
    color_continuous_scale="thermal",  # Define a color scale
    title="My average speed on " + activity.lower() + "s",  # Set title text
    category_orders={"distance_bin": bin_labels},  # Ascending order
    custom_data=["Activity Name", "Distance (km)", "Elevation Gain"],  # Variables for the hover text
)

# Customize the hover text
fig_s.update_traces(
    hovertemplate="Activity Name: %{customdata[0]}<br>"
    "Activity date: %{x|%Y-%m-%d}<br>"
    "Distance (km): %{customdata[1]:.1f}<br>"
    "Average speed (km/hr): %{y:.1f}<br>"
    "Elevation gain: %{customdata[2]:.1f}"
)

# Adjust the size and layout
fig_s.update_layout(
    autosize=False,
    width=700,
    height=600,
    template="seaborn",  # Others options: "plotly", "plotly_dark", "ggplot2", "seaborn", "simple_white"
)

# Keep only the bin label in each subplot annotation ("distance_bin=..." -> "...")
fig_s.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

# Hide the x-axis titles and rotate the xtick labels on every subplot
fig_s.update_xaxes(title_text="", tickangle=-45)

# Set size of bin labels
fig_s.update_annotations(font=dict(size=10))

# Make the color bar smaller
fig_s.update_coloraxes(
    colorbar_thickness=15,
    colorbar_title_text="km",
    colorbar_title_font_size=12,
    colorbar_tickfont_size=10,
    colorbar_ticklen=3,
)

fig_s.show()

In [35]:
# Rank the activity types by total distance, ascending (largest total last).
# Only the distance column is summed, so the period strings are not aggregated.
most_kms = (
    df_km.groupby(by=["Activity Type"], as_index=False)["total_distance_km"]
    .sum()
    .sort_values(by="total_distance_km")
)

# Define an activity here: the last entry is the type with the most kms
activity = most_kms["Activity Type"].values[-1]

# All activities of the selected type (no distance bins: this plot is not faceted)
speed = df.loc[df["Activity Type"] == activity]

# Scatter plot of average speed against elevation gain, colored by distance
fig_s = px.scatter(
    speed,
    x="Elevation Gain",
    y="Average Speed (km/hr)",
    color="Distance (km)",
    trendline="ols",  # Add a black trend line
    trendline_color_override="black",
    color_continuous_scale="thermal",  # Define a color scale
    title="My average speed on " + activity.lower() + "s",  # Set title text
    custom_data=["Elevation Gain", "Activity Date", "Distance (km)"],  # Variables for the hover text
)

# Customize the hover text.
# The x axis holds the elevation gain here, so the distance is read from
# customdata rather than from %{x}.
fig_s.update_traces(
    hovertemplate=
    "Activity Date: %{customdata[1]|%Y-%m-%d}<br>"
    "Distance (km): %{customdata[2]:.1f}<br>"
    "Average speed (km/hr): %{y:.1f}<br>"
    "Elevation gain: %{customdata[0]:.1f}"
)

# Adjust the size and layout
fig_s.update_layout(
    autosize=False,
    width=700,
    height=600,
    template="seaborn",  # Others options: "plotly", "plotly_dark", "ggplot2", "seaborn", "simple_white"
)

# Rotate xtick labels
fig_s.update_xaxes(tickangle=-45)

# Make the color bar smaller; the color encodes the distance in km
fig_s.update_coloraxes(
    colorbar_thickness=15,
    colorbar_title_text="km",
    colorbar_title_font_size=12,
    colorbar_tickfont_size=10,
    colorbar_ticklen=3,
)

fig_s.show()