**Data import**

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import datetime as dt
import glob
import os
import calendar
import datetime

%matplotlib inline

In [None]:
# Calendar year of the export being processed. Kept as text because it is
# interpolated into file paths and concatenated into date strings further down.
year = "2023"

In [None]:
# Glob pattern that matches every CSV export in the folder of the selected year.
# The path starts with a single slash: a path beginning with exactly two slashes
# may be resolved in an implementation-defined way, and the export path used
# later in this notebook also starts with one slash.
joined_files = os.path.join(f"/content/drive/MyDrive/Dane z aplikacji/Garmin Connect/Kroki/Data_{year}", "*.csv")

In [None]:
# A list of all files matching the pattern, in a deterministic (sorted) order.
# glob.glob returns paths in arbitrary order, and a later cell trims rows by
# their position, so the row order must not depend on the file system.
joined_list = sorted(glob.glob(joined_files))

# Fail early with a readable message instead of pd.concat's
# "No objects to concatenate" when the folder is empty or the path is wrong.
if not joined_list:
  raise FileNotFoundError(f"No CSV files match {joined_files!r}")

# First version of the dataframe: all files stacked under a fresh 0..N-1 index
steps = pd.concat(map(pd.read_csv, joined_list), ignore_index=True)

# Variables
# English month names in calendar order
new_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

**Data check**

In [None]:
# Checking the dimensions of the dataframe: (number of rows, number of columns)
steps.shape

In [None]:
# Checking the data type pandas assigned to each column
steps.dtypes

In [None]:
# Samples from dataframe.
# A fixed random_state makes the displayed rows identical on every
# "Restart & Run All"; the size is capped so frames with fewer than five rows
# do not raise.
steps.sample(n=min(5, len(steps)), random_state=0)

In [None]:
# First ten rows of the raw, concatenated data
steps.head(n=10)

In [None]:
# Deleting rows from previous year.
# Only the *leading* run of December rows ('gru') is treated as belonging to the
# previous year. Counting every December row and cutting that many rows from the
# top would also remove January rows once the current year's own December is
# present, and would remove more rows each time the cell is re-run.
# NOTE(review): assumes the first column holds the "<day> <month abbreviation>"
# text and that the rows start at the beginning of the export - confirm.
first_column = steps.iloc[:, 0].astype(str).str.strip()
is_december = first_column.str.endswith('gru').to_numpy()

if is_december.all():
  # No non-December row to mark where the previous year ends (or the frame is
  # empty): there is nothing that can safely be attributed to the previous year.
  n = 0
else:
  # Position of the first non-December row == length of the leading December run
  n = int(is_december.argmin())

steps = steps.iloc[n:]
print(f"Deleted {n} rows.")

**Data cleaning**

In [None]:
# Keep every row but leave out the right-most column, then show what remains
last_position = steps.shape[1] - 1
steps = steps.iloc[:, :last_position]
steps.columns

In [None]:
# Give the two columns English names
column_names = {
    'Unnamed: 0': 'Date',
    'Bieżąca': 'Steps',
}
steps = steps.rename(columns=column_names)
steps.head()

In [None]:
# Normalise the Date column to stripped text, then derive three helper columns:
#   Day   - the part before the first space, left-padded with zeros to two characters
#   Month - everything after the first space
#   Year  - the configured year, identical for every row

date_text = steps['Date'].astype('str').str.strip()
steps['Date'] = date_text

day_and_month = date_text.str.split(" ", n=1, expand=True)
steps["Day"] = day_and_month[0].str.zfill(2)
steps["Month"] = day_and_month[1]
steps["Year"] = year

steps.head()

In [None]:
# Translate the Polish month abbreviations into two-digit month numbers;
# values that are not listed here are left untouched.
month_numbers = {
    'sty': '01',
    'lut': '02',
    'mar': '03',
    'kwi': '04',
    'maj': '05',
    'cze': '06',
    'lip': '07',
    'sie': '08',
    'wrz': '09',
    'paź': '10',
    'lis': '11',
    'gru': '12',
}
steps['Month'] = steps['Month'].replace(month_numbers)
steps.head()

In [None]:
# Build the New_date column as "<Year>-<Month>-<Day>" text from the three helper columns
steps = steps.assign(
    New_date=lambda frame: frame['Year'] + '-' + frame['Month'] + '-' + frame['Day']
)
steps.head()

In [None]:
# Collect every row whose New_date occurs more than once (all occurrences are kept)
is_repeated = steps.duplicated(['New_date'], keep=False)
check = steps[is_repeated]

# Flag read by the de-duplication cell further down
duplicates = not check.empty
if duplicates:
  print("There are duplicates.")
else:
  print("There are no duplicates.")


In [None]:
  # Show the rows that share a New_date with another row (an empty frame when there are none).
  # NOTE(review): this top-level statement is indented; it only runs if the notebook
  # front end strips the cell's leading whitespace - consider dedenting it.
  display(check)

In [None]:
# Resolve duplicated dates, if the earlier check found any:
#   1. drop rows that are identical in every column,
#   2. order by date and step count so the highest count of each date comes last,
#   3. keep that last row per date,
#   4. restore plain date order,
# then repeat the check and report the outcome.
if duplicates:
  steps = (
      steps
      .drop_duplicates(keep='first')
      .sort_values(['New_date', 'Steps'], ascending=[True, True])
      .drop_duplicates(subset=['New_date'], keep='last')
      .sort_values(['New_date'], ascending=[True])
  )
  still_repeated = steps.duplicated(['New_date'], keep=False)
  check = steps[still_repeated]
  duplicates = not check.empty
  if duplicates:
    print("There are still duplicates.")
  else:
    print("There are no duplicates.")

In [None]:
# Last five rows after the duplicate handling
steps.tail(n=5)

In [None]:
# Drop the days on which the recorded step count is 0.
# The Steps column is addressed by name: a positional lookup (second column)
# points at a different column as soon as the column layout changes, e.g. after
# the index has been inserted as a column by a later cell and this cell is re-run.
zero_rows = steps[steps['Steps'] == 0]
print(f'There are {len(zero_rows)} null values')
if not zero_rows.empty:
  # Reuse the index labels found above instead of filtering a second time
  steps = steps.drop(index=zero_rows.index)
  display(steps)

In [None]:
# Order the rows by the New_date column (ascending) and inspect the last twenty
steps = steps.sort_values("New_date", ascending=True)
steps.tail(n=20)

In [None]:
# Changing the type of the New_date column from "YYYY-MM-DD" text to datetime.
# pd.to_datetime is vectorised and, unlike a strptime loop, does not fail when
# the cell is re-run on a column that has already been converted.
parsed_dates = pd.to_datetime(steps['New_date'], format="%Y-%m-%d")

# Missing values would otherwise turn into NaT silently; keep the conversion strict.
if parsed_dates.isna().any():
  raise ValueError("New_date contains missing values that cannot be converted to dates.")

steps['New_date'] = parsed_dates
print("Type of New_date column has been changed.")

In [None]:
# Creating new columns: Weekday and Month (names derived from New_date; Month
# replaces the two-digit month numbers) and resetting the index.
steps['Weekday'] = steps['New_date'].dt.day_name()
steps['Month'] = steps['New_date'].dt.month_name()

# drop=True discards the old index instead of inserting it as an extra "index"
# column: that column shifted every positional column lookup and made the cell
# fail on repeated runs because the inserted column name already existed.
steps = steps.reset_index(drop=True)
steps.head()

In [None]:
# Assigning seasons to dates.
# The boundaries are built from the configured `year` instead of a hard-coded
# 2023, so data of another year is no longer compared against 2023 dates.
# NOTE(review): the day/month cut-offs are the fixed ones used for 2023; confirm
# they are acceptable for other years.
dates = steps['New_date']

winter_1 = (dates >= f'{year}-01-01') & (dates <= f'{year}-03-20')
spring = (dates >= f'{year}-03-21') & (dates <= f'{year}-06-21')
summer = (dates >= f'{year}-06-22') & (dates <= f'{year}-09-22')
autumn = (dates >= f'{year}-09-23') & (dates <= f'{year}-12-21')
# Winter resumes on 22 December. A mask starting on 21 December overlaps with
# autumn; 21 December stays Autumn, which is what the overlap resolved to before.
winter_2 = (dates >= f'{year}-12-22')

# The masks are mutually exclusive, so the assignment order does not matter
season_masks = (
    (winter_1, 'Winter'),
    (winter_2, 'Winter'),
    (spring, 'Spring'),
    (summer, 'Summer'),
    (autumn, 'Autumn'),
)
for season_mask, season_name in season_masks:
  steps.loc[season_mask, 'Season'] = season_name

In [None]:
# First five rows, now including the Season column
steps.head(n=5)

In [None]:
# Last ten rows, to inspect the season labels at the end of the data
steps.tail(n=10)

In [None]:
# Final frame: only the columns listed here, in this order
column_order = ['New_date', 'Month', 'Weekday', 'Season', 'Steps']
df = steps[column_order]
df.head()

In [None]:
# Checking if the length of dataframe is appropriate.
# The expectation follows the configured `year` (not a hard-coded 2023) and
# accounts for leap years.
# Note: rows with 0 steps were removed earlier, so such days count as missing here.

data_year = int(year)
current_year = datetime.date.today().year

if current_year != data_year:
  # Not the running year: one row is expected for every calendar day
  expected_days = 366 if calendar.isleap(data_year) else 365
else:
  # Running year: one row per day up to the last date present in the data
  expected_days = df['New_date'].iloc[-1].timetuple().tm_yday

if len(df) == expected_days:
  print ("Valid number of days.")
else:
  print ("Invalid number of days.")

In [None]:
# Write the final frame to a per-year CSV file and confirm on screen
output_path = f'/content/drive/MyDrive/Colab Notebooks/Tableau/Outputs/Steps_{year}.csv'
df.to_csv(output_path)
print("The file has been exported to the intended location.")

In [None]:
# Annual statistics (sum, mean, max, min) and the calculation of goal achievement
x = df["Steps"]

# Text versions of the figures, ready to be joined into the report below
x1 = str(x.sum())
x2 = str(round(x.mean()))
x3 = str(x.max())
x4 = str(x.min())

# Days on which the goal was met; reaching exactly 10 000 steps counts as
# achieved (a strict ">" left those days out).
goal = df[df['Steps'] >= 10000]
res = len(goal.index)

# Share of the whole calendar year, using 366 days for leap years
days_in_year = 366 if calendar.isleap(int(year)) else 365
x5 = "{:.0%}".format(res / days_in_year)

# The report names the configured year instead of a fixed "2023"
print(
f"In {year} I did:\n\n" + x1 +
" - total steps,\n" + x2 +
" - average,\n" + x3 +
" - max in a day,\n" + x4 +
" - min in a day.\n\nMoreover I had " +
str(res) + ' days when the goal of 10 000 steps per day was achieved and it constitutes ' +
x5 +'.')