**Data import**

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import datetime as dt
import glob
import os
import calendar
import datetime

%matplotlib inline

In [None]:
# Asking which calendar year of exported data should be processed.
# The answer is kept as text because it is interpolated into folder and file
# names; surrounding whitespace is removed and anything that is not a
# four-digit number is rejected here rather than failing later on a path
# that does not exist.
year = input('Enter a year:\n').strip()
if not (year.isdecimal() and len(year) == 4):
    raise ValueError(f"Expected a four-digit year, got {year!r}.")

In [None]:
# Building the glob pattern that matches every CSV file in the folder of the selected year
data_dir = f"//content/drive/MyDrive/Dane z aplikacji/Garmin Connect/Kroki/Data_{year}"
joined_files = os.path.join(data_dir, "*.csv")

In [None]:
# A list of all files matching the pattern. glob returns matches in arbitrary
# order, so the list is sorted to make the row order of the merged frame
# reproducible from run to run.
joined_list = sorted(glob.glob(joined_files))

# Failing early with a readable message instead of the generic
# "No objects to concatenate" raised by pd.concat on an empty list
if not joined_list:
    raise FileNotFoundError(f"No CSV files match {joined_files!r}.")

# First version of the dataframe: every file stacked under a fresh 0..n-1 index
steps = pd.concat(map(pd.read_csv, joined_list), ignore_index=True)

# Variables
# Month names in calendar order (not referenced by the cells visible here)
new_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

**Data check**

In [None]:
# Checking the dimensions of the dataframe: (number of rows, number of columns)
steps.shape

In [None]:
# Checking the data type of each column
steps.dtypes

In [None]:
# Looking at five randomly drawn rows (no seed is passed, so the rows differ between runs)
steps.sample(5)

In [None]:
# Previewing the first ten rows
steps.head(10)

In [None]:
# Dropping the right-most column and listing the columns that remain.
# Note: the cut is positional, so every run of this cell removes one more column.
column_count = steps.shape[1]
steps = steps.iloc[:, :column_count - 1]
steps.columns

In [None]:
# Deleting rows from previous year.
# The rows are selected with a boolean mask on the first (date) column, so
# exactly the rows whose date text ends with the previous year are removed,
# wherever they sit in the frame. The text is stripped first so trailing
# whitespace cannot hide a match.
previous_year = str(int(year) - 1)
date_text = steps.iloc[:, 0].astype(str).str.strip()
from_previous_year = date_text.str.endswith(previous_year)
n = int(from_previous_year.sum())
steps = steps[~from_previous_year]
print(f"Deleted {n} rows.")

**Data cleaning**

In [None]:
# Giving the date and step-count columns their English names
column_names = {'Unnamed: 0': 'Date', 'Bieżąca': 'Steps'}
steps = steps.rename(columns=column_names)
steps.head()

In [None]:
def _split_date(date_str, fallback_year):
    """Break one raw date string into its text components.

    Parameters
    ----------
    date_str : str
        Raw value of the Date column, expected as day/month/year.
    fallback_year : str
        Four-digit year used for values that contain no '/'.

    Returns
    -------
    tuple of str
        (day, month, year, iso_date): day padded to two digits, month as
        written in the input, year padded to four digits, and iso_date in
        YYYY-MM-DD form.

    Raises
    ------
    ValueError
        If the value contains '/' but does not split into exactly three parts.
    """
    if '/' in date_str:
        day, month, date_year = date_str.split('/')
        day, date_year = day.zfill(2), date_year.zfill(4)
        return day, month, date_year, f"{date_year}-{month.zfill(2)}-{day}"
    # No recognisable date: fall back to 1 January of the selected year so
    # that every row gets a well-defined New_date
    return '01', '01', fallback_year, f"{fallback_year}-01-01"


# Normalising the raw date text before it is split
steps['Date'] = steps['Date'].astype('str').str.strip()

# The components are unpacked inside the helper, so the notebook-level `year`
# entered by the user is never overwritten by a value read from the data
date_parts = pd.DataFrame(
    [_split_date(value, str(year).zfill(4)) for value in steps['Date']],
    columns=['Day', 'Month', 'Year', 'New_date'],
    index=steps.index,
)
for column in date_parts.columns:
    steps[column] = date_parts[column]

steps.head()

In [None]:
# Flagging dates that occur more than once; `check` holds every row involved
repeated_date = steps.duplicated(subset=['New_date'], keep=False)
check = steps.loc[repeated_date]

duplicates = not check.empty
print("There are duplicates." if duplicates else "There are no duplicates.")


In [None]:
# Resolving duplicated dates: for each date only the record with the highest
# step count is kept, then the check is repeated.
if duplicates:
    steps = (
        steps
        .drop_duplicates(keep='first')
        # Missing step counts are sorted first within a date, so keep='last'
        # below cannot prefer an empty record over a real one
        .sort_values(['New_date', 'Steps'], ascending=[True, True], na_position='first')
        .drop_duplicates(subset=['New_date'], keep='last')
        .sort_values(['New_date'], ascending=[True])
    )
    check = steps[steps.duplicated(['New_date'], keep=False)]
    if check.empty:
        duplicates = False
        print("There are no duplicates.")
    else:
        duplicates = True
        print("There are still duplicates.")

In [None]:
# Previewing the last rows of the frame
steps.tail()

In [None]:
# Remove rows that fall in the month (and year) in which the notebook is run.
# The Date column is converted from day/month/year text to datetime on the way.
current_date = pd.Timestamp.now()
steps['Date'] = pd.to_datetime(steps['Date'], format='%d/%m/%Y')

recorded = steps['Date'].dt
in_current_month = (recorded.month == current_date.month) & (recorded.year == current_date.year)
steps = steps.loc[~in_current_month]
steps.tail()

In [None]:
# Finding rows whose second column equals 0 ("null" in the message below
# means a zero value, not a missing one) and dropping them if there are any
is_zero = steps.iloc[:, 1] == 0
x = steps[is_zero]
print(f'There are {len(x)} null values')
if len(x) != 0:
    steps = steps.drop(x.index)
    display(steps)

In [None]:
# Ordering the rows by New_date (ascending) and previewing the last ten
steps = steps.sort_values("New_date", ascending=True)
steps.tail(10)

In [None]:
# Changing type of the New_date column from YYYY-MM-DD text to datetime.
# pd.to_datetime converts the whole column at once and, unlike strptime,
# also accepts values that are already datetimes, so re-running the cell
# does not fail.
steps['New_date'] = pd.to_datetime(steps['New_date'], format="%Y-%m-%d")
print("Type of New_date column has been changed.")

In [None]:
# Creating the Weekday column, replacing Month with the month name, and
# resetting the index (the previous index is kept as an 'index' column)
steps = steps.assign(
    Weekday=steps['New_date'].dt.day_name(),
    Month=steps['New_date'].dt.month_name(),
).reset_index()
steps.head()

In [None]:
def get_season(date):
    """Return the season name for a date.

    The boundaries are fixed calendar days:
    Winter 21 Dec - 20 Mar, Spring 21 Mar - 20 Jun,
    Summer 21 Jun - 22 Sep, Autumn 23 Sep - 20 Dec.

    Parameters
    ----------
    date : object with ``month`` and ``day`` attributes

    Returns
    -------
    str
        'Winter', 'Spring', 'Summer' or 'Autumn'; 'Unknown' when the month
        or day cannot be placed in any season (for example a missing date).
    """
    month, day = date.month, date.day
    if month not in range(1, 13):
        return 'Unknown'

    # (month, day) pairs compare in calendar order, so each season is a range
    position = (month, day)
    if (3, 21) <= position < (6, 21):
        return 'Spring'
    if (6, 21) <= position < (9, 23):
        return 'Summer'
    if (9, 23) <= position < (12, 21):
        return 'Autumn'
    # Winter wraps around the turn of the year
    if position >= (12, 21) or position < (3, 21):
        return 'Winter'
    return 'Unknown'

# Labelling every row with the season of its New_date
steps['Season'] = steps['New_date'].map(get_season)

In [None]:
# Previewing the last rows, now including the Season column
steps.tail()

In [None]:
# Previewing the first rows, now including the Season column
steps.head()

In [None]:
# Reordering the columns
df = steps[['New_date', 'Month', 'Weekday','Season','Steps']]
df.head()

In [None]:
# Checking if the length of dataframe is appropriate.
# A year that is already over must hold one row per calendar day (366 in a
# leap year); the year still in progress must hold one row per day up to the
# last recorded date.

cy = datetime.date.today().year
# Day-of-year number of the last row's date
cdy = (df['New_date'].iloc[-1]).timetuple().tm_yday

# The selected year, not a fixed one, decides which rule applies
if int(year) != cy:
    expected_days = 366 if calendar.isleap(int(year)) else 365
else:
    expected_days = cdy

if len(df) == expected_days:
    print ("Valid number of days.")
else:
    print ("Invalid number of days.")

In [None]:
# Writing the frame to a per-year CSV file in the Outputs folder
export_path = f'/content/drive/MyDrive/Skrypty/Tableau/Outputs/Steps_export_{year}.csv'
df.to_csv(export_path)
print("The file has been exported to the intended location.")

In [None]:
# Annual statistics (sum, mean, max, min) and the calculation of goal achievement
x = df["Steps"]

x1 = str(x.sum())
x2 = str(round(x.mean()))
x3 = str(x.max())
x4 = str(x.min())

# Days on which strictly more than 10 000 steps were recorded
goal = df[x > 10000]
res = len(goal.index)
# The share is taken over the days present in the data rather than a fixed
# 365, so leap years and a year still in progress are reported correctly
x5 = "{:.0%}".format(res / len(df))

print(
    f"In {year} I did:\n\n"
    f"{x1} - total steps,\n"
    f"{x2} - average,\n"
    f"{x3} - max in a day,\n"
    f"{x4} - min in a day.\n\n"
    f"Moreover I had {res} days when the goal of 10 000 steps per day was achieved "
    f"and it constitutes {x5}."
)