<a href="https://colab.research.google.com/github/slazur83/Tableau/blob/main/Steps_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Data import**

In [150]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import datetime as dt
import glob
import os
import calendar
import datetime

%matplotlib inline

In [151]:
# Build the glob pattern matching every CSV export for the selected year.
# `year` is kept as a string because it is interpolated into paths and matched
# against the date text further down the notebook.
year = '2023'
# Single leading slash: a path that starts with exactly two slashes is
# implementation-defined on POSIX, and the export path later in the notebook uses
# the plain "/content/..." form.
joined_files = os.path.join(f"/content/drive/MyDrive/Dane z aplikacji/Garmin Connect/Kroki/Data_{year}", "*.csv")

In [152]:
# A list of all files matching the pattern. glob returns paths in arbitrary order,
# so they are sorted to make the concatenated row order reproducible between runs.
joined_list = sorted(glob.glob(joined_files))
if not joined_list:
  # Fail with the offending pattern instead of pandas' generic
  # "No objects to concatenate" error.
  raise FileNotFoundError(f"No CSV files match {joined_files}")

# First version of the dataframe: every file read and stacked under a fresh 0..n-1 index
steps = pd.concat(map(pd.read_csv, joined_list), ignore_index=True)

**Data check**

In [153]:
# Checking the dimensions of the dataframe as a (rows, columns) tuple
steps.shape

(371, 3)

In [154]:
# Checking the data type pandas inferred for each column
steps.dtypes

Unnamed: 0    object
Bieżąca        int64
Cel            int64
dtype: object

In [155]:
# Five randomly chosen rows from the dataframe (no seed is set, so they differ per run)
steps.sample(5)

Unnamed: 0.1,Unnamed: 0,Bieżąca,Cel
292,27/07/2023,12173,10000
111,17/02/2023,5608,10000
145,16/03/2023,15229,10000
231,29/07/2023,12043,10000
267,09/07/2023,4893,10000


**Data cleaning**

In [156]:
# Give the date column and the step-count column English names
column_names = {'Unnamed: 0': 'Date', 'Bieżąca': 'Steps'}
steps = steps.rename(columns=column_names)
steps.head()

Unnamed: 0,Date,Steps,Cel
0,18/11/2023,7035,10000
1,19/11/2023,4265,10000
2,20/11/2023,7556,10000
3,21/11/2023,6322,10000
4,22/11/2023,8945,10000


In [157]:
# Normalize the date text, then keep only the rows whose date mentions the selected year
steps['Date'] = steps['Date'].astype('str').str.strip()

# Match against the `year` setting (a plain substring, not a regex) rather than a
# hard-coded literal, so changing `year` in the configuration cell is enough.
in_selected_year = steps['Date'].str.contains(year, regex=False)

# .copy() detaches the result from the original frame so that the column
# assignments in later cells do not raise SettingWithCopyWarning.
steps = steps[in_selected_year].copy()
print(steps.head())

         Date  Steps    Cel
0  18/11/2023   7035  10000
1  19/11/2023   4265  10000
2  20/11/2023   7556  10000
3  21/11/2023   6322  10000
4  22/11/2023   8945  10000


In [158]:
# Parse the day/month/year text once and derive every date column from that single result
parsed_dates = pd.to_datetime(steps['Date'], format='%d/%m/%Y')

# New_date stays an ISO-formatted string here (it sorts chronologically as text);
# a later cell converts it to a real datetime.
# assign() returns a new frame, so nothing is written into a filtered slice.
steps = steps.assign(
    New_date=parsed_dates.dt.strftime('%Y-%m-%d'),
    Year=parsed_dates.dt.year.astype(str),
    Month=parsed_dates.dt.month.astype(str).str.zfill(2),
    Day=parsed_dates.dt.day.astype(str).str.zfill(2),
)

# The goal column and the original date text are no longer needed;
# errors='ignore' tolerates either column being absent.
steps = steps.drop(['Cel', 'Date'], axis=1, errors='ignore')

# Move New_date to the front and order the rows chronologically
cols = ['New_date'] + [col for col in steps if col != 'New_date']
steps = steps[cols]
steps = steps.sort_values(by='New_date')
print(steps.head())

      New_date  Steps  Year Month Day
15  2023-01-01  16846  2023    01  01
16  2023-01-02   3005  2023    01  02
17  2023-01-03  10729  2023    01  03
18  2023-01-04   3633  2023    01  04
19  2023-01-05   5910  2023    01  05


In [159]:
# Collect every row whose date occurs more than once (all occurrences are kept in `check`)
check = steps[steps.duplicated(subset=['New_date'], keep=False)]

# `duplicates` records the outcome for the cleanup cell further down
duplicates = not check.empty
print("There are duplicates." if duplicates else "There are no duplicates.")


There are no duplicates.


In [122]:
  # Show the rows flagged as sharing a date with another row (an empty frame when there are none)
  display(check)

Unnamed: 0,New_date,Date,Steps,Cel,Year,Month,Day,FormattedDate


In [123]:
if duplicates:
  # Remove exact duplicate rows first; then, for dates that still repeat, order by
  # step count and keep the last row, i.e. the one with the highest step count.
  steps = (
      steps.drop_duplicates(keep='first')
      .sort_values(['New_date', 'Steps'], ascending=[True, True])
      .drop_duplicates(subset=['New_date'], keep='last')
      .sort_values(['New_date'], ascending=[True])
  )

  # Re-run the duplicate check to confirm the cleanup worked
  check = steps[steps.duplicated(['New_date'], keep=False)]
  duplicates = not check.empty
  print("There are still duplicates." if duplicates else "There are no duplicates.")

In [160]:
# Rows with a step count of 0 are reported as "null values" and removed.
# The column is addressed by name rather than by position, so the check keeps
# working if the column order changes.
zero_step_days = steps['Steps'] == 0
print(f'There are {zero_step_days.sum()} null values')
if zero_step_days.any():
  steps = steps.drop(index=steps.index[zero_step_days])
  display(steps)

There are 0 null values


In [161]:
# Changing the type of the New_date column from ISO-formatted text to datetime.
# pd.to_datetime converts the whole column at once and, unlike a per-row strptime,
# also accepts a column that is already datetime, so re-running this cell does not fail.
steps['New_date'] = pd.to_datetime(steps['New_date'], format="%Y-%m-%d")
print("Type of New_data column has been changed.")

Type of New_data column has been changed.


In [162]:
# Add the weekday name, replace the zero-padded month number with the month name,
# and reset the index (the previous index is kept as an 'index' column)
steps = steps.assign(
    Weekday=steps['New_date'].dt.day_name(),
    Month=steps['New_date'].dt.month_name(),
).reset_index()
steps.head()

Unnamed: 0,index,New_date,Steps,Year,Month,Day,Weekday
0,15,2023-01-01,16846,2023,January,1,Sunday
1,16,2023-01-02,3005,2023,January,2,Monday
2,17,2023-01-03,10729,2023,January,3,Tuesday
3,18,2023-01-04,3633,2023,January,4,Wednesday
4,19,2023-01-05,5910,2023,January,5,Thursday


In [163]:
# Assigning seasons to dates.
# Each entry is (season, first day, last day), both ends inclusive. The ranges are
# built from `year` so they follow the configuration cell, and they do not overlap:
# 21 December belongs to Autumn only and Winter resumes on 22 December, which is
# the labelling the previous overlapping masks produced through assignment order.
season_ranges = [
    ('Winter', f'{year}-01-01', f'{year}-03-20'),
    ('Spring', f'{year}-03-21', f'{year}-06-21'),
    ('Summer', f'{year}-06-22', f'{year}-09-22'),
    ('Autumn', f'{year}-09-23', f'{year}-12-21'),
    ('Winter', f'{year}-12-22', f'{year}-12-31'),
]

for season, first_day, last_day in season_ranges:
  in_season = (steps['New_date'] >= first_day) & (steps['New_date'] <= last_day)
  steps.loc[in_season, 'Season'] = season

In [164]:
# First rows after the Season column has been added
steps.head()

Unnamed: 0,index,New_date,Steps,Year,Month,Day,Weekday,Season
0,15,2023-01-01,16846,2023,January,1,Sunday,Winter
1,16,2023-01-02,3005,2023,January,2,Monday,Winter
2,17,2023-01-03,10729,2023,January,3,Tuesday,Winter
3,18,2023-01-04,3633,2023,January,4,Wednesday,Winter
4,19,2023-01-05,5910,2023,January,5,Thursday,Winter


In [165]:
# Last ten rows, to inspect the season labels at the end of the data
steps.tail(10)

Unnamed: 0,index,New_date,Steps,Year,Month,Day,Weekday,Season
355,41,2023-12-22,7268,2023,December,22,Friday,Winter
356,42,2023-12-23,18898,2023,December,23,Saturday,Winter
357,43,2023-12-24,5253,2023,December,24,Sunday,Winter
358,44,2023-12-25,12551,2023,December,25,Monday,Winter
359,45,2023-12-26,6963,2023,December,26,Tuesday,Winter
360,46,2023-12-27,6157,2023,December,27,Wednesday,Winter
361,47,2023-12-28,6811,2023,December,28,Thursday,Winter
362,48,2023-12-29,5228,2023,December,29,Friday,Winter
363,21,2023-12-30,13593,2023,December,30,Saturday,Winter
364,22,2023-12-31,7339,2023,December,31,Sunday,Winter


In [166]:
# Keep only the columns needed downstream, in their final order
export_columns = ['New_date', 'Month', 'Weekday', 'Season', 'Steps']
df = steps[export_columns]
df.head()

Unnamed: 0,New_date,Month,Weekday,Season,Steps
0,2023-01-01,January,Sunday,Winter,16846
1,2023-01-02,January,Monday,Winter,3005
2,2023-01-03,January,Tuesday,Winter,10729
3,2023-01-04,January,Wednesday,Winter,3633
4,2023-01-05,January,Thursday,Winter,5910


In [167]:
# Checking if the length of dataframe is appropriate.
# When the analysed year is not the current calendar year, the frame must hold one
# row per calendar day (366 in a leap year). When it is the current year, the
# expected count is the day-of-year of the last row.

cy = datetime.date.today().year
cdy = (df['New_date'].iloc[-1]).timetuple().tm_yday

if cy != int(year):
  # Follow the `year` setting and account for leap years instead of assuming 2023 / 365
  expected_days = 366 if calendar.isleap(int(year)) else 365
else:
  expected_days = cdy

if len(df) == expected_days:
  print ("Valid number of days.")
else:
  print ("Invalid number of days.")

Valid number of days.


In [168]:
# Write the cleaned data to a CSV file named after the selected year (the row index is written too)
output_path = f'/content/drive/MyDrive/Colab Notebooks/Tableau/Outputs/Steps_{year}.csv'
df.to_csv(output_path)
print("The file has been exported to the intended location.")

The file has been exported to the intended location.


In [169]:
# Annual statistics (sum, mean, max, min) and the calculation of goal achievement
DAILY_GOAL = 10000  # daily step target used for the goal statistics
daily_steps = df["Steps"]

# A day counts as achieved when the goal is reached, so exactly DAILY_GOAL steps qualifies (>=).
goal_days = int((daily_steps >= DAILY_GOAL).sum())
# Share of the days actually present in the data, so partial and leap years are
# handled without a hard-coded 365.
goal_share = "{:.0%}".format(goal_days / len(daily_steps))
# Render the goal with a space as thousands separator, e.g. "10 000"
goal_label = f"{DAILY_GOAL:,}".replace(",", " ")

print(
    f"In {year} I did:\n\n"
    f"{daily_steps.sum()} - total steps,\n"
    f"{round(daily_steps.mean())} - average,\n"
    f"{daily_steps.max()} - max in a day,\n"
    f"{daily_steps.min()} - min in a day.\n\n"
    f"Moreover I had {goal_days} days when the goal of {goal_label} steps per day "
    f"was achieved and it constitutes {goal_share}.")

In 2023 I did:

3769356 - total steps,
10327 - average,
28336 - max in a day,
2196 - min in a day.

Moreover I had 179 days when the goal of 10 000 steps per day was achieved and it constitutes 49%.
