**Data import**

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl 
import datetime as dt
import glob
import os
import calendar
import datetime

%matplotlib inline

In [None]:
# Calendar year analysed by this notebook, stored as text
year = "2023"

In [None]:
# Glob pattern that matches every CSV file in the data folder
data_dir = "/content/drive/MyDrive/Colab Notebooks/EDA_with_sports_data/Data_2023"
joined_files = os.path.join(data_dir, "*.csv")

In [None]:
# A list of all files matching the pattern.
# glob returns matches in arbitrary, filesystem-dependent order, so the list is
# sorted to make the row order of the merged dataframe reproducible between
# runs (later cells slice the dataframe by position).
joined_list = sorted(glob.glob(joined_files))

# First version of the dataframe: all files stacked under a fresh 0..n-1 index
steps = pd.concat(map(pd.read_csv, joined_list), ignore_index=True)

# Variables
# English month names in calendar order
new_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

**Data check**

In [None]:
# Checking the dimensions of the dataframe as a (rows, columns) tuple
steps.shape

(70, 3)

In [None]:
# Checking the data type pandas assigned to each column
steps.dtypes

Unnamed: 0     object
Bieżąca         int64
Cel           float64
dtype: object

In [None]:
# Five randomly chosen rows of the dataframe
steps.sample(n=5)

Unnamed: 0.1,Unnamed: 0,Bieżąca,Cel
59,15 lut,19618,10000.0
11,6 sty,8072,10000.0
63,19 lut,12749,10000.0
8,3 sty,10729,10000.0
7,2 sty,3005,10000.0


In [None]:
# First ten rows of the dataframe
steps.head(n=10)

Unnamed: 0.1,Unnamed: 0,Bieżąca,Cel
0,26 gru,4979,10000.0
1,27 gru,19797,10000.0
2,28 gru,4431,10000.0
3,29 gru,3885,10000.0
4,30 gru,3709,10000.0
5,31 gru,14978,10000.0
6,1 sty,16846,10000.0
7,2 sty,3005,10000.0
8,3 sty,10729,10000.0
9,4 sty,3633,10000.0


In [None]:
# Deleting rows from previous year
# The data may begin with the last days of the previous December ('gru').
# Only that leading run of December rows is removed: counting every 'gru' row
# would also count the current year's December and cut away January rows.
# Missing labels are treated as "not December" (na=False).
# NOTE(review): assumes the previous-year rows sit at the very top of the
# dataframe - confirm against the order in which the files are loaded.
is_december = steps.iloc[:, 0].str.strip().str.endswith('gru', na=False)
# cummin keeps 1 only until the first non-December row is reached
n = int(is_december.astype(int).cummin().sum())
steps = steps.iloc[n:]
print(f"Deleted {n} rows.")

Deleted 6 rows.


**Data cleaning**

In [None]:
# Deleting the last column of the raw data ('Cel', presumably the daily goal).
# The column is dropped by name rather than by position so the cell is safe to
# re-run: a positional `iloc[:, :-1]` strips one more column on every execution.
# errors='ignore' makes a second run a no-op instead of raising KeyError.
steps = steps.drop(columns=['Cel'], errors='ignore')
steps.columns

Index(['Unnamed: 0', 'Bieżąca'], dtype='object')

In [None]:
# Giving the raw columns English names
english_names = {'Unnamed: 0': 'Date', 'Bieżąca': 'Steps'}
steps = steps.rename(columns=english_names)
steps.head()

Unnamed: 0,Date,Steps
6,1 sty,16846
7,2 sty,3005
8,3 sty,10729
9,4 sty,3633
10,5 sty,5910


In [None]:
# Normalising the Date column to stripped text, then splitting it at the first
# space into a day part and a month part. The day is left-padded with zeros to
# two characters and the Year column is filled from `year`.

date_text = steps['Date'].astype('str').str.strip()
steps['Date'] = date_text

day_and_month = date_text.str.split(" ", n=1, expand=True)
steps["Day"] = day_and_month[0].str.zfill(2)
steps["Month"] = day_and_month[1]
steps["Year"] = year

steps.head()

Unnamed: 0,Date,Steps,Day,Month,Year
6,1 sty,16846,1,sty,2023
7,2 sty,3005,2,sty,2023
8,3 sty,10729,3,sty,2023
9,4 sty,3633,4,sty,2023
10,5 sty,5910,5,sty,2023


In [None]:
# Replacing Polish month abbreviations with two-digit month numbers
polish_months = ['sty', 'lut', 'mar', 'kwi', 'maj', 'cze', 'lip', 'sie', 'wrz', 'paź', 'lis', 'gru']
month_numbers = {abbr: f"{number:02d}" for number, abbr in enumerate(polish_months, start=1)}
steps['Month'] = steps['Month'].replace(month_numbers)
steps.head()

Unnamed: 0,Date,Steps,Day,Month,Year
6,1 sty,16846,1,1,2023
7,2 sty,3005,2,1,2023
8,3 sty,10729,3,1,2023
9,4 sty,3633,4,1,2023
10,5 sty,5910,5,1,2023


In [None]:
# Building the New_date text column as Year-Month-Day from the three part columns
steps = steps.assign(
    New_date=lambda frame: frame['Year'] + '-' + frame['Month'] + '-' + frame['Day']
)
steps.head()

Unnamed: 0,Date,Steps,Day,Month,Year,New_date
6,1 sty,16846,1,1,2023,2023-01-01
7,2 sty,3005,2,1,2023,2023-01-02
8,3 sty,10729,3,1,2023,2023-01-03
9,4 sty,3633,4,1,2023,2023-01-04
10,5 sty,5910,5,1,2023,2023-01-05


In [None]:
# Collecting every row whose New_date occurs more than once
# (keep=False marks all occurrences, not only the repeats)
check = steps.loc[steps['New_date'].duplicated(keep=False)]

# Flag used later to decide whether de-duplication is needed
duplicates = not check.empty
print("There are duplicates." if duplicates else "There are no duplicates.")


There are duplicates.


In [None]:
  # Show the rows collected in `check`
  display(check)

Unnamed: 0,Date,Steps,Day,Month,Year,New_date
48,12 lut,25247,12,2,2023,2023-02-12
49,13 lut,24578,13,2,2023,2023-02-13
50,14 lut,13204,14,2,2023,2023-02-14
51,15 lut,19618,15,2,2023,2023-02-15
52,16 lut,11782,16,2,2023,2023-02-16
53,17 lut,4497,17,2,2023,2023-02-17
54,18 lut,0,18,2,2023,2023-02-18
55,19 lut,0,19,2,2023,2023-02-19
56,12 lut,25247,12,2,2023,2023-02-12
57,13 lut,24578,13,2,2023,2023-02-13


In [None]:
if duplicates:
  # 1) drop rows that are identical in every column,
  # 2) order by date and step count so the highest count of a date comes last,
  # 3) keep only that last (highest) row per date,
  # 4) restore chronological order.
  steps = (
      steps
      .drop_duplicates(keep='first')
      .sort_values(['New_date', 'Steps'], ascending=[True, True])
      .drop_duplicates(subset=['New_date'], keep='last')
      .sort_values(['New_date'], ascending=[True])
  )

  # Re-run the duplicate check on the cleaned dataframe
  check = steps.loc[steps.duplicated(['New_date'], keep=False)]
  duplicates = not check.empty
  print("There are still duplicates." if duplicates else "There are no duplicates.")

There are no duplicates.


In [None]:
# Inspect the last rows of the dataframe
steps.tail()

Unnamed: 0,Date,Steps,Day,Month,Year,New_date
65,21 lut,10818,21,2,2023,2023-02-21
66,22 lut,8937,22,2,2023,2023-02-22
67,23 lut,10588,23,2,2023,2023-02-23
68,24 lut,0,24,2,2023,2023-02-24
69,25 lut,0,25,2,2023,2023-02-25


In [None]:
# Rows whose second column (the step count) equals 0 are reported and removed
zero_mask = steps.iloc[:, 1] == 0
x = steps[zero_mask]
print(f'There are {len(x)} null values')
if not x.empty:
  steps = steps.drop(x.index)
  display(steps)

There are 2 null values


Unnamed: 0,Date,Steps,Day,Month,Year,New_date
6,1 sty,16846,1,1,2023,2023-01-01
7,2 sty,3005,2,1,2023,2023-01-02
8,3 sty,10729,3,1,2023,2023-01-03
9,4 sty,3633,4,1,2023,2023-01-04
10,5 sty,5910,5,1,2023,2023-01-05
11,6 sty,8072,6,1,2023,2023-01-06
12,7 sty,5314,7,1,2023,2023-01-07
13,8 sty,14479,8,1,2023,2023-01-08
14,9 sty,5067,9,1,2023,2023-01-09
15,10 sty,7507,10,1,2023,2023-01-10


In [None]:
# Ordering the rows chronologically by New_date and showing the last twenty
steps = steps.sort_values("New_date")
steps.tail(n=20)

Unnamed: 0,Date,Steps,Day,Month,Year,New_date
40,4 lut,10564,4,2,2023,2023-02-04
41,5 lut,7273,5,2,2023,2023-02-05
42,6 lut,10873,6,2,2023,2023-02-06
43,7 lut,10655,7,2,2023,2023-02-07
44,8 lut,3469,8,2,2023,2023-02-08
45,9 lut,7069,9,2,2023,2023-02-09
46,10 lut,8331,10,2,2023,2023-02-10
47,11 lut,25457,11,2,2023,2023-02-11
48,12 lut,25247,12,2,2023,2023-02-12
49,13 lut,24578,13,2,2023,2023-02-13


In [None]:
# Changing the type of the New_date column from text to datetime.
# pd.to_datetime parses the whole column in one vectorised call and, unlike a
# per-element strptime loop, does not fail when this cell is re-run on a column
# that has already been converted.
steps['New_date'] = pd.to_datetime(steps['New_date'], format="%Y-%m-%d")
print("Type of New_data column has been changed.")

Type of New_data column has been changed.


In [None]:
# Deriving the Weekday and Month name columns from New_date, then resetting
# the index (the old index is kept as a regular 'index' column)
date_accessor = steps['New_date'].dt
steps['Weekday'] = date_accessor.day_name()
steps['Month'] = date_accessor.month_name()
steps = steps.reset_index()
steps.head()

Unnamed: 0,index,Date,Steps,Day,Month,Year,New_date,Weekday
0,6,1 sty,16846,1,January,2023,2023-01-01,Sunday
1,7,2 sty,3005,2,January,2023,2023-01-02,Monday
2,8,3 sty,10729,3,January,2023,2023-01-03,Tuesday
3,9,4 sty,3633,4,January,2023,2023-01-04,Wednesday
4,10,5 sty,5910,5,January,2023,2023-01-05,Thursday


In [None]:
# Assigning seasons to dates
# The boundaries are built from `year` instead of a hard-coded year, and the
# ranges are disjoint: winter restarts on 22 December, the day after autumn
# ends, so no date matches two seasons and the result no longer depends on the
# order of the assignments below.
dates = steps['New_date']
winter_1 = (dates >= f'{year}-01-01') & (dates <= f'{year}-03-20')
spring = (dates >= f'{year}-03-21') & (dates <= f'{year}-06-21')
summer = (dates >= f'{year}-06-22') & (dates <= f'{year}-09-22')
autumn = (dates >= f'{year}-09-23') & (dates <= f'{year}-12-21')
winter_2 = (dates >= f'{year}-12-22')

# Rows outside every range keep a missing Season value
steps.loc[winter_1 | winter_2,'Season'] ='Winter'
steps.loc[spring,'Season'] ='Spring'
steps.loc[summer,'Season'] ='Summer'
steps.loc[autumn,'Season'] ='Autumn'

In [None]:
# Preview the first rows of the dataframe
steps.head()

Unnamed: 0,index,Date,Steps,Day,Month,Year,New_date,Weekday,Season
0,6,1 sty,16846,1,January,2023,2023-01-01,Sunday,Winter
1,7,2 sty,3005,2,January,2023,2023-01-02,Monday,Winter
2,8,3 sty,10729,3,January,2023,2023-01-03,Tuesday,Winter
3,9,4 sty,3633,4,January,2023,2023-01-04,Wednesday,Winter
4,10,5 sty,5910,5,January,2023,2023-01-05,Thursday,Winter


In [None]:
# Last ten rows of the dataframe
steps.tail(n=10)

Unnamed: 0,index,Date,Steps,Day,Month,Year,New_date,Weekday,Season
44,50,14 lut,13204,14,February,2023,2023-02-14,Tuesday,Winter
45,51,15 lut,19618,15,February,2023,2023-02-15,Wednesday,Winter
46,52,16 lut,11782,16,February,2023,2023-02-16,Thursday,Winter
47,61,17 lut,5608,17,February,2023,2023-02-17,Friday,Winter
48,62,18 lut,9107,18,February,2023,2023-02-18,Saturday,Winter
49,63,19 lut,12749,19,February,2023,2023-02-19,Sunday,Winter
50,64,20 lut,5176,20,February,2023,2023-02-20,Monday,Winter
51,65,21 lut,10818,21,February,2023,2023-02-21,Tuesday,Winter
52,66,22 lut,8937,22,February,2023,2023-02-22,Wednesday,Winter
53,67,23 lut,10588,23,February,2023,2023-02-23,Thursday,Winter


In [None]:
# Keeping only the analysis columns, in their final order
final_columns = ['New_date', 'Month', 'Weekday', 'Season', 'Steps']
df = steps.loc[:, final_columns]
df.head()

Unnamed: 0,New_date,Month,Weekday,Season,Steps
0,2023-01-01,January,Sunday,Winter,16846
1,2023-01-02,January,Monday,Winter,3005
2,2023-01-03,January,Tuesday,Winter,10729
3,2023-01-04,January,Wednesday,Winter,3633
4,2023-01-05,January,Thursday,Winter,5910


In [None]:
# Checking if the length of dataframe is appropriate
# - If the analysed year is not the current one, a complete dataset has one row
#   per calendar day: 365, or 366 in a leap year.
# - If the analysed year is still running, the row count must equal the
#   day-of-year number of the last recorded date.
# The analysed year comes from `year` rather than a hard-coded literal.

data_year = int(year)
cy = datetime.date.today().year
cdy = (df['New_date'].iloc[-1]).timetuple().tm_yday

if cy != data_year:
  expected_days = 366 if calendar.isleap(data_year) else 365
else:
  expected_days = cdy

if len(df) == expected_days:
  print ("Valid number of days.")
else:
  print ("Invalid number of days.")

Valid number of days.


In [None]:
# Writing the final dataframe to a CSV file in the Outputs folder
output_path = '/content/drive/MyDrive/Colab Notebooks/EDA_with_sports_data/Outputs/Steps_2023.csv'
df.to_csv(output_path)
print("The file has been exported to the intended location.")

The file has been exported to the intended location.


In [None]:
# Annual statistics (sum, mean, max, min) and the calculation of goal achievement
x = df["Steps"]

x1 = str(x.sum())
x2 = str(round(x.mean()))
x3 = str(x.max())
x4 = str(x.min())

# A day counts as achieved when the goal is reached, so exactly 10 000 steps
# qualifies (>=, not >).
goal = df['Steps'] >= 10000
goal = df[goal]
res = len(goal.index)

# Share of the whole calendar year, leap years included
days_in_year = 366 if calendar.isleap(int(year)) else 365
x5 = "{:.0%}".format(res/days_in_year)

# The reported year comes from `year` instead of a hard-coded literal
print(
"In " + str(year) + " I did:\n\n" + x1 + 
" - total steps,\n" + x2 + 
" - average,\n" + x3 + 
" - max in a day,\n" + x4 + 
" - min in a day.\n\nMoreover I had " + 
str(res) + ' days when the goal of 10 000 steps per day was achieved and it constitutes ' + 
x5 +'.')

In 2023 I did:

532259 - total steps,
9857 - average,
25457 - max in a day,
3005 - min in a day.

Moreover I had 22 days when the goal of 10 000 steps per day was achieved and it constitutes 6%.
