**Data import**

In [189]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import glob
import os
import calendar
import datetime

%matplotlib inline

In [152]:
# Building the glob pattern that matches every CSV file in the data folder
data_dir = "/content/drive/MyDrive/Colab Notebooks/EDA_with_sports_data/Data_2"
joined_files = os.path.join(data_dir, "*.csv")

In [153]:
# Paths of all files matched by the pattern
# NOTE(review): glob.glob does not guarantee any ordering of its results, so the
# row order of the merged dataframe follows whatever order the files are listed in.
joined_list = glob.glob(joined_files)

# First version of the dataframe: read every file and stack them under a fresh index
frames = [pd.read_csv(csv_path) for csv_path in joined_list]
steps = pd.concat(frames, ignore_index=True)

# Month names listed in calendar order
new_order = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]

**Data check**

In [154]:
# Checking the dimensions of the dataframe as (number of rows, number of columns)
steps.shape

(196, 3)

In [155]:
# Checking the data type of each column
steps.dtypes

Unnamed: 0    object
Bieżąca        int64
Cel            int64
dtype: object

In [156]:
# Displaying 5 randomly chosen rows of the dataframe
# (no random_state is passed, so a different set of rows is shown on every run)
steps.sample(5)

Unnamed: 0.1,Unnamed: 0,Bieżąca,Cel
161,7 cze,2170,10000
118,25 kwi,3611,10000
68,6 mar,4035,10000
113,20 kwi,8610,10000
99,6 kwi,3347,10000


In [157]:
# Displaying the first 10 rows of the merged dataframe
steps.head(10)

Unnamed: 0.1,Unnamed: 0,Bieżąca,Cel
0,28 gru,5617,9220
1,29 gru,12020,9220
2,30 gru,3887,9220
3,31 gru,2762,9220
4,1 sty,4006,10000
5,2 sty,10672,10000
6,3 sty,3986,10000
7,4 sty,5205,10000
8,5 sty,5380,10000
9,6 sty,5576,10000


In [158]:
# Dropping the leading rows that belong to the previous year (the late-December entries).
# The slice is positional, so re-running this cell removes further rows each time.
previous_year_rows = 4
steps = steps.iloc[previous_year_rows:]
steps.head()

Unnamed: 0.1,Unnamed: 0,Bieżąca,Cel
4,1 sty,4006,10000
5,2 sty,10672,10000
6,3 sty,3986,10000
7,4 sty,5205,10000
8,5 sty,5380,10000


**Data cleaning**

In [159]:
# Keeping every column except the last one
kept_columns = steps.shape[1] - 1
steps = steps.iloc[:, :kept_columns]
steps.columns

Index(['Unnamed: 0', 'Bieżąca'], dtype='object')

In [160]:
# Giving the columns English names
column_names = {'Unnamed: 0': 'Date', 'Bieżąca': 'Steps'}
steps = steps.rename(columns=column_names)
steps.head()

Unnamed: 0,Date,Steps
4,1 sty,4006
5,2 sty,10672
6,3 sty,3986
7,4 sty,5205
8,5 sty,5380


In [161]:
# Normalising the Date column to stripped strings, then splitting it on the first
# space into Day and Month; Year is the constant '2022'.
# Day is left-padded with zeros to two characters.

date_text = steps['Date'].astype('str').str.strip()
date_parts = date_text.str.split(" ", n=1, expand=True)

steps['Date'] = date_text
steps['Day'] = date_parts[0].str.zfill(2)
steps['Month'] = date_parts[1]
steps['Year'] = '2022'
steps.head()

Unnamed: 0,Date,Steps,Day,Month,Year
4,1 sty,4006,1,sty,2022
5,2 sty,10672,2,sty,2022
6,3 sty,3986,3,sty,2022
7,4 sty,5205,4,sty,2022
8,5 sty,5380,5,sty,2022


In [162]:
# Replacing Polish month abbreviations with zero-padded month numbers
month_numbers = {
    'sty': '01', 'lut': '02', 'mar': '03', 'kwi': '04',
    'maj': '05', 'cze': '06', 'lip': '07', 'sie': '08',
    'wrz': '09', 'paź': '10', 'lis': '11', 'gru': '12',
}
steps['Month'] = steps['Month'].replace(month_numbers)
steps.head()

Unnamed: 0,Date,Steps,Day,Month,Year
4,1 sty,4006,1,1,2022
5,2 sty,10672,2,1,2022
6,3 sty,3986,3,1,2022
7,4 sty,5205,4,1,2022
8,5 sty,5380,5,1,2022


In [163]:
# Joining Year, Month and Day with '-' into a single YYYY-MM-DD string column (New_date)
steps['New_date'] = steps['Year'].str.cat([steps['Month'], steps['Day']], sep='-')
steps.head()

Unnamed: 0,Date,Steps,Day,Month,Year,New_date
4,1 sty,4006,1,1,2022,2022-01-01
5,2 sty,10672,2,1,2022,2022-01-02
6,3 sty,3986,3,1,2022,2022-01-03
7,4 sty,5205,4,1,2022,2022-01-04
8,5 sty,5380,5,1,2022,2022-01-05


In [164]:
# Looking for dates that occur more than once (every occurrence is flagged)
duplicate_mask = steps.duplicated(subset=['New_date'], keep=False)
check = steps.loc[duplicate_mask]

print("there are duplicates." if duplicate_mask.any() else "There are no duplicates.")

There are no duplicates.


In [165]:
# Ordering the rows chronologically; New_date is still a YYYY-MM-DD string here,
# so the lexicographic sort matches date order
steps = steps.sort_values("New_date")
steps.head()

Unnamed: 0,Date,Steps,Day,Month,Year,New_date
4,1 sty,4006,1,1,2022,2022-01-01
5,2 sty,10672,2,1,2022,2022-01-02
6,3 sty,3986,3,1,2022,2022-01-03
7,4 sty,5205,4,1,2022,2022-01-04
8,5 sty,5380,5,1,2022,2022-01-05


In [166]:
# Changing the type of the New_date column from string to datetime.
# pd.to_datetime is used instead of datetime.strptime: the notebook imports the
# `datetime` module itself (import datetime), which has no `strptime` attribute,
# so the previous call fails on a fresh kernel. The explicit format keeps the
# parsing strict (YYYY-MM-DD) and the conversion is vectorised.
steps['New_date'] = pd.to_datetime(steps['New_date'], format="%Y-%m-%d")
print("Type of New_data column has been changed.")

Type of New_data column has been changed.


In [168]:
# Adding the English weekday name, replacing Month with the English month name,
# and moving the old index into a regular 'index' column
date_accessor = steps['New_date'].dt
steps['Weekday'] = date_accessor.day_name()
steps['Month'] = date_accessor.month_name()
steps = steps.reset_index()
steps.head()

Unnamed: 0,index,Date,Steps,Day,Month,Year,New_date,Weekday
0,4,1 sty,4006,1,January,2022,2022-01-01,Saturday
1,5,2 sty,10672,2,January,2022,2022-01-02,Sunday
2,6,3 sty,3986,3,January,2022,2022-01-03,Monday
3,7,4 sty,5205,4,January,2022,2022-01-04,Tuesday
4,8,5 sty,5380,5,January,2022,2022-01-05,Wednesday


In [173]:
# Assigning seasons to dates.
# Every boundary is inclusive and the ranges do not overlap: each season starts
# the day after the previous one ends. Winter is split in two because it spans
# the turn of the year. (Previously winter_2 started on 2022-12-21, the same day
# autumn ends, so the label of that day depended on the assignment order below.)
new_date = steps['New_date']

winter_1 = (new_date >= '2022-01-01') & (new_date <= '2022-03-20')
spring = (new_date >= '2022-03-21') & (new_date <= '2022-06-21')
summer = (new_date >= '2022-06-22') & (new_date <= '2022-09-22')
autumn = (new_date >= '2022-09-23') & (new_date <= '2022-12-21')
winter_2 = (new_date >= '2022-12-22')

steps.loc[winter_1, 'Season'] = 'Winter'
steps.loc[spring, 'Season'] = 'Spring'
steps.loc[summer, 'Season'] = 'Summer'
steps.loc[autumn, 'Season'] = 'Autumn'
steps.loc[winter_2, 'Season'] = 'Winter'

In [174]:
# Displaying the first 5 rows of the dataframe
steps.head()

Unnamed: 0,index,Date,Steps,Day,Month,Year,New_date,Weekday,Season
0,4,1 sty,4006,1,January,2022,2022-01-01,Saturday,Winter
1,5,2 sty,10672,2,January,2022,2022-01-02,Sunday,Winter
2,6,3 sty,3986,3,January,2022,2022-01-03,Monday,Winter
3,7,4 sty,5205,4,January,2022,2022-01-04,Tuesday,Winter
4,8,5 sty,5380,5,January,2022,2022-01-05,Wednesday,Winter


In [175]:
# Displaying the last 5 rows of the dataframe
steps.tail()

Unnamed: 0,index,Date,Steps,Day,Month,Year,New_date,Weekday,Season
187,191,7 lip,8651,7,July,2022,2022-07-07,Thursday,Summer
188,192,8 lip,15781,8,July,2022,2022-07-08,Friday,Summer
189,193,9 lip,13552,9,July,2022,2022-07-09,Saturday,Summer
190,194,10 lip,7368,10,July,2022,2022-07-10,Sunday,Summer
191,195,11 lip,8328,11,July,2022,2022-07-11,Monday,Summer


In [177]:
# Reordering the columns
df = steps[['New_date', 'Month', 'Weekday','Season','Steps']]
df.head()

Unnamed: 0,New_date,Month,Weekday,Season,Steps
0,2022-01-01,January,Saturday,Winter,4006
1,2022-01-02,January,Sunday,Winter,10672
2,2022-01-03,January,Monday,Winter,3986
3,2022-01-04,January,Tuesday,Winter,5205
4,2022-01-05,January,Wednesday,Winter,5380


In [200]:
# Checking that the dataframe holds the expected number of daily rows

cy = datetime.date.today().year                      # calendar year at run time
cdy = df['New_date'].iloc[-1].timetuple().tm_yday    # day of the year of the last row

# When the notebook runs in a year other than 2022, 365 rows are expected;
# when it runs during 2022, the row count must equal the day of the year of the last row.
expected_days = cdy if cy == 2022 else 365

if len(df) == expected_days:
    print("Valid number of days.")
else:
    print("Invalid number of days.")

Valid number of days.


In [203]:
# Exporting the dataframe; the file name carries the day-of-year suffix unless
# the last row is day 365
if cdy == 365:
    export_path = '/content/drive/MyDrive/Colab Notebooks/EDA_with_sports_data/Outputs/Steps_2022.csv'
else:
    export_path = '/content/drive/MyDrive/Colab Notebooks/EDA_with_sports_data/Outputs/Steps_2022_' + str(cdy) + '.csv'

df.to_csv(export_path)
print("The file has been exported to the intended location.")

The file has been exported to the intended location.
