# Introduction

This notebook takes a first look at the raw data we scraped so we can work out which transformations and clean-up steps are needed. We won't go into much detail on distributions or analytics here; the focus is on making sure the data is usable.

In [34]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Relative path to the raw CSV that the rest of this notebook loads.
RAW_DATA_PATH = "../data/raw/openpowerlifting.csv"

In [23]:
# Read the whole raw CSV into memory; no dtype hints are passed, so pandas
# infers each column's type on its own.
df = pd.read_csv(RAW_DATA_PATH)

In [24]:
print(df.columns)

Index(['Number', 'Name', 'Instagram Handle', 'Origin', 'Federation',
       'Competition Date', 'Competition Country', 'Competition City', 'Gender',
       'Equipment', 'Age', 'Weight', 'Class', 'Squat', 'Bench', 'Deadlift',
       'Total', 'Dots'],
      dtype='object')


In [25]:
print(df.head())

   Number               Name Instagram Handle  Origin Federation  \
0   28301      Brandon Homer              NaN     USA       USPA   
1   28302        Brian Knust              NaN     USA       USPA   
2   28303    Alina Sarycheva              NaN  Russia        FPR   
3   28304  Nancy Honeysuckle              NaN     USA       USPA   
4   28305      Hector Medina              NaN     USA       USPA   

  Competition Date Competition Country Competition City Gender Equipment  Age  \
0       2023-01-28                 USA               TX      M       Raw   45   
1       2015-08-22                 USA               TX      M     Wraps   31   
2       2017-02-08              Russia              NIZ      F       Raw  17~   
3       2021-07-31                 USA               TX      F       Raw   32   
4       2022-03-19                 USA               CA      M       Raw   22   

   Weight Class  Squat  Bench  Deadlift   Total    Dots  
0   215.4   220  556.6  396.8     575.4  1528.

In [26]:
print(f'there are {len(df)} records')

there are 451800 records


In [27]:
df.describe()

Unnamed: 0,Number,Weight,Squat,Bench,Deadlift,Total,Dots
count,451800.0,451800.0,326940.0,423897.0,364841.0,451800.0,451800.0
mean,225968.716025,184.053829,362.027168,243.775077,417.623612,828.80701,273.9777
std,130530.930731,47.13008,132.892094,99.467008,132.305954,433.83273,120.755537
min,1.0,35.0,2.2,10.3,2.2,52.5,33.69
25%,112950.75,148.8,253.5,154.3,308.6,450.0,158.22
50%,225900.5,179.6,358.2,248.0,424.4,804.7,303.35
75%,338850.25,213.4,451.9,314.1,513.6,1179.4,365.63
max,452800.0,573.2,1102.3,782.6,1030.6,2606.9,709.96


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 451800 entries, 0 to 451799
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Number               451800 non-null  int64  
 1   Name                 451800 non-null  object 
 2   Instagram Handle     9183 non-null    object 
 3   Origin               265552 non-null  object 
 4   Federation           451800 non-null  object 
 5   Competition Date     451800 non-null  object 
 6   Competition Country  451800 non-null  object 
 7   Competition City     307999 non-null  object 
 8   Gender               451800 non-null  object 
 9   Equipment            451800 non-null  object 
 10  Age                  350687 non-null  object 
 11  Weight               451800 non-null  float64
 12  Class                447815 non-null  object 
 13  Squat                326940 non-null  float64
 14  Bench                423897 non-null  float64
 15  Deadlift         

Looking at the info above, the `Age` column should be `float64`, but it was loaded as `object`. Let us see why that is and then convert it to a float.

In [29]:
df["Age"].unique()

array(['45', '31', '17~', '32', '22', '43', '27', '30~', '28', '33', '18',
       '27~', '21', '32~', '20', '41', '29', '38~', '24', '39', '26~',
       '31~', '44~', '19', '25~', '26', '22~', nan, '40', '50~', '23',
       '23~', '18~', '20~', '34', '30', '36', '25', '28~', '29~', '34~',
       '44', '37', '42', '24~', '48', '39~', '35', '21~', '17', '19~',
       '38', '47', '54', '16~', '51~', '47~', '15~', '41~', '46', '40~',
       '60~', '35~', '45~', '55', '43~', '16', '37~', '59', '62', '52~',
       '56~', '14~', '49~', '60', '51', '48~', '36~', '15', '59~', '64',
       '79', '7', '53', '12', '10', '49', '14', '58', '9', '33~', '13',
       '56', '50', '11', '42~', '58~', '52', '57', '46~', '67', '13~',
       '61', '61~', '72', '54~', '63~', '57~', '73', '53~', '69~', '73~',
       '71', '81', '76', '66', '10~', '9~', '67~', '68~', '62~', '75',
       '11~', '12~', '63', '65~', '70', '65', '70~', '66~', '8', '68',
       '77', '75~', '74', '72~', '80', '64~', '7~', '69', '77

We can see that some values carry a trailing `~`, indicating an approximate age. We can strip that character in order to convert the `Age` column to floats.

In [35]:
# Strip the "~" approximation marker from Age and store the column as float64.
# The dtype guard keeps this cell safe to re-run: once Age is already numeric
# the .str accessor would raise AttributeError, so the string clean-up is
# skipped and only the (no-op) float cast is applied.
age_values = df["Age"]
if not pd.api.types.is_numeric_dtype(age_values):
    # regex=False: "~" is matched as a literal character, independent of the
    # pandas-version-specific default for `regex`.
    age_values = age_values.str.replace("~", "", regex=False)
df["Age"] = age_values.astype(np.float64)

In [36]:
df["Age"].unique()

array([45., 31., 17., 32., 22., 43., 27., 30., 28., 33., 18., 21., 20.,
       41., 29., 38., 24., 39., 26., 44., 19., 25., nan, 40., 50., 23.,
       34., 36., 37., 42., 48., 35., 47., 54., 16., 51., 15., 46., 60.,
       55., 59., 62., 52., 56., 14., 49., 64., 79.,  7., 53., 12., 10.,
       58.,  9., 13., 11., 57., 67., 61., 72., 63., 73., 69., 71., 81.,
       76., 66., 68., 75., 65., 70.,  8., 77., 74., 80., 78., 84.,  4.,
       82., 88., 83.,  6.,  5., 92., 85.,  0., 89., 96., 86., 87., 93.,
        2., 90., 91.,  1.])

In [38]:
df.head()

Unnamed: 0,Number,Name,Instagram Handle,Origin,Federation,Competition Date,Competition Country,Competition City,Gender,Equipment,Age,Weight,Class,Squat,Bench,Deadlift,Total,Dots
0,28301,Brandon Homer,,USA,USPA,2023-01-28,USA,TX,M,Raw,45.0,215.4,220,556.6,396.8,575.4,1528.9,431.26
1,28302,Brian Knust,,USA,USPA,2015-08-22,USA,TX,M,Wraps,31.0,268.9,275,573.2,440.9,650.3,1664.5,431.25
2,28303,Alina Sarycheva,,Russia,FPR,2017-02-08,Russia,NIZ,F,Raw,17.0,119.9,125,264.5,237.0,303.1,804.7,431.25
3,28304,Nancy Honeysuckle,,USA,USPA,2021-07-31,USA,TX,F,Raw,32.0,180.5,181,385.8,209.4,425.5,1020.7,431.25
4,28305,Hector Medina,,USA,USPA,2022-03-19,USA,CA,M,Raw,22.0,163.8,165,429.9,303.1,584.2,1317.2,431.25


In [39]:
# Order the rows by ascending "Number"; the result gets its own name so the
# unsorted frame in `df` is left as it was.
df_sorted = df.sort_values("Number", ascending=True)

In [40]:
df_sorted.head()

Unnamed: 0,Number,Name,Instagram Handle,Origin,Federation,Competition Date,Competition Country,Competition City,Gender,Equipment,Age,Weight,Class,Squat,Bench,Deadlift,Total,Dots
100,1,Marianna Gasparyan,power_mayan,USA,WRPF,2019-04-27,USA,CA,F,Wraps,30.0,123.4,123,573.2,292.1,485.0,1350.3,709.96
101,2,Kristy Hawkins,kristy_hawkins,USA,WRPF,2022-07-29,USA,VA,F,Wraps,41.0,163.8,165,639.3,336.2,611.7,1587.3,704.76
102,3,Hunter Henderson #1,huntermhenderson,USA,WRPF,2021-04-24,USA,CA,F,Wraps,26.0,163.8,165,650.3,325.1,573.2,1548.7,687.63
103,4,Chakera Ingram,c.c_ingram,USA,USPA,2019-08-03,USA,TX,F,Wraps,24.0,177.2,181,639.3,374.8,595.2,1609.3,686.14
104,5,Stefanie Cohen,steficohen,USA,WRPF,2019-04-27,USA,CA,F,Wraps,27.0,119.9,123,507.0,242.5,529.1,1278.6,685.45


Because the `Number` column isn't exactly in order in the raw file, we sorted by it above. We can now remove it and reset the index so the data is cleaner.

In [41]:
# Discard the "Number" column from the sorted frame and renumber the rows
# from 0 in their sorted order (the old index is thrown away, not kept as a column).
df = (
    df_sorted
    .drop(columns="Number")
    .reset_index(drop=True)
)

In [42]:
df.head()

Unnamed: 0,Name,Instagram Handle,Origin,Federation,Competition Date,Competition Country,Competition City,Gender,Equipment,Age,Weight,Class,Squat,Bench,Deadlift,Total,Dots
0,Marianna Gasparyan,power_mayan,USA,WRPF,2019-04-27,USA,CA,F,Wraps,30.0,123.4,123,573.2,292.1,485.0,1350.3,709.96
1,Kristy Hawkins,kristy_hawkins,USA,WRPF,2022-07-29,USA,VA,F,Wraps,41.0,163.8,165,639.3,336.2,611.7,1587.3,704.76
2,Hunter Henderson #1,huntermhenderson,USA,WRPF,2021-04-24,USA,CA,F,Wraps,26.0,163.8,165,650.3,325.1,573.2,1548.7,687.63
3,Chakera Ingram,c.c_ingram,USA,USPA,2019-08-03,USA,TX,F,Wraps,24.0,177.2,181,639.3,374.8,595.2,1609.3,686.14
4,Stefanie Cohen,steficohen,USA,WRPF,2019-04-27,USA,CA,F,Wraps,27.0,119.9,123,507.0,242.5,529.1,1278.6,685.45


Now we will go through all of the columns to make sure that the data looks okay.

In [43]:
df["Origin"].unique()

array(['USA', 'Australia', 'Russia', 'Canada', 'Germany', nan,
       'South Africa', 'England', 'Czechia', 'Finland', 'France', 'UK',
       'Italy', 'Ukraine', 'Wales', 'Argentina', 'Kazakhstan', 'Poland',
       'New Zealand', 'South Korea', 'Israel', 'Ireland',
       'US Virgin Islands', 'Greece', 'Brazil', 'Bulgaria', 'Iran',
       'Colombia', 'Switzerland', 'Sweden', 'Japan', 'Thailand', 'Latvia',
       'Azerbaijan', 'Hungary', 'Türkiye', 'Norway', 'Taiwan', 'Iceland',
       'Algeria', 'Spain', 'Egypt', 'Belgium', 'Mexico', 'Belarus',
       'Nauru', 'Libya', 'Netherlands', 'Slovakia', 'Guyana', 'Chile',
       'Croatia', 'Serbia', 'Portugal', 'Georgia', 'Singapore',
       'British Virgin Islands', 'China', 'Lithuania', 'Scotland',
       'Bolivia', 'Ecuador', 'Slovenia', 'Austria', 'Indonesia',
       'Jamaica', 'UAE', 'Estonia', 'Vietnam', 'Venezuela', 'Cyprus',
       'Denmark', 'Luxembourg', 'Kyrgyzstan', 'Belize',
       'Bosnia and Herzegovina', 'Philippines', 'Cameroo

In [44]:
df["Federation"].unique()

array(['WRPF', 'USPA', 'XPC', 'USA-UA', 'RPS', 'WRPF-AUS', 'SPF', 'FPO',
       'WPC-SA', 'IPA', 'APF', 'ProRaw', 'UPA', 'USAPL', 'RUPC', 'AusPL',
       'GPC-GUPU', '365Strong', 'IPF', 'BPU', 'IPL', 'WPA-RUS', 'USPC',
       'BB', 'WRPF-CAN', 'APA', 'IrishPO', 'XPS', 'RhinoPC', 'GPC',
       'UPC-Germany', 'WPC-Finland', 'GPC-AUS', 'NAP', 'Ireland-UA',
       'CommonwealthPF', 'MM', 'EPF', 'NPA', 'FPR', 'AMP', 'ARPL', 'AAU',
       'WPC', 'WPUF', 'GPA', 'CONBRAP', 'CAPO', 'NSF', 'GPC-GB',
       'WPC-RUS', 'FIPL', 'NAPF', 'RPU', 'SSF', 'CPF', 'JPA', 'SVNL',
       'AsianPF', 'WPC-Latvia', 'WPAU', 'XPC-Poland', 'PA', 'CPL',
       'TPSSF', 'BP', 'RAWU', 'CPA', 'GPC-NZ', 'USPF', 'Hardcore',
       'WRPF-Iceland', 'IKF', 'KPF', 'WUAP-SVK', 'USVIPF', 'GPC-CAN',
       'APC', 'NZPF', 'GPC-RUS', 'WPPL-Belarus', 'SAPF', 'HPC',
       'WRPF-Belarus', 'WRPF-Ireland', 'WPF-RUS', 'WRPF-Spain', 'FFForce',
       'FESUPO', 'URPF', 'NASA', 'CPU', 'WPC-Poland', 'IrishPF', 'WDFPF',
       'WRPF-Bulga

In [45]:
df["Competition Date"]

0         2019-04-27
1         2022-07-29
2         2021-04-24
3         2019-08-03
4         2019-04-27
             ...    
451795    2016-02-05
451796    2013-06-08
451797    2017-04-20
451798    2020-02-15
451799    2020-12-25
Name: Competition Date, Length: 451800, dtype: object

We can see that this is stored as an `object` column, but it may be useful to store it as a datetime type.

In [46]:
# Parse the "Competition Date" strings into datetimes and attach them as a new
# "Date" column; the original string column is kept alongside it.
parsed_dates = pd.to_datetime(df["Competition Date"])
df["Date"] = parsed_dates

In [51]:
df["Date"].dtype

dtype('<M8[ns]')

In [53]:
df.head()

Unnamed: 0,Name,Instagram Handle,Origin,Federation,Competition Date,Competition Country,Competition City,Gender,Equipment,Age,Weight,Class,Squat,Bench,Deadlift,Total,Dots,Date
0,Marianna Gasparyan,power_mayan,USA,WRPF,2019-04-27,USA,CA,F,Wraps,30.0,123.4,123,573.2,292.1,485.0,1350.3,709.96,2019-04-27
1,Kristy Hawkins,kristy_hawkins,USA,WRPF,2022-07-29,USA,VA,F,Wraps,41.0,163.8,165,639.3,336.2,611.7,1587.3,704.76,2022-07-29
2,Hunter Henderson #1,huntermhenderson,USA,WRPF,2021-04-24,USA,CA,F,Wraps,26.0,163.8,165,650.3,325.1,573.2,1548.7,687.63,2021-04-24
3,Chakera Ingram,c.c_ingram,USA,USPA,2019-08-03,USA,TX,F,Wraps,24.0,177.2,181,639.3,374.8,595.2,1609.3,686.14,2019-08-03
4,Stefanie Cohen,steficohen,USA,WRPF,2019-04-27,USA,CA,F,Wraps,27.0,119.9,123,507.0,242.5,529.1,1278.6,685.45,2019-04-27


In [54]:
df["Competition Country"].unique()

array(['USA', 'Australia', 'Finland', 'South Africa', 'Russia', 'Ukraine',
       'England', 'UK', 'Malta', 'Canada', 'Ireland', 'Wales',
       'Argentina', 'Germany', 'New Zealand', 'South Korea', 'Sweden',
       'Israel', 'Brazil', 'Norway', 'Colombia', 'Italy', 'Kyrgyzstan',
       'Poland', 'Kazakhstan', 'Hungary', 'Japan', 'UAE', 'Latvia',
       'Czechia', 'Türkiye', 'Slovakia', 'Iceland', 'Luxembourg',
       'Portugal', 'US Virgin Islands', 'Belarus', 'Serbia', 'Lithuania',
       'Spain', 'France', 'Uruguay', 'Bulgaria', 'Morocco', 'Chile',
       'Austria', 'China', 'Mexico', 'Egypt', 'Greece', 'India',
       'Bosnia and Herzegovina', 'Denmark', 'Azerbaijan', 'Scotland',
       'Netherlands', 'Jamaica', 'Malaysia', 'Belgium', 'Slovenia',
       'Costa Rica', 'Iran', 'Belize', 'Georgia', 'Qatar', 'Estonia',
       'Kuwait', 'Mongolia', 'Bolivia', 'Cameroon', 'Peru', 'Croatia',
       'Switzerland', 'Singapore', 'Vietnam', 'USSR', 'Ecuador',
       'N.Ireland', 'Uganda', 'Ho

In [55]:
df["Competition City"].unique()

array(['CA', 'VA', 'TX', 'OH', 'NC', 'FL', 'NSW', nan, 'GT', 'MO', 'NY',
       'OK', 'VIC', 'IA', 'TN', 'PA', 'MOW', 'IL', 'QLD', 'MD', 'NJ',
       'KHA', 'AR', 'AB', 'KY', 'UT', 'SN', 'RYA', 'KYA', 'NV', 'IN',
       'NIZ', 'KRS', 'TAS', 'OR', 'SC', 'SA', 'ROS', 'QC', 'KS', 'MI',
       'LEN', 'HI', 'MS', 'VLA', 'MA', 'GA', 'LA', 'WI', 'AL', 'WA',
       'NVS', 'ON', 'AZ', 'NH', 'SVE', 'WV', 'VGG', 'NE', 'WKO', 'PNZ',
       'CT', 'KIR', 'AKL', 'KDA', 'BC', 'TUL', 'CO', 'ME', 'MV', 'VT',
       'ACT', 'NM', 'BE', 'STA', 'BA', 'CHE', 'HUN', 'RI', 'MN', 'NB',
       'ND', 'MB', 'JL', 'NS', 'AK', 'WGN', 'ULY', 'VLG', 'TYU', 'WG',
       'ARK', 'CRE', 'RM', 'MT', 'CN', 'PE', 'ID', 'ORL', 'NRW', 'YAR',
       'AMU', 'CAN', 'HKB', 'BW', 'CB', 'MWT', 'BRY', 'IRK', 'CM', 'SAR',
       'SD', 'NL', 'DE', 'OMS', 'NTL', 'WM', 'WY', 'OTA', 'VOR', 'KLU',
       'ZAB', 'NW', 'PRI', 'KGD', 'SK', 'SE', 'TAM', 'SH', 'SAM', 'GD',
       'RS', 'DC', 'BOP', 'SP', 'DF', 'PER', 'ATT', 'PEL', 'ALT', 'CQ',


In [56]:
df["Gender"].unique()

array(['F', 'M', 'Mx'], dtype=object)

In [57]:
df["Equipment"].unique()

array(['Wraps', 'Raw'], dtype=object)

One thing to note is that some of the column names contain spaces, which could become a problem with certain plugins/databases.

In [58]:
df.columns

Index(['Name', 'Instagram Handle', 'Origin', 'Federation', 'Competition Date',
       'Competition Country', 'Competition City', 'Gender', 'Equipment', 'Age',
       'Weight', 'Class', 'Squat', 'Bench', 'Deadlift', 'Total', 'Dots',
       'Date'],
      dtype='object')

In [59]:
# Swap every space in the column labels for an underscore. The space is
# matched literally (regex=False), which gives the same labels as a pattern match.
df.columns = df.columns.str.replace(" ", "_", regex=False)

In [62]:
print(df.columns)

Index(['Name', 'Instagram_Handle', 'Origin', 'Federation', 'Competition_Date',
       'Competition_Country', 'Competition_City', 'Gender', 'Equipment', 'Age',
       'Weight', 'Class', 'Squat', 'Bench', 'Deadlift', 'Total', 'Dots',
       'Date'],
      dtype='object')


# Conclusion
Everything now looks pretty good. Next, let us implement these steps in a proper Python script so they are easier to refactor, and save the processed data to a new location where we can do further analysis.