In [1]:
# Mount Google Drive at /content/gdrive; `google.colab` is supplied by the Colab runtime.
# NOTE(review): no later cell in this notebook references /content/gdrive —
# confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Raw powerlifting records, read directly from the GitHub-hosted CSV.
POWERLIFTING_CSV_URL = 'https://raw.githubusercontent.com/rafeshankar/Powerlifting-Analysis/main/powerlifting_dataset.csv'
df = pd.read_csv(filepath_or_buffer=POWERLIFTING_CSV_URL)

In [4]:
# Column names, dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Lifter Name         3000 non-null   object
 1   Age                 3000 non-null   int64 
 2   Weight Class        3000 non-null   object
 3   Lift Type           3000 non-null   object
 4   Amount Lifted (kg)  3000 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 117.3+ KB


In [5]:
# Peek at the first five raw rows (five is head()'s default).
df.head(n=5)

Unnamed: 0,Lifter Name,Age,Weight Class,Lift Type,Amount Lifted (kg)
0,Jessica Wilson,46,59 kg,Bench Press,269
1,John Doe,60,83 kg,Bench Press,179
2,Emily Davis,41,105 kg,Bench Press,235
3,Emily Davis,33,66 kg,Squat,359
4,Laura Taylor,56,74 kg,Deadlift,221


## Identify duplicates

In [6]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep='first').sum()

1

In [7]:
# Display the repeated rows themselves.
repeat_mask = df.duplicated()
df.loc[repeat_mask]

Unnamed: 0,Lifter Name,Age,Weight Class,Lift Type,Amount Lifted (kg)
2229,Michael Johnson,22,93 kg,Deadlift,116


In [8]:
# Keep the first occurrence of every repeated row. Index labels are left as-is,
# so the dropped row leaves a gap in the index.
df = df.drop_duplicates(keep='first')

In [9]:
# Re-check after dropping: no repeated rows should remain.
df.duplicated(keep='first').sum()

0

## Identify null values

In [10]:
# Missing values per column (isna is the same check as isnull).
df.isna().sum()

Unnamed: 0,0
Lifter Name,0
Age,0
Weight Class,0
Lift Type,0
Amount Lifted (kg),0


## String cleaning

In [11]:
# Remove the unit suffix so only the class label remains ('59 kg' -> '59').
# regex=False pins literal matching whatever the installed pandas version's
# default is, and .str.strip() removes the space that sat in front of 'kg'.
df['Weight Class'] = (
    df['Weight Class']
    .str.replace('kg', '', regex=False)
    .str.strip()
)

In [12]:
# Frequency of each weight-class label after removing the unit.
weight_class_counts = df['Weight Class'].value_counts()
weight_class_counts

Unnamed: 0_level_0,count
Weight Class,Unnamed: 1_level_1
93,358
Open,356
59,348
83,327
66,326
52,326
74,321
120,319
105,318


In [13]:
# Same frequency table as the preceding cell; no transformation happens between the two.
df['Weight Class'].value_counts()

Unnamed: 0_level_0,count
Weight Class,Unnamed: 1_level_1
93,358
Open,356
59,348
83,327
66,326
52,326
74,321
120,319
105,318


In [14]:
# Rows whose weight class is the non-numeric 'Open' label.
# Bound to `open_class_rows` rather than `open` so the builtin open() is not
# shadowed for the rest of the kernel session.
open_class_rows = df.loc[df['Weight Class'] == 'Open']
open_class_rows

Unnamed: 0,Lifter Name,Age,Weight Class,Lift Type,Amount Lifted (kg)
34,Emily Davis,35,Open,Squat,376
42,John Doe,49,Open,Deadlift,217
52,Jessica Wilson,20,Open,Squat,133
56,Chris Brown,51,Open,Squat,226
66,Emily Davis,40,Open,Deadlift,365
...,...,...,...,...,...
2977,Jessica Wilson,45,Open,Squat,147
2979,Jane Smith,41,Open,Bench Press,175
2990,Laura Taylor,23,Open,Deadlift,287
2996,Daniel Lee,39,Open,Deadlift,244


In [15]:
# Recode the 'Open' label as the number 200 so the column can later be cast to int.
# NOTE(review): 200 looks like a sentinel above the heaviest numeric class (120),
# not a real weight limit — confirm before using it numerically.
df['Weight Class'] = df['Weight Class'].replace({'Open': 200})

In [16]:
# Confirm 'Open' now shows up as 200 with the same row count.
df['Weight Class'].value_counts(sort=True)

Unnamed: 0_level_0,count
Weight Class,Unnamed: 1_level_1
93,358
200,356
59,348
83,327
66,326
52,326
74,321
120,319
105,318


In [17]:
# Every label is numeric by now, so cast the column to integers.
df = df.astype({'Weight Class': int})

In [18]:
# Drop the unit from the header; values are unchanged.
df = df.rename({'Amount Lifted (kg)': 'Amount Lifted'}, axis='columns')

In [19]:
# One-hot encode the lift type; the original 'Lift Type' column is replaced by
# one indicator column per lift.
df = pd.get_dummies(data=df, columns=['Lift Type'], prefix_sep='_')

In [20]:
# Normalise headers to snake_case: lower-case, trim, spaces -> underscores.
snake_case_columns = df.columns.str.lower().str.strip().str.replace(' ', '_')
df.columns = snake_case_columns

In [21]:
# Distinct lifter names, most frequent first.
list(df['lifter_name'].value_counts().index)

['Emily Davis',
 'Jessica Wilson',
 'Jane Smith',
 'Chris Brown',
 'Laura Taylor',
 'John Doe',
 'Michael Johnson',
 'Daniel Lee',
 'Matthew Anderson',
 'Sarah Thomas']

In [22]:
# Hand-assigned gender label for each lifter name that occurs in the data.
# NOTE(review): the raw dataset has no gender column, so these labels are the
# author's own assignment per name — confirm they are acceptable for the analysis.
female_lifters = ['Emily Davis', 'Jessica Wilson', 'Jane Smith', 'Laura Taylor', 'Sarah Thomas']
male_lifters = ['Chris Brown', 'John Doe', 'Michael Johnson', 'Daniel Lee', 'Matthew Anderson']

name_to_gender = {
    **dict.fromkeys(female_lifters, 'female'),
    **dict.fromkeys(male_lifters, 'male'),
}

In [23]:
# Attach the hand-assigned gender label to every row via the lifter's name.
df['gender'] = df['lifter_name'].map(name_to_gender)

# Series.map yields NaN for any name missing from the dict; fail loudly instead
# of letting blank genders slip into the exported data.
names_without_gender = df.loc[df['gender'].isna(), 'lifter_name'].unique().tolist()
if names_without_gender:
    raise ValueError(f"No gender mapping for: {names_without_gender}")

In [24]:
# First five rows of the cleaned frame, including the new indicator and gender columns.
df.head(n=5)

Unnamed: 0,lifter_name,age,weight_class,amount_lifted,lift_type_bench_press,lift_type_deadlift,lift_type_squat,gender
0,Jessica Wilson,46,59,269,True,False,False,female
1,John Doe,60,83,179,True,False,False,male
2,Emily Davis,41,105,235,True,False,False,female
3,Emily Davis,33,66,359,False,False,True,female
4,Laura Taylor,56,74,221,False,True,False,female


In [25]:
# Dtypes and non-null counts after cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2999 entries, 0 to 2999
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   lifter_name            2999 non-null   object
 1   age                    2999 non-null   int64 
 2   weight_class           2999 non-null   int64 
 3   amount_lifted          2999 non-null   int64 
 4   lift_type_bench_press  2999 non-null   bool  
 5   lift_type_deadlift     2999 non-null   bool  
 6   lift_type_squat        2999 non-null   bool  
 7   gender                 2999 non-null   object
dtypes: bool(3), int64(3), object(2)
memory usage: 149.4+ KB


In [26]:
# (rows, columns) of the cleaned frame.
df.shape

(2999, 8)

In [27]:
# Persist the cleaned frame as CSV without the index column.
# NOTE(review): the target is a relative path with no '.csv' extension, so it
# lands in the kernel's working directory — confirm that is the intended location and name.
df.to_csv(path_or_buf='cleaned_data', index=False)