In [265]:
import pandas as pd

# Load the raw CSV into a DataFrame and display it.
df = pd.read_csv(filepath_or_buffer='dirtydata.csv')
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


In [32]:
# Show the dtype of each column (the output lists Date as object,
# i.e. it has not been parsed into a datetime yet).
df.dtypes

Duration      int64
Date         object
Pulse         int64
Maxpulse      int64
Calories    float64
dtype: object

In [64]:
# Summarise the frame: number of rows, plus non-null count and dtype per column.
# Columns whose non-null count is below the row count contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  32 non-null     int64  
 1   Date      31 non-null     object 
 2   Pulse     32 non-null     int64  
 3   Maxpulse  32 non-null     int64  
 4   Calories  30 non-null     float64
dtypes: float64(1), int64(3), object(1)
memory usage: 1.4+ KB


In [266]:
# Count the missing entries in each column
df.isna().sum()

Duration    0
Date        1
Pulse       0
Maxpulse    0
Calories    2
dtype: int64

In [267]:
# Discard every row that has at least one missing value
df = df.dropna(axis=0, how='any')
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


In [268]:
# Removing duplicates
# Rebind the de-duplicated result instead of using inplace=True, so this cell
# does not mutate the frame object produced by the previous cell.
df = df.drop_duplicates()
# Non-null count per column, i.e. how many rows are left after de-duplication.
print(df.count())

Duration    28
Date        28
Pulse       28
Maxpulse    28
Calories    28
dtype: int64


In [269]:
# Parse the Date column into datetime values
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,2020-12-01,110,130,409.1
1,60,2020-12-02,117,145,479.0
2,60,2020-12-03,103,135,340.0
3,45,2020-12-04,109,175,282.4
4,45,2020-12-05,117,148,406.0
5,60,2020-12-06,102,127,300.0
6,60,2020-12-07,110,136,374.0
7,450,2020-12-08,104,134,253.3
8,30,2020-12-09,109,133,195.1
9,60,2020-12-10,98,124,269.0


In [270]:
# Set date as index
# set_index returns a new frame with Date moved from a column to the index;
# rebinding it avoids mutating the existing frame in place.
df = df.set_index('Date')
df

Unnamed: 0_level_0,Duration,Pulse,Maxpulse,Calories
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-12-01,60,110,130,409.1
2020-12-02,60,117,145,479.0
2020-12-03,60,103,135,340.0
2020-12-04,45,109,175,282.4
2020-12-05,45,117,148,406.0
2020-12-06,60,102,127,300.0
2020-12-07,60,110,136,374.0
2020-12-08,450,104,134,253.3
2020-12-09,30,109,133,195.1
2020-12-10,60,98,124,269.0


In [271]:
# Fix missing dates
# Build the full daily calendar for the period. The new index is named 'Date'
# because reindexing adopts the target index's name: with an unnamed target the
# 'Date' label is lost and the saved CSV gets a blank header for that column.
idx = pd.date_range('12-01-2020', '12-31-2020', name='Date')
# Make sure the existing index is a DatetimeIndex so labels align with idx.
df.index = pd.DatetimeIndex(df.index)
# Days with no record are inserted as rows whose values are all 0.
df = df.reindex(idx, fill_value=0)
df

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
2020-12-01,60,110,130,409.1
2020-12-02,60,117,145,479.0
2020-12-03,60,103,135,340.0
2020-12-04,45,109,175,282.4
2020-12-05,45,117,148,406.0
2020-12-06,60,102,127,300.0
2020-12-07,60,110,136,374.0
2020-12-08,450,104,134,253.3
2020-12-09,30,109,133,195.1
2020-12-10,60,98,124,269.0


In [272]:
# Replacing missing values using median
# Missing days are marked with 0 (the fill value used when reindexing), so the
# median is taken over the non-zero readings only. Including the placeholders
# in the calculation would pull the median toward 0.
for column in ['Duration', 'Pulse', 'Maxpulse', 'Calories']:
    observed_median = df.loc[df[column] != 0, column].median()
    df[column] = df[column].replace(0, value=observed_median)
df

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
2020-12-01,60,110,130,409.1
2020-12-02,60,117,145,479.0
2020-12-03,60,103,135,340.0
2020-12-04,45,109,175,282.4
2020-12-05,45,117,148,406.0
2020-12-06,60,102,127,300.0
2020-12-07,60,110,136,374.0
2020-12-08,450,104,134,253.3
2020-12-09,30,109,133,195.1
2020-12-10,60,98,124,269.0


In [256]:
# Show the rows whose Duration is not one of the expected values
expected_durations = [30, 45, 60]
has_expected_duration = df['Duration'].isin(expected_durations)
df[~has_expected_duration]

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
2020-12-08,450,104,134,253.3


In [258]:
# Map the out-of-range Duration 450 to 45 (presumably a data-entry typo).
df['Duration'] = df['Duration'].replace({450: 45})
df

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
2020-12-01,60,110,130,409.1
2020-12-02,60,117,145,479.0
2020-12-03,60,103,135,340.0
2020-12-04,45,109,175,282.4
2020-12-05,45,117,148,406.0
2020-12-06,60,102,127,300.0
2020-12-07,60,110,136,374.0
2020-12-08,45,104,134,253.3
2020-12-09,30,109,133,195.1
2020-12-10,60,98,124,269.0


In [259]:
# Write the cleaned frame (index included) to a CSV file
df.to_csv(path_or_buf="clean_data.csv")