In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('max_pulse.csv')

In [6]:
data.shape

(169, 4)

In [3]:
data.head()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  169 non-null    int64  
 1   Pulse     169 non-null    int64  
 2   Maxpulse  169 non-null    int64  
 3   Calories  164 non-null    float64
dtypes: float64(1), int64(3)
memory usage: 5.4 KB


In [8]:
# removing null values
data.dropna()

# with (inplace=True) it updates original data frame

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0
...,...,...,...,...
164,60,105,140,290.8
165,60,110,145,300.0
166,60,115,145,310.2
167,75,120,150,320.4


In [9]:
# fill null values
data.fillna(120)

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0
...,...,...,...,...
164,60,105,140,290.8
165,60,110,145,300.0
166,60,115,145,310.2
167,75,120,150,320.4


In [12]:
# fill values in specific column
data.fillna({'Calories':120})

# (inplace = True) updade it original dataframe

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0
...,...,...,...,...
164,60,105,140,290.8
165,60,110,145,300.0
166,60,115,145,310.2
167,75,120,150,320.4


In [18]:
import numpy as np

# common ways to replace empty cells, is to calculate mean, median or mode
val1 = np.mean(data['Calories'])
val2 = np.median(data['Calories'])
val3 = data['Calories'].mode()[0]
print(val1)
print(val2)
print(val3)

375.79024390243904
nan
300.0


In [24]:
# wrong format of data
pd.to_datetime('20201226', format='mixed')

Timestamp('2020-12-26 00:00:00')

In [23]:
# if data have NaT value in data, it will consider as a null value
# data.dropna(subset=['Date'], inplace = True)

In [27]:
# clearing wrong data, in row 7 450 to 45
data.loc[7, 'Duration'] = 45
data.iloc[7]

Duration     45.0
Pulse       104.0
Maxpulse    134.0
Calories    253.3
Name: 7, dtype: float64

In [28]:
print(max(data['Duration']))

300


In [30]:
# bound the data with duration 120
for i in data.index:
    if data.loc[i,'Duration'] > 120:
        data.loc[i, 'Duration'] = 120

In [32]:
# remove rows which have duration greater than 120
for i in data.index:
    if data.loc[i, 'Duration'] > 120:
        data.drop(i, inplace=True)

In [33]:
# Removing Duplicates
data.duplicated().sum()

7

In [34]:
data.drop_duplicates().duplicated().sum()

0

In [40]:
# remove from particular columns
data.drop_duplicates(subset=['Duration'])

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
3,45,109,175,282.4
8,30,109,133,195.1
51,80,123,146,643.1
58,20,153,172,226.4
60,120,108,160,1376.0
72,90,100,127,700.0
93,15,80,100,50.5
97,25,152,168,244.2
167,75,120,150,320.4


In [42]:
# drop duplicates from multiple columns
data.drop_duplicates(subset=['Duration', 'Maxpulse'])

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0
...,...,...,...,...
149,60,110,150,409.4
153,30,150,167,275.8
161,45,90,130,260.4
163,45,100,140,280.9


In [43]:
# drop_duplicates() keeps the first occurence of duplicate rows
# for removing those too we set parameter keep = False
data.drop_duplicates(keep=False)

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0
...,...,...,...,...
164,60,105,140,290.8
165,60,110,145,300.0
166,60,115,145,310.2
167,75,120,150,320.4
