In [84]:
import pandas as pd
from google.colab import drive

# Mount Google Drive at /drive so the CSV stored there can be read.
drive.mount('/drive')
# Load the dataset from the mounted drive into the working DataFrame.
dc = pd.read_csv('/drive/MyDrive/data.csv')

# Show the basic statistical description of the data using DataFrame.describe().
# Left as the cell's last expression so the notebook renders it as a rich table
# instead of the plain-text dump that print() produces.
dc.describe()

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).
         Duration       Pulse    Maxpulse     Calories
count  169.000000  169.000000  169.000000   164.000000
mean    63.846154  107.461538  134.047337   375.790244
std     42.299949   14.510259   16.450434   266.379919
min     15.000000   80.000000  100.000000    50.300000
25%     45.000000  100.000000  124.000000   250.925000
50%     60.000000  105.000000  131.000000   318.600000
75%     60.000000  111.000000  141.000000   387.600000
max    300.000000  159.000000  184.000000  1860.400000


In [86]:
# Check whether the data has any null values.
# NOTE: this only reports the state of the raw data when run right after the
# load cell; once the imputation below has run, `dc` no longer contains nulls.
print('Are there any null values present in data: ', dc.isnull().values.any())

# Replace the null values in each numeric column with that column's mean.
# Re-assigning (instead of inplace=True) avoids mutating the frame behind the
# reader's back, and numeric_only=True keeps the mean well-defined if the
# frame ever gains a non-numeric column.
dc = dc.fillna(dc.mean(numeric_only=True))
print('Are there any null values after using fillna: ', dc.isnull().values.any())

Are there any null values present in data:  False
Are there any null values after using fillna:  False


In [88]:
# Group the rows by Duration and summarise Calories within each group with
# mean, min, max and count. Selecting [['Calories']] (a one-column frame)
# keeps the two-level (column, statistic) header in the result.
aggregat = (
    dc.groupby('Duration')[['Calories']]
      .agg(['mean', 'min', 'max', 'count'])
)
aggregat

Unnamed: 0_level_0,Calories,Calories,Calories,Calories
Unnamed: 0_level_1,mean,min,max,count
Duration,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
15,87.35,50.5,124.2,2
20,151.6,50.3,229.4,9
25,244.2,244.2,244.2,1
30,192.125,86.2,319.2,16
45,279.096585,100.7,406.0,35
60,341.046465,215.2,486.0,79
75,325.4,320.4,330.4,2
80,643.1,643.1,643.1,1
90,541.8,466.4,700.0,8
120,666.833333,500.0,1000.1,3


In [89]:
# Keep only the rows whose Calories value lies in the closed range [500, 1000];
# Series.between includes both endpoints by default.
dc[dc['Calories'].between(500, 1000)]

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
51,80,123,146,643.1
62,160,109,135,853.0
65,180,90,130,800.4
66,150,105,135,873.4
67,150,107,130,816.0
72,90,100,127,700.0
73,150,97,127,953.2
75,90,98,125,563.2
78,120,100,130,500.4
83,120,100,130,500.0


In [90]:
# Keep only the rows with Calories strictly above 500 and Pulse strictly below 100.
dc.loc[dc['Calories'].gt(500) & dc['Pulse'].lt(100)]

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
65,180,90,130,800.4
70,150,97,129,1115.0
73,150,97,127,953.2
75,90,98,125,563.2
99,90,93,124,604.1
103,90,90,100,500.4
106,180,90,120,800.3
108,90,90,120,500.3


In [92]:
# Create a new "dc_modified" dataframe that contains all the columns from dc
# except for "Maxpulse".
# Dropping the unwanted column (rather than listing the columns to keep) keeps
# every other column automatically; errors='ignore' lets the cell still run
# if "Maxpulse" has already been removed from dc.
dc_modified = dc.drop(columns='Maxpulse', errors='ignore')
dc_modified

Unnamed: 0,Duration,Pulse,Calories
0,60,110,409.1
1,60,117,479.0
2,60,103,340.0
3,45,109,282.4
4,45,117,406.0
...,...,...,...
164,60,105,290.8
165,60,110,300.0
166,60,115,310.2
167,75,120,320.4


In [93]:
# Delete the "Maxpulse" column from the main dc dataframe.
# errors='ignore' makes the cell safe to re-run: without it a second run
# raises KeyError because the column is already gone.
dc = dc.drop(columns='Maxpulse', errors='ignore')
dc

Unnamed: 0,Duration,Pulse,Calories
0,60,110,409.1
1,60,117,479.0
2,60,103,340.0
3,45,109,282.4
4,45,117,406.0
...,...,...,...
164,60,105,290.8
165,60,110,300.0
166,60,115,310.2
167,75,120,320.4


In [94]:
# Cast the Calories column to 64-bit integers, leaving the other columns as
# they are, then list the resulting column dtypes.
dc = dc.astype({'Calories': 'int64'})
dc.dtypes

Duration    int64
Pulse       int64
Calories    int64
dtype: object