In [1]:
import pandas as pd

# NOTE(review): private pandas internal that none of the visible cells use — confirm it is still needed.
from pandas.core.computation.check import NUMEXPR_INSTALLED


# Data Reading

In [2]:
# Load the CSV file (path is relative to the notebook's working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer="data.csv")

In [3]:
# Preview the first five rows of the DataFrame
df.head(n=5)

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0


In [5]:
# Preview the last five rows of the DataFrame
df.tail(n=5)

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
27,60,'2020/12/27',92,118,241.0
28,60,'2020/12/28',103,132,
29,60,'2020/12/29',100,132,280.0
30,60,'2020/12/30',102,129,380.3
31,60,'2020/12/31',92,115,243.0


In [6]:
# Dimensions of the DataFrame as a (number_of_rows, number_of_columns) tuple
df.shape

(32, 5)

In [7]:
# Column labels of the DataFrame (returned as a pandas Index)
df.columns

Index(['Duration', 'Date', 'Pulse', 'Maxpulse', 'Calories'], dtype='object')

In [8]:
# Pull out a single column by its label; the result is a Series
df.loc[:, "Pulse"]

0     110
1     117
2     103
3     109
4     117
5     102
6     110
7     104
8     109
9      98
10    103
11    100
12    100
13    106
14    104
15     98
16     98
17    100
18     90
19    103
20     97
21    108
22    100
23    130
24    105
25    102
26    100
27     92
28    103
29    100
30    102
31     92
Name: Pulse, dtype: int64

# Indexing

### Position-based indexing uses the `iloc` indexer, which takes slice syntax just like list slicing.
### df.iloc[row_start : row_end , col_start : col_end]   (the end positions are exclusive)

In [14]:
# Select rows by position: the first eight rows (stop position 8 is exclusive), all columns
df.iloc[0:8, :]

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3


In [15]:
# Select columns by position: every row, the first three columns (stop position 3 is exclusive)
df.iloc[:, 0:3]

Unnamed: 0,Duration,Date,Pulse
0,60,'2020/12/01',110
1,60,'2020/12/02',117
2,60,'2020/12/03',103
3,45,'2020/12/04',109
4,45,'2020/12/05',117
5,60,'2020/12/06',102
6,60,'2020/12/07',110
7,450,'2020/12/08',104
8,30,'2020/12/09',109
9,60,'2020/12/10',98


In [21]:
# Boolean-mask filtering: keep only the rows whose Pulse value equals 98
df.loc[df["Pulse"].eq(98)]

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
9,60,'2020/12/10',98,124,269.0
15,60,'2020/12/15',98,123,275.0
16,60,'2020/12/16',98,120,215.2


# Creating your own dataframe

In [23]:
# Build a small example DataFrame from a dict of equal-length lists (one list per column).
# NOTE: this rebinds `df`, so from here on `df` no longer refers to the frame read from the CSV.
data = dict(
    name=['Kunal', 'Ankit', 'Purva', 'Pranav', 'Sunil'],
    age=[42, 52, 36, 24, 73],
    maths=[4, 24, 31, 2, 3],
    computers=[25, 94, 57, 62, 70],
)

# `columns` pins the column order explicitly
df = pd.DataFrame(data=data, columns=['name', 'age', 'maths', 'computers'])
df

Unnamed: 0,name,age,maths,computers
0,Kunal,42,4,25
1,Ankit,52,24,94
2,Purva,36,31,57
3,Pranav,24,2,62
4,Sunil,73,3,70


In [25]:
# Total marks scored in each subject (computers and maths)
computers_total = df['computers'].sum()
maths_total = df['maths'].sum()

print("Total sum of scores achieved in Computer = ", computers_total)
print("Total sum of scores achieved in Maths = ", maths_total)

Total sum of scores achieved in Computer =  308
Total sum of scores achieved in Maths =  64


In [26]:
# Arithmetic mean of each subject's marks
maths_mean = df['maths'].mean()
computers_mean = df['computers'].mean()

print("Mean score of Maths = ", maths_mean)
print("Mean score of Computers = ", computers_mean)

Mean score of Maths =  12.8
Mean score of Computers =  61.6


In [27]:
# Running (cumulative) total of the computers column, row by row
df.loc[:, 'computers'].cumsum()

0     25
1    119
2    176
3    238
4    308
Name: computers, dtype: int64

In [28]:
# Summary statistics (count, mean, std, min, quartiles, max) for the computers column
df.loc[:, 'computers'].describe()

count     5.000000
mean     61.600000
std      24.905823
min      25.000000
25%      57.000000
50%      62.000000
75%      70.000000
max      94.000000
Name: computers, dtype: float64

In [29]:
# Smallest value in the maths column
df.loc[:, 'maths'].min()

2

In [30]:
# Largest value in the maths column
df.loc[:, 'maths'].max()

31

In [31]:
# Median (middle value) of the computers column
df.loc[:, 'computers'].median()

62.0

In [32]:
# Sample variance of the computers column (ddof=1, i.e. divide by N - 1; this is the pandas default)
df['computers'].var(ddof=1)

620.3

In [33]:
# Sample standard deviation of the computers column (ddof=1, i.e. divide by N - 1; this is the pandas default)
df['computers'].std(ddof=1)

24.90582261239327

In [36]:
# Skewness of the computers column (asymmetry of its distribution)
df.loc[:, 'computers'].skew()

-0.3916155077340477

In [37]:
# Kurtosis of the computers column (pandas uses Fisher's definition, where a normal distribution scores 0.0)
df['computers'].kurtosis()

1.4271505503426738

In [40]:
# Correlation matrix: pairwise correlation (Pearson, the pandas default) between the numeric columns.
# Columns are chosen by dtype rather than by position, so this keeps working
# if the column order changes or another non-numeric column is added.
df.select_dtypes(include="number").corr()

Unnamed: 0,age,maths,computers
age,1.0,-0.105651,0.328852
maths,-0.105651,1.0,0.378039
computers,0.328852,0.378039,1.0


In [41]:
# Covariance matrix: pairwise sample covariance between the numeric columns.
# Columns are chosen by dtype rather than by position, so this keeps working
# if the column order changes or another non-numeric column is added.
df.select_dtypes(include="number").cov()

Unnamed: 0,age,maths,computers
age,340.8,-26.65,151.2
maths,-26.65,186.7,128.65
computers,151.2,128.65,620.3
