## **Pandas**

Pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool, built on top of the Python programming language.

Series [1D]

DataFrame [2D]

Panel [3D, df in df] (removed in pandas 0.25, so it is not available in the pandas version used below)

In [1]:
import pandas as pd

In [3]:
#check version
pd.__version__

'1.5.3'

## pd.Series(data,index,dtype)

One-dimensional ndarray with axis labels (including time series).


In [4]:
# empty Series: state the dtype explicitly -- a bare pd.Series() relies on the
# deprecated float64 default for empty data, which differs between pandas versions
pd.Series(dtype="float64")

  pd.Series()


Series([], dtype: float64)

In [5]:
pd.Series?

In [6]:
pd.Series(data=None,index=None,dtype=None)

  pd.Series(data=None,index=None,dtype=None)


Series([], dtype: float64)

In [7]:
# pandas announces that the default dtype for an empty Series will become 'object';
# passing dtype explicitly (here: object) makes the result unambiguous
pd.Series(dtype=object)

Series([], dtype: object)

## Features of a Series
Its values are mutable (a Series is value-mutable but size-immutable)

Supports array-style indexing and slicing by default

1-dimensional data structure

We can manipulate a Series

Duplicate values and duplicate index labels are allowed


In [8]:
#Deal with data
k = [10,23.44,'python',560000]
k

[10, 23.44, 'python', 560000]

In [9]:
k = pd.Series(k)
k

0        10
1     23.44
2    python
3    560000
dtype: object

In [10]:
# int--> float-->complex--> str/object
pd.Series([1,'2',3.])

0      1
1      2
2    3.0
dtype: object

In [12]:
# upcasting: pandas promotes the ints to float so that all values share one dtype (float64)
pd.Series([1,2,3,4.])


0    1.0
1    2.0
2    3.0
3    4.0
dtype: float64

In [13]:
a = pd.Series([1,2,3])
a.__sizeof__()

152

In [14]:
b = pd.Series(['1','2','3'])
b.__sizeof__()

302

In [15]:
a.dtype,b.dtype

(dtype('int64'), dtype('O'))

# Check the properties of Series

In [16]:
k

0        10
1     23.44
2    python
3    560000
dtype: object

In [18]:
#check dimension
k.ndim

1

In [19]:
#check total no. of elements(rows)
k.size

4

In [20]:
len(k)

4

In [21]:
k.count()

4

In [22]:
import numpy as np

# NaN = "Not a Number", the marker for a missing value
t = pd.Series(data=[np.nan, 1, 2, np.nan])
t

0    NaN
1    1.0
2    2.0
3    NaN
dtype: float64

In [23]:
t.count()
# Return number of non-NA/null observations in the Series.

2

In [24]:
t.size  # size counts every element, NaN values included

4

In [26]:
# check number of rows and columns
# as Series is 1D so we dont have columns
t.shape

(4,)

In [28]:
# check data type
t.dtype

dtype('float64')

In [30]:
k.dtype

dtype('O')

In [31]:
# check memory usage in bytes
k.memory_usage()

160

In [32]:
k

0        10
1     23.44
2    python
3    560000
dtype: object

In [33]:
#fetch data only
k.values

array([10, 23.44, 'python', 560000], dtype=object)

In [34]:
#fetch index
k.index

RangeIndex(start=0, stop=4, step=1)

#  Apply some statistical or Mathematical operations

In [35]:
a = pd.Series([120,1,9000,22,18])
a

0     120
1       1
2    9000
3      22
4      18
dtype: int64

In [36]:
# find out max value
a.max()

9000

In [37]:
max(a)

9000

In [38]:
# find out min
a.min()

1

In [39]:
min(a)

1

In [40]:
# find out mean/average
a.mean()

1832.2

In [42]:
a

0     120
1       1
2    9000
3      22
4      18
dtype: int64

In [44]:
# median: sort the values and take the middle one
# e.g. [4,10,2,1,7] --> sort records --> [1,2,4,7,10] --> middle value --> 4
# (with an even number of values, the two middle values are averaged)
a.median()

22.0

In [45]:
# mode: most frequent value(s); when every value occurs equally often, all of them are returned
a.mode()

0       1
1      18
2      22
3     120
4    9000
dtype: int64

In [46]:
b = pd.Series([100,230,450,100,100])
b

0    100
1    230
2    450
3    100
4    100
dtype: int64

In [47]:
# 100 occurs more often than any other value, so it is the only mode
b.mode()

0    100
dtype: int64

In [48]:
type(a)

In [50]:
k.value_counts()

10        1
23.44     1
python    1
560000    1
dtype: int64

In [51]:
n = pd.Series(['Mangesh','Roshni','Roshni','Suvarna'])
n

0    Mangesh
1     Roshni
2     Roshni
3    Suvarna
dtype: object

In [52]:
n.mode()

0    Roshni
dtype: object

In [53]:
# when we want categorywise count then use value_counts()
n.value_counts()

Roshni     2
Mangesh    1
Suvarna    1
dtype: int64

In [54]:
# check unique records
n.unique()

array(['Mangesh', 'Roshni', 'Suvarna'], dtype=object)

In [55]:
# count unique records
n.nunique()

3

# append() (deprecated since pandas 1.4 and removed in 2.0; use pd.concat() instead)

In [56]:
a,k

(0     120
 1       1
 2    9000
 3      22
 4      18
 dtype: int64,
 0        10
 1     23.44
 2    python
 3    560000
 dtype: object)

In [57]:
# Series.append() is deprecated (pandas 1.4) and removed in pandas 2.0;
# pd.concat() gives the same result and keeps the original index labels
pd.concat([a, k])

  a.append(k)


0       120
1         1
2      9000
3        22
4        18
0        10
1     23.44
2    python
3    560000
dtype: object

In [58]:
# pd.concat() replaces the deprecated Series.append();
# ignore_index=True renumbers the combined Series from 0 to n-1
pd.concat([a, k], ignore_index=True)

  a.append(k,ignore_index=True)


0       120
1         1
2      9000
3        22
4        18
5        10
6     23.44
7    python
8    560000
dtype: object

# apply()

In [59]:
a

0     120
1       1
2    9000
3      22
4      18
dtype: int64

In [60]:
# add 1000 to each number in the Series
# (the lambda is called once per element; the vectorised a + 1000 gives the same values)
a.apply(lambda num:num+1000)

0     1120
1     1001
2    10000
3     1022
4     1018
dtype: int64

In [61]:
a # apply() returned a new Series; a itself is unchanged

0     120
1       1
2    9000
3      22
4      18
dtype: int64

In [62]:
def square(num):
    """Return ``num`` raised to the power of two.

    Works for anything that supports the power operator, so it can be used
    element by element through ``Series.apply``.
    """
    return pow(num, 2)

a.apply(square)

0       14400
1           1
2    81000000
3         484
4         324
dtype: int64

In [63]:
n

0    Mangesh
1     Roshni
2     Roshni
3    Suvarna
dtype: object

In [64]:
n.apply(lambda name:name.upper())

0    MANGESH
1     ROSHNI
2     ROSHNI
3    SUVARNA
dtype: object

# astype(): for typecasting

The astype() method returns a new Series (or DataFrame) in which the data type has been changed to the specified type; the original object is left unchanged.

In [65]:
import pandas as pd
#pd.Series(data,index,dtype)
a = pd.Series([20,3400,506,1,23,30])
a

0      20
1    3400
2     506
3       1
4      23
5      30
dtype: int64

In [66]:
#change data type of Series 'a'
a.astype('float')

0      20.0
1    3400.0
2     506.0
3       1.0
4      23.0
5      30.0
dtype: float64

In [67]:
r = pd.Series(['10','20','30'])
r

0    10
1    20
2    30
dtype: object

In [68]:
# NOTE: r holds strings, so the result below looks like the values were joined
# into '102030' and then divided by 3 (102030 / 3 = 34010.0) instead of averaged.
# NOTE(review): newer pandas versions (2.x) are expected to raise TypeError here -- confirm.
r.mean()

34010.0

In [70]:
# the answer above is wrong: r holds strings (dtype object), not numbers
# so the values must be converted to int first
# astype('int') converts the Series from object to int before taking the mean
r.astype('int').mean()

20.0

# describe()

Generate descriptive statistics.

returns statistical information

In [71]:
# describe() on numeric data returns count/mean/std/min/quartiles/max;
# on object (string) data such as this Series it returns count/unique/top/freq
nm = pd.Series(['amit','amol','akshay','amol','amol'])
nm

0      amit
1      amol
2    akshay
3      amol
4      amol
dtype: object

In [72]:
nm.describe()

count        5
unique       3
top       amol
freq         3
dtype: object

In [73]:
a

0      20
1    3400
2     506
3       1
4      23
5      30
dtype: int64

In [74]:
a.describe()

count       6.000000
mean      663.333333
std      1354.828353
min         1.000000
25%        20.750000
50%        26.500000
75%       387.000000
max      3400.000000
dtype: float64

# diff()

Calculates the difference between each element and the one before it (current_num - prev_num); the first element has no predecessor, so its result is NaN.

In [75]:
a

0      20
1    3400
2     506
3       1
4      23
5      30
dtype: int64

In [76]:
a.diff()

0       NaN
1    3380.0
2   -2894.0
3    -505.0
4      22.0
5       7.0
dtype: float64

In [77]:
d = pd.Series([7,7,7,5,5,5,5,5,5])
d

0    7
1    7
2    7
3    5
4    5
5    5
6    5
7    5
8    5
dtype: int64

In [78]:
d.diff()

0    NaN
1    0.0
2    0.0
3   -2.0
4    0.0
5    0.0
6    0.0
7    0.0
8    0.0
dtype: float64

# work on index

In [79]:
# At the time of creation
pd.Series([10,20,30,40,50])
# default index starts from 0 and stop at n-1

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [80]:
# pd.Series(data,index,dtype)
pd.Series([10,20,30,40,50],['A','B','C','D','E'])

A    10
B    20
C    30
D    40
E    50
dtype: int64

In [81]:
# swap the arguments: the letters are now the data and the numbers are the index
pd.Series(['A','B','C','D','E'],[10,20,30,40,50])

10    A
20    B
30    C
40    D
50    E
dtype: object

In [82]:
# with keyword arguments, index and data can be given in either order
pd.Series(index=['A','B','C','D','E'],data=[10,20,30,40,50])

A    10
B    20
C    30
D    40
E    50
dtype: int64

In [83]:
# values and labels kept in separate variables so they can be reused later
dt = [10, 20, 30, 40, 50]
index = ['A', 'B', 'C', 'D', 'E']
pd.Series(data=dt, index=index)

A    10
B    20
C    30
D    40
E    50
dtype: int64

In [84]:
import numpy as np
y = pd.Series(dt,np.arange(21,26))
y

21    10
22    20
23    30
24    40
25    50
dtype: int64

# if we want to change an index of already created Series

In [85]:
y

21    10
22    20
23    30
24    40
25    50
dtype: int64

In [86]:
y.reset_index()  # returns a new object with the old index as a column; y itself is not modified

Unnamed: 0,index,0
0,21,10
1,22,20
2,23,30
3,24,40
4,25,50


In [87]:
y.reset_index(drop=True)
# drop =  True means it will drop index column generated and returns Series

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [88]:
y

21    10
22    20
23    30
24    40
25    50
dtype: int64