# Introduction to Pandas

In [1]:
import numpy as np
import pandas as pd

In [2]:
print(pd.__version__)

1.3.2


In [3]:
print(dir(pd))

['BooleanDtype', 'Categorical', 'CategoricalDtype', 'CategoricalIndex', 'DataFrame', 'DateOffset', 'DatetimeIndex', 'DatetimeTZDtype', 'ExcelFile', 'ExcelWriter', 'Flags', 'Float32Dtype', 'Float64Dtype', 'Float64Index', 'Grouper', 'HDFStore', 'Index', 'IndexSlice', 'Int16Dtype', 'Int32Dtype', 'Int64Dtype', 'Int64Index', 'Int8Dtype', 'Interval', 'IntervalDtype', 'IntervalIndex', 'MultiIndex', 'NA', 'NaT', 'NamedAgg', 'Period', 'PeriodDtype', 'PeriodIndex', 'RangeIndex', 'Series', 'SparseDtype', 'StringDtype', 'Timedelta', 'TimedeltaIndex', 'Timestamp', 'UInt16Dtype', 'UInt32Dtype', 'UInt64Dtype', 'UInt64Index', 'UInt8Dtype', '__builtins__', '__cached__', '__doc__', '__docformat__', '__file__', '__getattr__', '__git_version__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', '_config', '_hashtable', '_is_numpy_dev', '_lib', '_libs', '_np_version_under1p18', '_testing', '_tslib', '_typing', '_version', 'api', 'array', 'arrays', 'bdate_range', 'compat', 'con

# - Pandas Series

### empty series

In [4]:
# Create an empty Series with an explicit dtype.
# A bare pd.Series() emits a DeprecationWarning on pandas 1.x, and its default
# dtype changed from float64 to object in pandas 2.0; naming the dtype keeps the
# result identical on every version.
s = pd.Series(dtype="float64")
s

  s = pd.Series()


Series([], dtype: float64)

### series with list/tuple data

In [5]:
l = [1,2,3,4]
s = pd.Series(l)
s

0    1
1    2
2    3
3    4
dtype: int64

In [6]:
l = (1,2,3,4)
s = pd.Series(l)
s

0    1
1    2
2    3
3    4
dtype: int64

In [7]:
l = (1,2,3,"4")
s = pd.Series(l)
s

0    1
1    2
2    3
3    4
dtype: object

### series using numpy array

In [8]:
a = np.array([1,2,3,4])
print(a)

[1 2 3 4]


In [9]:
s = pd.Series(a)
s

0    1
1    2
2    3
3    4
dtype: int64

### series using dictionary

In [10]:
d = {'A':1,"B":2,"C":3}
print(d)

{'A': 1, 'B': 2, 'C': 3}


In [11]:
s = pd.Series(d)
s

A    1
B    2
C    3
dtype: int64

### series using index names

In [12]:
l = [1,2,3,4]
s = pd.Series(l,index=["A","B","C","D"])
s

A    1
B    2
C    3
D    4
dtype: int64

In [13]:
# l = [1,2,3,4]
# s = pd.Series(l,index=["A","B","C"])
# s

In [14]:
# l = [1,2,3,4]
# s = pd.Series(l,index=["A","B","C","D","E"])
# s

In [15]:
# l = [1]
# s = pd.Series(l,index=["A","B"])
# s

### series with scalar values

In [16]:
l = 40
s = pd.Series(l,index=["A","B","C"])
s

A    40
B    40
C    40
dtype: int64

In [17]:
l = 40
s = pd.Series(l,index=["A","B","C"],dtype="int8")
s

A    40
B    40
C    40
dtype: int8

## Indexing/Slicing for Series

In [18]:
a = [10,20,30,40,50]
s = pd.Series(a)
s

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [19]:
s[0]

10

In [20]:
s[3]

40

In [21]:
a = [10,20,30,40,50]
s = pd.Series(a,index=list("ABCDE"))
s

A    10
B    20
C    30
D    40
E    50
dtype: int64

In [22]:
s["A"]

10

In [23]:
# Positional access on a label-indexed Series: use .iloc explicitly.
# Plain s[0] relies on an integer-key fallback to positions that is deprecated
# in pandas 2.1+; .iloc[0] returns the same first element on every version.
s.iloc[0]

10

In [24]:
s[["A","C","D"]]

A    10
C    30
D    40
dtype: int64

In [25]:
s[2:]

C    30
D    40
E    50
dtype: int64

In [26]:
s[2:4]

C    30
D    40
dtype: int64

In [27]:
s["A":]

A    10
B    20
C    30
D    40
E    50
dtype: int64

In [28]:
s["A":'D']

A    10
B    20
C    30
D    40
dtype: int64

# Data Frames

## Empty DataFrame

In [29]:
df = pd.DataFrame()
print(df)

Empty DataFrame
Columns: []
Index: []


## Using Default option to create DataFrame

In [30]:
df = pd.DataFrame([1,2,3])
df

Unnamed: 0,0
0,1
1,2
2,3


## DataFrame using List/Tuple

In [31]:
a = [1,2,3,4]
df = pd.DataFrame(a)
df

Unnamed: 0,0
0,1
1,2
2,3
3,4


## DataFrame using numpy array

In [32]:
a = np.random.randint(1,10,15).reshape(5,3)
a

array([[8, 7, 9],
       [3, 2, 2],
       [2, 7, 5],
       [2, 1, 2],
       [2, 2, 9]])

In [33]:
df = pd.DataFrame(a)
df

Unnamed: 0,0,1,2
0,8,7,9
1,3,2,2
2,2,7,5
3,2,1,2
4,2,2,9


In [34]:
df = pd.DataFrame(a,columns=list("ABC"))
df

Unnamed: 0,A,B,C
0,8,7,9
1,3,2,2
2,2,7,5
3,2,1,2
4,2,2,9


## DataFrame using Dictionary

In [35]:
d = {"A":1,"B":2,"C":3}
df = pd.DataFrame(d,index=[100,101,102])
df

Unnamed: 0,A,B,C
100,1,2,3
101,1,2,3
102,1,2,3


## Attributes and Methods of a DataFrame

In [36]:
a = np.random.randint(1,20,30).reshape(5,6)
a

array([[ 8,  8, 17,  7, 14,  9],
       [10, 16, 15,  4,  9,  2],
       [ 9, 10,  7,  4, 15,  6],
       [18, 15, 12,  6,  9, 15],
       [18, 10,  7,  1, 14, 11]])

In [37]:
df = pd.DataFrame(a,columns=list("ABCDEF"))
df

Unnamed: 0,A,B,C,D,E,F
0,8,8,17,7,14,9
1,10,16,15,4,9,2
2,9,10,7,4,15,6
3,18,15,12,6,9,15
4,18,10,7,1,14,11


In [38]:
print(df.dtypes)
print(df.size)
print(df.values)
print(df.shape)
print(df.items())
print(df.index)
print(df.columns)
print(df.keys())

A    int64
B    int64
C    int64
D    int64
E    int64
F    int64
dtype: object
30
[[ 8  8 17  7 14  9]
 [10 16 15  4  9  2]
 [ 9 10  7  4 15  6]
 [18 15 12  6  9 15]
 [18 10  7  1 14 11]]
(5, 6)
<generator object DataFrame.items at 0x7fec17df2e40>
RangeIndex(start=0, stop=5, step=1)
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')


In [39]:
df.describe() # per-column summary statistics: count, mean, std, min, quartiles (25%/50%/75%) and max

Unnamed: 0,A,B,C,D,E,F
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,12.6,11.8,11.6,4.4,12.2,8.6
std,4.97996,3.49285,4.560702,2.302173,2.949576,4.929503
min,8.0,8.0,7.0,1.0,9.0,2.0
25%,9.0,10.0,7.0,4.0,9.0,6.0
50%,10.0,10.0,12.0,4.0,14.0,9.0
75%,18.0,15.0,15.0,6.0,14.0,11.0
max,18.0,16.0,17.0,7.0,15.0,15.0


## setting name for rows and columns

In [40]:
df

Unnamed: 0,A,B,C,D,E,F
0,8,8,17,7,14,9
1,10,16,15,4,9,2
2,9,10,7,4,15,6
3,18,15,12,6,9,15
4,18,10,7,1,14,11


In [41]:
print(df)
print(df.rename(index={1:100,4:"B"},columns={"A":"ABC","D":"DEF"}))
print(df)

    A   B   C  D   E   F
0   8   8  17  7  14   9
1  10  16  15  4   9   2
2   9  10   7  4  15   6
3  18  15  12  6   9  15
4  18  10   7  1  14  11
     ABC   B   C  DEF   E   F
0      8   8  17    7  14   9
100   10  16  15    4   9   2
2      9  10   7    4  15   6
3     18  15  12    6   9  15
B     18  10   7    1  14  11
    A   B   C  D   E   F
0   8   8  17  7  14   9
1  10  16  15  4   9   2
2   9  10   7  4  15   6
3  18  15  12  6   9  15
4  18  10   7  1  14  11


In [42]:
print(df)
print(df.rename(index={1:100,4:"B"},columns={"A":"ABC","D":"DEF"},inplace=True))
print(df)

    A   B   C  D   E   F
0   8   8  17  7  14   9
1  10  16  15  4   9   2
2   9  10   7  4  15   6
3  18  15  12  6   9  15
4  18  10   7  1  14  11
None
     ABC   B   C  DEF   E   F
0      8   8  17    7  14   9
100   10  16  15    4   9   2
2      9  10   7    4  15   6
3     18  15  12    6   9  15
B     18  10   7    1  14  11


## Indexing/Slicing for DataFrame

In [43]:
a = np.random.randint(1,20,30).reshape(5,6)
df = pd.DataFrame(a,columns=list("ABCDEF"))
df

Unnamed: 0,A,B,C,D,E,F
0,2,8,3,1,7,9
1,7,6,5,9,1,3
2,10,9,14,8,18,18
3,13,16,1,6,16,16
4,12,18,16,14,17,14


## using `[]` notation

In [44]:
df["A"]

0     2
1     7
2    10
3    13
4    12
Name: A, dtype: int64

In [45]:
df[["A","C"]]

Unnamed: 0,A,C
0,2,3
1,7,5
2,10,14
3,13,1
4,12,16


In [46]:
df[["A","C"]]

Unnamed: 0,A,C
0,2,3
1,7,5
2,10,14
3,13,1
4,12,16


## using `.` notation

In [47]:
df.A

0     2
1     7
2    10
3    13
4    12
Name: A, dtype: int64

### using `.loc[]` (label-based indexing)

In [48]:
# <DataFrame>.loc[<row_index_name>,<col_header_name>]

In [49]:
df.loc[0,"A"]

2

In [50]:
df.loc[0,["A","D"]]

A    2
D    1
Name: 0, dtype: int64

In [51]:
df.loc[0:,["A","D"]]

Unnamed: 0,A,D
0,2,1
1,7,9
2,10,8
3,13,6
4,12,14


In [52]:
df.loc[[0,4,3],["A","D"]]

Unnamed: 0,A,D
0,2,1
4,12,14
3,13,6


In [53]:
df.loc[[0,4,3],"A":]

Unnamed: 0,A,B,C,D,E,F
0,2,8,3,1,7,9
4,12,18,16,14,17,14
3,13,16,1,6,16,16


### using `.iloc[]` (position-based indexing)

In [54]:
df

Unnamed: 0,A,B,C,D,E,F
0,2,8,3,1,7,9
1,7,6,5,9,1,3
2,10,9,14,8,18,18
3,13,16,1,6,16,16
4,12,18,16,14,17,14


In [55]:
df.iloc[0,0]

2

In [56]:
# df.iloc[0,"A"]

In [57]:
df.iloc[0,2:5]

C    3
D    1
E    7
Name: 0, dtype: int64

In [58]:
df.iloc[::-1,::-1]

Unnamed: 0,F,E,D,C,B,A
4,14,17,14,16,18,12
3,16,16,6,1,16,13
2,18,18,8,14,9,10
1,3,1,9,5,6,7
0,9,7,1,3,8,2


## To Transpose DataFrame

In [59]:
df.T

Unnamed: 0,0,1,2,3,4
A,2,7,10,13,12
B,8,6,9,16,18
C,3,5,14,1,16
D,1,9,8,6,14
E,7,1,18,16,17
F,9,3,18,16,14


# Slicing for Data Frames

In [60]:
df.iloc[::-1,::-1]

Unnamed: 0,F,E,D,C,B,A
4,14,17,14,16,18,12
3,16,16,6,1,16,13
2,18,18,8,14,9,10
1,3,1,9,5,6,7
0,9,7,1,3,8,2


In [61]:
df.iloc[1:5:2,:4:3]

Unnamed: 0,A,D
1,7,9
3,13,6


PART-2

# How to see top rows and bottom rows of a DataFrame

In [62]:
import numpy as np
import pandas as pd

In [63]:
a = np.random.randint(1,50,90).reshape(30,3)
df = pd.DataFrame(a,columns=list("ABC"))
df

Unnamed: 0,A,B,C
0,17,15,39
1,26,28,30
2,11,33,25
3,12,27,29
4,24,11,32
5,33,16,42
6,32,40,42
7,18,43,4
8,49,15,25
9,31,42,16


In [64]:
df.head()

Unnamed: 0,A,B,C
0,17,15,39
1,26,28,30
2,11,33,25
3,12,27,29
4,24,11,32


In [65]:
df.head(2)

Unnamed: 0,A,B,C
0,17,15,39
1,26,28,30


In [66]:
df.tail()

Unnamed: 0,A,B,C
25,22,26,30
26,41,1,35
27,32,10,10
28,6,29,7
29,42,8,45


In [67]:
df.tail(2)

Unnamed: 0,A,B,C
28,6,29,7
29,42,8,45


# Setting options for DataFrame visualization

In [68]:
pd.get_option("display.max_rows")

60

In [69]:
pd.get_option("display.max_columns")

20

In [70]:
pd.set_option("display.max_rows",5)

In [71]:
pd.get_option("display.max_rows")

5

In [72]:
df

Unnamed: 0,A,B,C
0,17,15,39
1,26,28,30
...,...,...,...
28,6,29,7
29,42,8,45


# - Basic Operations With Data frame

In [73]:
a = np.random.randint(1,50,25).reshape(5,5)
df = pd.DataFrame(a,columns=list("ABCDE"))
df

Unnamed: 0,A,B,C,D,E
0,24,9,42,9,39
1,20,17,29,6,23
2,49,2,7,39,11
3,12,16,36,24,4
4,48,29,17,18,43


## Renaming Columns

In [74]:
df1 = df.rename(columns={"A":"Name"})
df1

Unnamed: 0,Name,B,C,D,E
0,24,9,42,9,39
1,20,17,29,6,23
2,49,2,7,39,11
3,12,16,36,24,4
4,48,29,17,18,43


In [75]:
df1 = df.rename(index={0:"ROW_1",1:"ROW_2"})
df1

Unnamed: 0,A,B,C,D,E
ROW_1,24,9,42,9,39
ROW_2,20,17,29,6,23
2,49,2,7,39,11
3,12,16,36,24,4
4,48,29,17,18,43


In [76]:
df1 = df.rename(index={0:"ROW_1",1:"ROW_2"},columns={"A":"Name_1","B":"Name_2"})
df1

Unnamed: 0,Name_1,Name_2,C,D,E
ROW_1,24,9,42,9,39
ROW_2,20,17,29,6,23
2,49,2,7,39,11
3,12,16,36,24,4
4,48,29,17,18,43


## adding columns in DataFrame

In [77]:
df

Unnamed: 0,A,B,C,D,E
0,24,9,42,9,39
1,20,17,29,6,23
2,49,2,7,39,11
3,12,16,36,24,4
4,48,29,17,18,43


In [78]:
df["Z"] = 100
df

Unnamed: 0,A,B,C,D,E,Z
0,24,9,42,9,39,100
1,20,17,29,6,23,100
2,49,2,7,39,11,100
3,12,16,36,24,4,100
4,48,29,17,18,43,100


In [79]:
# df["X"] = [100]
# df

In [80]:
df["X"] = [100,200,300,400,500]
df

Unnamed: 0,A,B,C,D,E,Z,X
0,24,9,42,9,39,100,100
1,20,17,29,6,23,100,200
2,49,2,7,39,11,100,300
3,12,16,36,24,4,100,400
4,48,29,17,18,43,100,500


## adding rows in DataFrame

In [81]:
df

Unnamed: 0,A,B,C,D,E,Z,X
0,24,9,42,9,39,100,100
1,20,17,29,6,23,100,200
2,49,2,7,39,11,100,300
3,12,16,36,24,4,100,400
4,48,29,17,18,43,100,500


In [82]:
df.loc["row_1"] = -10
df

Unnamed: 0,A,B,C,D,E,Z,X
0,24,9,42,9,39,100,100
1,20,17,29,6,23,100,200
...,...,...,...,...,...,...,...
4,48,29,17,18,43,100,500
row_1,-10,-10,-10,-10,-10,-10,-10


In [83]:
df.loc["row_1"] = [-10,-20,-30,-40,50,-60,70]
df

Unnamed: 0,A,B,C,D,E,Z,X
0,24,9,42,9,39,100,100
1,20,17,29,6,23,100,200
...,...,...,...,...,...,...,...
4,48,29,17,18,43,100,500
row_1,-10,-20,-30,-40,50,-60,70


In [84]:
df.loc["row_2"] = [-10,-20,-30,-40,50,-60,-70]
df

Unnamed: 0,A,B,C,D,E,Z,X
0,24,9,42,9,39,100,100
1,20,17,29,6,23,100,200
...,...,...,...,...,...,...,...
row_1,-10,-20,-30,-40,50,-60,70
row_2,-10,-20,-30,-40,50,-60,-70


## Filtering data from a DataFrame

In [85]:
df

Unnamed: 0,A,B,C,D,E,Z,X
0,24,9,42,9,39,100,100
1,20,17,29,6,23,100,200
...,...,...,...,...,...,...,...
row_1,-10,-20,-30,-40,50,-60,70
row_2,-10,-20,-30,-40,50,-60,-70


### using conditions for elements

In [86]:
df[df>20] # keeps the values greater than 20; every other entry is shown as NaN (Not a Number)

Unnamed: 0,A,B,C,D,E,Z,X
0,24.0,,42.0,,39.0,100.0,100.0
1,,,29.0,,23.0,100.0,200.0
...,...,...,...,...,...,...,...
row_1,,,,,50.0,,70.0
row_2,,,,,50.0,,


In [87]:
df>20 # element-wise comparison: a boolean DataFrame, True where the value is greater than 20

Unnamed: 0,A,B,C,D,E,Z,X
0,True,False,True,False,True,True,True
1,False,False,True,False,True,True,True
...,...,...,...,...,...,...,...
row_1,False,False,False,False,True,False,True
row_2,False,False,False,False,True,False,False


In [88]:
df.where(df>20)

Unnamed: 0,A,B,C,D,E,Z,X
0,24.0,,42.0,,39.0,100.0,100.0
1,,,29.0,,23.0,100.0,200.0
...,...,...,...,...,...,...,...
row_1,,,,,50.0,,70.0
row_2,,,,,50.0,,


### using filter() for Index and Column labels

In [89]:
# Select columns by label, in the order requested.
# Pass the labels as an explicit list through `items=`: the bare string "XAB"
# only worked because it was iterated one character at a time, which would
# silently stop working for any column name longer than one character.
df.filter(items=["X", "A", "B"])

Unnamed: 0,X,A,B
0,100,24,9
1,200,20,17
...,...,...,...
row_1,70,-10,-20
row_2,-70,-10,-20


In [90]:
df.filter("row")

0
1
...
row_1
row_2


In [91]:
df.filter(["A","C"])

Unnamed: 0,A,C
0,24,42
1,20,29
...,...,...
row_1,-10,-30
row_2,-10,-30


In [92]:
df.filter("row",axis=0)

Unnamed: 0,A,B,C,D,E,Z,X


In [93]:
df.filter("row",axis=1)

0
1
...
row_1
row_2


In [94]:
df = pd.DataFrame({"one":10,"two":20,"bbisyz":30,"bb1syz":30},index=["row_1","ROW_2","row_3"])
df

Unnamed: 0,one,two,bbisyz,bb1syz
row_1,10,20,30,30
ROW_2,10,20,30,30
row_3,10,20,30,30


In [95]:
df.filter(regex="bb[i1]|ROW")

Unnamed: 0,bbisyz,bb1syz
row_1,30,30
ROW_2,30,30
row_3,30,30


In [96]:
df.filter(regex="ROW",axis=0)

Unnamed: 0,one,two,bbisyz,bb1syz
ROW_2,10,20,30,30


## Difference between `size` and `count()`

In [97]:
a = np.random.randint(1,30,25).reshape(5,5)
df = pd.DataFrame(a,columns=list("ABCDE"))
df

Unnamed: 0,A,B,C,D,E
0,22,5,1,28,14
1,24,25,14,8,17
2,9,11,6,7,8
3,17,23,9,24,21
4,12,4,11,6,28


In [98]:
df.loc[3,"B"] = np.nan # to set NaN in dataFrame
df.loc[0,"E"] = np.nan
df

Unnamed: 0,A,B,C,D,E
0,22,5.0,1,28,
1,24,25.0,14,8,17.0
2,9,11.0,6,7,8.0
3,17,,9,24,21.0
4,12,4.0,11,6,28.0


In [99]:
df.size # This will count total number of Entries in DataFrame including NaN

25

In [100]:
df.count() # counts only NON-NULL values

A    5
B    4
C    5
D    5
E    4
dtype: int64

# Descriptive Statistics

In [101]:
df

Unnamed: 0,A,B,C,D,E
0,22,5.0,1,28,
1,24,25.0,14,8,17.0
2,9,11.0,6,7,8.0
3,17,,9,24,21.0
4,12,4.0,11,6,28.0


## min()

In [102]:
df.min()

A    9.0
B    4.0
C    1.0
D    6.0
E    8.0
dtype: float64

In [103]:
df.min(skipna=False)

A    9.0
B    NaN
C    1.0
D    6.0
E    NaN
dtype: float64

## max()

In [104]:
df.max()

A    24.0
B    25.0
C    14.0
D    28.0
E    28.0
dtype: float64

In [105]:
df.max(axis=1)

0    28.0
1    25.0
2    11.0
3    24.0
4    28.0
dtype: float64

## count()

In [106]:
df

Unnamed: 0,A,B,C,D,E
0,22,5.0,1,28,
1,24,25.0,14,8,17.0
2,9,11.0,6,7,8.0
3,17,,9,24,21.0
4,12,4.0,11,6,28.0


In [107]:
df.A.count()

5

In [108]:
df["A"].count()

5

## mean()

In [109]:
df.mean()

A    16.80
B    11.25
C     8.20
D    14.60
E    18.50
dtype: float64

In [110]:
df.A.mean()

16.8

## median()

In [111]:
df.median()

A    17.0
B     8.0
C     9.0
D     8.0
E    19.0
dtype: float64

## std()

In [112]:
df.std()

A     6.379655
B     9.673848
C     4.969909
D    10.526158
E     8.346656
dtype: float64

## var()

In [113]:
df.var()

A     40.700000
B     93.583333
C     24.700000
D    110.800000
E     69.666667
dtype: float64

## cov()

In [114]:
df.cov()

Unnamed: 0,A,B,C,D,E
A,40.7,34.416667,2.05,31.15,8.666667
B,34.416667,93.583333,35.0,-36.75,-40.833333
C,2.05,35.0,24.7,-33.65,14.333333
D,31.15,-36.75,-33.65,110.8,10.5
E,8.666667,-40.833333,14.333333,10.5,69.666667


## corr()

In [115]:
df.corr()

Unnamed: 0,A,B,C,D,E
A,1.0,0.483025,0.064656,0.463864,0.158345
B,0.483025,1.0,0.633018,-0.360711,-0.381246
C,0.064656,0.633018,1.0,-0.643231,0.510101
D,0.463864,-0.360711,-0.643231,1.0,0.147321
E,0.158345,-0.381246,0.510101,0.147321,1.0


## mode()

In [116]:
df

Unnamed: 0,A,B,C,D,E
0,22,5.0,1,28,
1,24,25.0,14,8,17.0
2,9,11.0,6,7,8.0
3,17,,9,24,21.0
4,12,4.0,11,6,28.0


In [117]:
df.mode()

Unnamed: 0,A,B,C,D,E
0,9,4.0,1,6,8.0
1,12,5.0,6,7,17.0
2,17,11.0,9,8,21.0
3,22,25.0,11,24,28.0
4,24,,14,28,


## cumsum()

In [118]:
df

Unnamed: 0,A,B,C,D,E
0,22,5.0,1,28,
1,24,25.0,14,8,17.0
2,9,11.0,6,7,8.0
3,17,,9,24,21.0
4,12,4.0,11,6,28.0


In [119]:
df.cumsum()

Unnamed: 0,A,B,C,D,E
0,22,5.0,1,28,
1,46,30.0,15,36,17.0
2,55,41.0,21,43,25.0
3,72,,30,67,46.0
4,84,45.0,41,73,74.0


## cummin()

In [120]:
df

Unnamed: 0,A,B,C,D,E
0,22,5.0,1,28,
1,24,25.0,14,8,17.0
2,9,11.0,6,7,8.0
3,17,,9,24,21.0
4,12,4.0,11,6,28.0


In [121]:
df.cummin()

Unnamed: 0,A,B,C,D,E
0,22,5.0,1,28,
1,22,5.0,1,8,17.0
2,9,5.0,1,7,8.0
3,9,,1,7,8.0
4,9,4.0,1,6,8.0


## cummax()

In [122]:
df.cummax()

Unnamed: 0,A,B,C,D,E
0,22,5.0,1,28,
1,24,25.0,14,28,17.0
2,24,25.0,14,28,17.0
3,24,,14,28,21.0
4,24,25.0,14,28,28.0


## cumprod()

In [123]:
df.cumprod()

Unnamed: 0,A,B,C,D,E
0,22,5.0,1,28,
1,528,125.0,14,224,17.0
2,4752,1375.0,84,1568,136.0
3,80784,,756,37632,2856.0
4,969408,5500.0,8316,225792,79968.0


## abs()

In [124]:
df

Unnamed: 0,A,B,C,D,E
0,22,5.0,1,28,
1,24,25.0,14,8,17.0
2,9,11.0,6,7,8.0
3,17,,9,24,21.0
4,12,4.0,11,6,28.0


In [125]:
df.loc[0,'A'] = -100
df

Unnamed: 0,A,B,C,D,E
0,-100,5.0,1,28,
1,24,25.0,14,8,17.0
2,9,11.0,6,7,8.0
3,17,,9,24,21.0
4,12,4.0,11,6,28.0


In [126]:
df.abs()

Unnamed: 0,A,B,C,D,E
0,100,5.0,1,28,
1,24,25.0,14,8,17.0
2,9,11.0,6,7,8.0
3,17,,9,24,21.0
4,12,4.0,11,6,28.0


## prod()

In [127]:
df.prod()

A   -4406400.0
B       5500.0
C       8316.0
D     225792.0
E      79968.0
dtype: float64

# Dataframe transformation functions

In [128]:
a = np.random.randint(1,20,12).reshape(4,3)
df = pd.DataFrame(a,columns=list("ABC"))
df


Unnamed: 0,A,B,C
0,17,13,3
1,5,9,5
2,7,16,6
3,14,12,7


## use of apply()

In [129]:
df.apply(sum)

A    43
B    50
C    21
dtype: int64

In [130]:
df.apply(sum,axis=1)

0    33
1    19
2    29
3    33
dtype: int64

## use of applymap()

In [131]:
lam = lambda x:x**2
df.applymap(lam)

Unnamed: 0,A,B,C
0,289,169,9
1,25,81,25
2,49,256,36
3,196,144,49


## apply custom function

In [132]:
df

Unnamed: 0,A,B,C
0,17,13,3
1,5,9,5
2,7,16,6
3,14,12,7


In [133]:
def dividebyTwo(x):
    """Return ``x`` floor-divided by 2.

    Floor division rounds toward negative infinity, so integer inputs
    give integer results (7 -> 3, -3 -> -2).
    """
    half = x // 2
    return half
df.applymap(dividebyTwo)

Unnamed: 0,A,B,C
0,8,6,1
1,2,4,2
2,3,8,3
3,7,6,3


In [134]:
def dividebyTwo(x):
    """Return ``x`` divided by 2 using true division (7 -> 3.5).

    Note: this reuses the name of the floor-division helper defined in the
    previous cell, so once this cell has run the name refers to this version.
    """
    half = x / 2
    return half
df.applymap(dividebyTwo)

Unnamed: 0,A,B,C
0,8.5,6.5,1.5
1,2.5,4.5,2.5
2,3.5,8.0,3.0
3,7.0,6.0,3.5


PART-3

In [135]:
a = np.random.randint(1,20,30).reshape(6,5)
df = pd.DataFrame(a,columns=list("ABCDE"))
df

Unnamed: 0,A,B,C,D,E
0,8,10,15,12,14
1,18,18,11,1,16
...,...,...,...,...,...
4,4,8,7,7,12
5,17,15,11,19,7


In [136]:
df.describe()

Unnamed: 0,A,B,C,D,E
count,6.000000,6.00,6.000000,6.00,6.000000
mean,12.166667,9.00,7.833333,9.50,12.333333
...,...,...,...,...,...
75%,17.750000,13.75,11.000000,12.75,15.500000
max,19.000000,18.00,15.000000,19.00,18.000000


# using info()

In [137]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       6 non-null      int64
 1   B       6 non-null      int64
 2   C       6 non-null      int64
 3   D       6 non-null      int64
 4   E       6 non-null      int64
dtypes: int64(5)
memory usage: 368.0 bytes


# Dropping columns/rows and reason behind this

In [138]:
df

Unnamed: 0,A,B,C,D,E
0,8,10,15,12,14
1,18,18,11,1,16
...,...,...,...,...,...
4,4,8,7,7,12
5,17,15,11,19,7


In [139]:
df.loc[1,"B"] = np.nan
df.loc[5,"E"] = np.nan
df.loc[2,"E"] = np.nan
df

Unnamed: 0,A,B,C,D,E
0,8,10.0,15,12,14.0
1,18,,11,1,16.0
...,...,...,...,...,...
4,4,8.0,7,7,12.0
5,17,15.0,11,19,


# Replacing Values

In [140]:
df.replace(np.nan, -100000)

Unnamed: 0,A,B,C,D,E
0,8,10.0,15,12,14.0
1,18,-100000.0,11,1,16.0
...,...,...,...,...,...
4,4,8.0,7,7,12.0
5,17,15.0,11,19,-100000.0


In [141]:
# df.replace(np.nan, -100000,inplace=True)

## for Whole data set

In [142]:
df.replace(np.nan, -100000)

Unnamed: 0,A,B,C,D,E
0,8,10.0,15,12,14.0
1,18,-100000.0,11,1,16.0
...,...,...,...,...,...
4,4,8.0,7,7,12.0
5,17,15.0,11,19,-100000.0


## for column/feature wise

In [143]:
df.B.replace({np.nan:-20000,"test":-10000})

0       10.0
1   -20000.0
      ...   
4        8.0
5       15.0
Name: B, Length: 6, dtype: float64

## using regex

In [144]:
df

Unnamed: 0,A,B,C,D,E
0,8,10.0,15,12,14.0
1,18,,11,1,16.0
...,...,...,...,...,...
4,4,8.0,7,7,12.0
5,17,15.0,11,19,


In [145]:
df["F"] = "PyCSR"
df["Z"] = "PyCSR2"
df

Unnamed: 0,A,B,C,D,E,F,Z
0,8,10.0,15,12,14.0,PyCSR,PyCSR2
1,18,,11,1,16.0,PyCSR,PyCSR2
...,...,...,...,...,...,...,...
4,4,8.0,7,7,12.0,PyCSR,PyCSR2
5,17,15.0,11,19,,PyCSR,PyCSR2


In [146]:
df.replace("PyCSR",-99999,regex=True)

Unnamed: 0,A,B,C,D,E,F,Z
0,8,10.0,15,12,14.0,-99999,-99999
1,18,,11,1,16.0,-99999,-99999
...,...,...,...,...,...,...,...
4,4,8.0,7,7,12.0,-99999,-99999
5,17,15.0,11,19,,-99999,-99999


In [147]:
# Replace every string that ends in a digit (word characters followed by one
# trailing digit). The pattern is a raw string so that \w and \d reach the regex
# engine unchanged; in a normal string literal they are invalid escape sequences
# and trigger a DeprecationWarning/SyntaxWarning on recent Python versions.
df.replace(r"\w+\d$",-99999,regex=True)

Unnamed: 0,A,B,C,D,E,F,Z
0,8,10.0,15,12,14.0,PyCSR,-99999
1,18,,11,1,16.0,PyCSR,-99999
...,...,...,...,...,...,...,...
4,4,8.0,7,7,12.0,PyCSR,-99999
5,17,15.0,11,19,,PyCSR,-99999


# Re-indexing

In [148]:
df1 = pd.DataFrame(np.random.randint(1,20,10).reshape(5,2),columns=list("AB"))
df2 = pd.DataFrame(np.random.randint(1,20,10).reshape(5,2),columns=list("CD"))
df1

Unnamed: 0,A,B
0,14,14
1,18,12
2,16,6
3,13,15
4,3,8


In [149]:
df2

Unnamed: 0,C,D
0,1,4
1,14,11
2,11,4
3,13,18
4,18,1


In [150]:
index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']
df = pd.DataFrame({'http_status': [200, 200, 404, 404, 301],
'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]},
index=index)
df

Unnamed: 0,http_status,response_time
Firefox,200,0.04
Chrome,200,0.02
Safari,404,0.07
IE10,404,0.08
Konqueror,301,1.0


In [151]:
new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10','Chrome']
df.reindex(new_index)

Unnamed: 0,http_status,response_time
Safari,404.0,0.07
Iceweasel,,
Comodo Dragon,,
IE10,404.0,0.08
Chrome,200.0,0.02


In [152]:
new_index = ["Opera",'Safari', 'Iceweasel', 'Comodo Dragon', 'IE10','Chrome']
df.reindex(new_index)

Unnamed: 0,http_status,response_time
Opera,,
Safari,404.0,0.07
...,...,...
IE10,404.0,0.08
Chrome,200.0,0.02


# Handling Missing Values using `fillna()`

In [153]:
a = np.random.randint(1,10,20).reshape(5,4)
df = pd.DataFrame(a,columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
0,6,7,8,1
1,6,1,5,8
2,4,2,8,6
3,6,7,4,3
4,7,4,3,1


In [154]:
df.loc[0,"D"] = np.nan
df.loc[3,"C"] = np.nan
df

Unnamed: 0,A,B,C,D
0,6,7,8.0,
1,6,1,5.0,8.0
2,4,2,8.0,6.0
3,6,7,,3.0
4,7,4,3.0,1.0


## using some values

In [155]:
df.fillna(-9000)

Unnamed: 0,A,B,C,D
0,6,7,8.0,-9000.0
1,6,1,5.0,8.0
2,4,2,8.0,6.0
3,6,7,-9000.0,3.0
4,7,4,3.0,1.0


## using statistical inference

In [156]:
df.C.fillna(df.C.mean())

0    8.0
1    5.0
2    8.0
3    6.0
4    3.0
Name: C, dtype: float64

In [157]:
df.C.fillna(df.C.median())

0    8.0
1    5.0
2    8.0
3    6.5
4    3.0
Name: C, dtype: float64

In [158]:
df.C.fillna(df.C.std())

0    8.00000
1    5.00000
2    8.00000
3    2.44949
4    3.00000
Name: C, dtype: float64

## checking missing values using various methods

In [159]:
df

Unnamed: 0,A,B,C,D
0,6,7,8.0,
1,6,1,5.0,8.0
2,4,2,8.0,6.0
3,6,7,,3.0
4,7,4,3.0,1.0


### isnull()

In [160]:
df.isnull()

Unnamed: 0,A,B,C,D
0,False,False,False,True
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,False,False,False,False


In [161]:
df.isnull().sum()

A    0
B    0
C    1
D    1
dtype: int64

### notnull()

In [162]:
df.notnull()

Unnamed: 0,A,B,C,D
0,True,True,True,False
1,True,True,True,True
2,True,True,True,True
3,True,True,False,True
4,True,True,True,True


In [163]:
df.notnull().sum()

A    5
B    5
C    4
D    4
dtype: int64

### isna()

In [164]:
df.isna()

Unnamed: 0,A,B,C,D
0,False,False,False,True
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,False,False,False,False


In [165]:
df.isna().sum()

A    0
B    0
C    1
D    1
dtype: int64

# Typecasting columns values using astype()

In [166]:
df

Unnamed: 0,A,B,C,D
0,6,7,8.0,
1,6,1,5.0,8.0
2,4,2,8.0,6.0
3,6,7,,3.0
4,7,4,3.0,1.0


In [167]:
df.A.dtype

dtype('int64')

In [168]:
df.loc[0,'A'] = "10"
df

Unnamed: 0,A,B,C,D
0,10,7,8.0,
1,6,1,5.0,8.0
2,4,2,8.0,6.0
3,6,7,,3.0
4,7,4,3.0,1.0


In [169]:
# df.A.sum()

In [170]:
df.A.dtype

dtype('O')

In [171]:
df.A = df.A.astype("int8")
df

Unnamed: 0,A,B,C,D
0,10,7,8.0,
1,6,1,5.0,8.0
2,4,2,8.0,6.0
3,6,7,,3.0
4,7,4,3.0,1.0


In [172]:
df.A.dtype

dtype('int8')

In [173]:
df.A.sum()

33

# Iterating over DataFrame

## items() (called iteritems() in pandas < 2.0)

In [174]:
df

Unnamed: 0,A,B,C,D
0,10,7,8.0,
1,6,1,5.0,8.0
2,4,2,8.0,6.0
3,6,7,,3.0
4,7,4,3.0,1.0


In [175]:
# Iterate column by column: each step yields (column label, column as a Series).
# DataFrame.items() is the supported spelling; iteritems() was deprecated in
# pandas 1.5 and removed in pandas 2.0, while items() works on both.
for c,s in df.items():
    print(c)
    print(s)

A
0    10
1     6
2     4
3     6
4     7
Name: A, dtype: int8
B
0    7
1    1
2    2
3    7
4    4
Name: B, dtype: int64
C
0    8.0
1    5.0
2    8.0
3    NaN
4    3.0
Name: C, dtype: float64
D
0    NaN
1    8.0
2    6.0
3    3.0
4    1.0
Name: D, dtype: float64


## iterrows()

In [176]:
for i,r in df.iterrows():
    print(i)
    print(r)    

0
A    10.0
B     7.0
C     8.0
D     NaN
Name: 0, dtype: float64
1
A    6.0
B    1.0
C    5.0
D    8.0
Name: 1, dtype: float64
2
A    4.0
B    2.0
C    8.0
D    6.0
Name: 2, dtype: float64
3
A    6.0
B    7.0
C    NaN
D    3.0
Name: 3, dtype: float64
4
A    7.0
B    4.0
C    3.0
D    1.0
Name: 4, dtype: float64


## itertuples()

In [177]:
for r in df.itertuples():
    print(r)

Pandas(Index=0, A=10, B=7, C=8.0, D=nan)
Pandas(Index=1, A=6, B=1, C=5.0, D=8.0)
Pandas(Index=2, A=4, B=2, C=8.0, D=6.0)
Pandas(Index=3, A=6, B=7, C=nan, D=3.0)
Pandas(Index=4, A=7, B=4, C=3.0, D=1.0)


# sort, join and merge operations on DataFrame

## - Sort row and column labels

In [178]:
a = np.random.randint(-5,20,10).reshape(5,2)
df = pd.DataFrame(a,columns=["A","B"])
df

Unnamed: 0,A,B
0,13,5
1,-5,8
2,-4,8
3,11,-4
4,-3,13


In [179]:
df.sort_index()

Unnamed: 0,A,B
0,13,5
1,-5,8
2,-4,8
3,11,-4
4,-3,13


In [180]:
df.sort_index(ascending=False)

Unnamed: 0,A,B
4,-3,13
3,11,-4
2,-4,8
1,-5,8
0,13,5


In [181]:
df.sort_index(ascending=False,axis=1)

Unnamed: 0,B,A
0,5,13
1,8,-5
2,8,-4
3,-4,11
4,13,-3


## - Sort row and column Values

In [182]:
df

Unnamed: 0,A,B
0,13,5
1,-5,8
2,-4,8
3,11,-4
4,-3,13


In [183]:
df.sort_values(by="A")

Unnamed: 0,A,B
1,-5,8
2,-4,8
4,-3,13
3,11,-4
0,13,5


In [184]:
df.sort_values(by="A",ascending=False)

Unnamed: 0,A,B
0,13,5
3,11,-4
4,-3,13
2,-4,8
1,-5,8


In [185]:
df.sort_values(by=0,ascending=False,axis=1)

Unnamed: 0,A,B
0,13,5
1,-5,8
2,-4,8
3,11,-4
4,-3,13


In [186]:
df.sort_values(by=0,ascending=True,axis=1)

Unnamed: 0,B,A
0,5,13
1,8,-5
2,8,-4
3,-4,11
4,13,-3


## - Merging  dataframes

## - Joining  dataframes

## Concatenating dataframes

# - Importing external data to DataFrame

## reading `CSV` file

## reading `XLSX` file

## reading xlsx file with specific sheet name/number

## reading `HTML` file

## Web scraping and fetching table data

# - Exporting DataFrame to external files (.csv, .html, etc.)