# Pandas DataFrame

In [1]:
import numpy as np
import pandas as pd

#### Creating DataFrame by using two Series...

In [2]:
population_dict = {
    'AP': 123,
    'UP': 234,
    'TS': 345,
    'KA': 456
}
series1 = pd.Series(population_dict)
series1

AP    123
UP    234
TS    345
KA    456
dtype: int64

In [5]:
area_dict = {
    'AP': 789123,
    'UP': 456345,
    'TS': 123567,
    'KA': 369789
}
series2 = pd.Series(area_dict)
series2

AP    789123
UP    456345
TS    123567
KA    369789
dtype: int64

In [17]:
data1 = pd.DataFrame({'population':series1,'area':series2})
data1

Unnamed: 0,population,area
AP,123,789123
UP,234,456345
TS,345,123567
KA,456,369789


In [12]:
data1.index

Index(['AP', 'UP', 'TS', 'KA'], dtype='object')

In [13]:
data1.columns

Index(['population', 'area'], dtype='object')

In [14]:
data1.area

AP    789123
UP    456345
TS    123567
KA    369789
Name: area, dtype: int64

#### Creating DataFrame by using single Series Object

In [27]:
data2 = pd.DataFrame(series1, columns =['population'])

In [28]:
data2

Unnamed: 0,population
AP,123
UP,234
TS,345
KA,456


#### Creating DataFraame by using list

In [29]:
data3 = pd.DataFrame({'a':i, 'b': 2*i} for i in range(3))
data3

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [33]:
data4 = pd.DataFrame({'a':1,'b':2},{'b':3,'c':5})
data4

Unnamed: 0,a,b
b,1,2
c,1,2


In [34]:
data4 = pd.DataFrame([{'a':1,'b':2},{'b':3,'c':5}])
data4

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,5.0


##### Creating DataFrame by using the two-dimensional numpy array

In [39]:
randval = np.random.rand(3,2)
randval

array([[0.35559199, 0.09857403],
       [0.33124932, 0.91946751],
       [0.18774559, 0.46393077]])

In [38]:
data5 = pd.DataFrame(np.random.rand(3,2),columns = ['foo','bar'] , index = ['a','b','c'])
data5

Unnamed: 0,foo,bar
a,0.807798,0.984278
b,0.919307,0.199967
c,0.632158,0.264938


##### Index Object and index is immutable array

In [53]:
data5.index

Index(['a', 'b', 'c'], dtype='object')

In [51]:
ind1 = data5.index

In [46]:
ind = pd.Index([2,3,4])

In [52]:
ind1[1]

'b'

In [47]:
ind[1]

3

In [48]:
ind[::2]

Int64Index([2, 4], dtype='int64')

In [49]:
ind[1] = 5

TypeError: Index does not support mutable operations

##### index as ordered Set

In [54]:
indA = pd.Index([1,3,5,7,9])
indB = pd.Index([2,3,5,7,11])

In [55]:
indA

Int64Index([1, 3, 5, 7, 9], dtype='int64')

In [56]:
indB

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [58]:
validindex = indA & indB
validindex

  validindex = indA & indB


Int64Index([3, 5, 7], dtype='int64')

In [60]:
indA | indB

  indA | indB


Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [61]:
indA ^ indB

  indA ^ indB


Int64Index([1, 2, 9, 11], dtype='int64')

In [62]:
data1

Unnamed: 0,population,area
AP,123,789123
UP,234,456345
TS,345,123567
KA,456,369789


In [63]:
data1.area

AP    789123
UP    456345
TS    123567
KA    369789
Name: area, dtype: int64

In [64]:
data1.area is data1['area']

True

#### To add a new column 

In [67]:
data1['density'] =  data1['area']/data1['population'] 

In [68]:
data1

Unnamed: 0,population,area,density
AP,123,789123,6415.634146
UP,234,456345,1950.192308
TS,345,123567,358.165217
KA,456,369789,810.940789


In [70]:
data1.items

<bound method DataFrame.items of     population    area      density
AP         123  789123  6415.634146
UP         234  456345  1950.192308
TS         345  123567   358.165217
KA         456  369789   810.940789>

In [72]:
data1.values

array([[1.23000000e+02, 7.89123000e+05, 6.41563415e+03],
       [2.34000000e+02, 4.56345000e+05, 1.95019231e+03],
       [3.45000000e+02, 1.23567000e+05, 3.58165217e+02],
       [4.56000000e+02, 3.69789000e+05, 8.10940789e+02]])

##### To transver the data 

In [74]:
data1.T

Unnamed: 0,AP,UP,TS,KA
population,123.0,234.0,345.0,456.0
area,789123.0,456345.0,123567.0,369789.0
density,6415.634146,1950.192308,358.165217,810.940789


In [75]:
data1.values[0]

array([1.23000000e+02, 7.89123000e+05, 6.41563415e+03])

In [80]:
data1.iloc[:3,:2]

Unnamed: 0,population,area
AP,123,789123
UP,234,456345
TS,345,123567


In [81]:
data1.loc[:'TS',:'area']

Unnamed: 0,population,area
AP,123,789123
UP,234,456345
TS,345,123567


In [83]:
# data1.ix[:3 , :'area']    not in use ....{Combination of loc and iloc}

In [86]:
data1.loc[data1.density >100, ['population','density']]

Unnamed: 0,population,density
AP,123,6415.634146
UP,234,1950.192308
TS,345,358.165217
KA,456,810.940789


In [87]:
data1.loc[data1.density >100]

Unnamed: 0,population,area,density
AP,123,789123,6415.634146
UP,234,456345,1950.192308
TS,345,123567,358.165217
KA,456,369789,810.940789


In [90]:
data1.loc[['UP','KA']]

Unnamed: 0,population,area,density
UP,234,456345,1950.192308
KA,456,369789,810.940789


In [94]:
data1

Unnamed: 0,population,area,density
AP,123,789123,6415.634146
UP,234,456345,1950.192308
TS,345,123567,358.165217
KA,456,369789,810.940789


#### Operating with nulls

In [96]:
data1.isnull()

Unnamed: 0,population,area,density
AP,False,False,False
UP,False,False,False
TS,False,False,False
KA,False,False,False


In [101]:
data1[data1.notnull()]

Unnamed: 0,population,area,density
AP,123,789123,6415.634146
UP,234,456345,1950.192308
TS,345,123567,358.165217
KA,456,369789,810.940789


In [105]:
data1['population'].isnull()#.sum()

AP    False
UP    False
TS    False
KA    False
Name: population, dtype: bool

#### Creating dataFrame by manually

In [124]:
df1 = pd.DataFrame([[1,np.nan,5],[2,3,np.nan],[5,np.nan,7]])

In [125]:
df1

Unnamed: 0,0,1,2
0,1,,5.0
1,2,3.0,
2,5,,7.0


In [126]:
df1.isnull()

Unnamed: 0,0,1,2
0,False,True,False
1,False,False,True
2,False,True,False


In [127]:
df1[0].isnull()

0    False
1    False
2    False
Name: 0, dtype: bool

In [128]:
df1.notnull()

Unnamed: 0,0,1,2
0,True,False,True
1,True,True,False
2,True,False,True


In [129]:
df1[1].notnull()

0    False
1     True
2    False
Name: 1, dtype: bool

In [130]:
df1

Unnamed: 0,0,1,2
0,1,,5.0
1,2,3.0,
2,5,,7.0


In [131]:
df1.dropna(axis = 1) #df1.dropna(axis = 'columns')  # column wise deleting the null values ... 

Unnamed: 0,0
0,1
1,2
2,5


In [132]:
df1

Unnamed: 0,0,1,2
0,1,,5.0
1,2,3.0,
2,5,,7.0


In [144]:
df1[df1[2].notnull()]    # filtering not null values from 2 colummn ...

Unnamed: 0,0,1,2
0,1,,5.0
2,5,,7.0
