# Data Structures And Basic Functionality

In [1]:
import numpy as np       #importing module
import pandas as pd

In [2]:
s=pd.Series([1,4,6,np.nan,8])      # build a Series from a list; the np.nan entry makes the dtype float64
s

0    1.0
1    4.0
2    6.0
3    NaN
4    8.0
dtype: float64

In [3]:
# Build a 4-row frame from a mapping of column name -> column data.
df = pd.DataFrame(
    dict(
        A=(1.0, 5.0, 3.0, 9.0),                          # sequence of floats -> float64 column
        B=1,                                             # a scalar is broadcast to every row; int 1 gives int64 (1.0 would give float64)
        C=["say", "day", "ray", "pay"],                  # a plain list of strings
        D=pd.Categorical(["foo", "faa", "foo", "faa"]),  # categorical column; all sequences must have the same length
    )
)

In [4]:
df                                                     # display the frame built above

Unnamed: 0,A,B,C,D
0,1.0,1,say,foo
1,5.0,1,day,faa
2,3.0,1,ray,foo
3,9.0,1,pay,faa


In [5]:
df.dtypes                                              # dtype of each column, returned as a Series

A     float64
B       int64
C      object
D    category
dtype: object

In [6]:
df.index              # row labels; here the default RangeIndex 0..3

RangeIndex(start=0, stop=4, step=1)

In [7]:
df.columns            # column labels

Index(['A', 'B', 'C', 'D'], dtype='object')

We have already seen how to sort a DataFrame by its index. Now we will sort it by the values of a column.


In [8]:
df.sort_values("A")          # returns a new frame with the rows ordered by column A (ascending by default); df itself is not modified

Unnamed: 0,A,B,C,D
0,1.0,1,say,foo
2,3.0,1,ray,foo
1,5.0,1,day,faa
3,9.0,1,pay,faa


We have already seen the .loc and .iloc indexers. Now we will look at .at and .iat. Both are used to access exactly one value: .at by row and column label, .iat by integer position.

In [9]:
df.at[2,"C"]     # .at is label-based like .loc; the only difference is that it addresses exactly one cell (row label 2, column "C") and returns a single value

'ray'

In [10]:
df.iat[2,3]      # .iat is the integer-position counterpart of .at: row position 2, column position 3 (column D)

'foo'

Another function is .isin(). It returns a boolean Series that is True wherever the value is one of the given values.

In [11]:
df["C"].isin(["say","pay"])                # boolean Series: True where the value of C is one of the listed strings

0     True
1    False
2    False
3     True
Name: C, dtype: bool

In [12]:
df.loc[df["C"].isin(["say","pay"])]        # passing a boolean Series to .loc keeps only the rows where the value is True

Unnamed: 0,A,B,C,D
0,1.0,1,say,foo
3,9.0,1,pay,faa


Now we will move on to handling missing values. There are three kinds of operation here, and we will explore them with three different commands.

In [13]:
# Build a 7x3 float frame holding 0..20, then blank out cells so that the
# columns carry a staircase of missing values for the missing-data examples.
dff = pd.DataFrame(np.arange(21, dtype=np.float64).reshape(7, 3), columns=list("ABC"))
dff.iloc[3:6, 0] = np.nan                     # column A: rows 3-5
dff.iloc[4:6, 1] = np.nan                     # column B: rows 4-5
dff.iloc[5:7, 2] = np.nan                     # column C: rows 5-6 (stop is 7, the row count, instead of an out-of-range 8 that was silently clipped)

In [14]:
dff                                           # display the frame; NaN cells appear where values were blanked out

Unnamed: 0,A,B,C
0,0.0,1.0,2.0
1,3.0,4.0,5.0
2,6.0,7.0,8.0
3,,10.0,11.0
4,,,14.0
5,,,
6,18.0,19.0,


The first command is DataFrame.dropna(). It drops rows or columns that contain any NaN values, or only those that are entirely NaN.
It takes these parameters:
- axis: 0 or 1 -- whether rows (0) or columns (1) containing NaN are dropped
- how: "any" or "all" -- "any" drops a row or column that has at least one NaN; "all" drops a row or column only if all of its values are NaN
- inplace: True or False -- whether the change is applied to the original DataFrame instead of returning a new one

In [15]:
dff.dropna(axis=0)         # how defaults to "any": rows 3, 4, 5 and 6 are dropped because each has at least one NaN

Unnamed: 0,A,B,C
0,0.0,1.0,2.0
1,3.0,4.0,5.0
2,6.0,7.0,8.0


In [16]:
dff.dropna(axis=0,how="all")          # only row 5 is dropped, because it is the only row where every value is NaN

Unnamed: 0,A,B,C
0,0.0,1.0,2.0
1,3.0,4.0,5.0
2,6.0,7.0,8.0
3,,10.0,11.0
4,,,14.0
6,18.0,19.0,


In [17]:
dff.dropna(axis=1)                  # every column contains at least one NaN, so all columns are removed and only the index remains

0
1
2
3
4
5
6


The second command is DataFrame.fillna(). It replaces NaN with the given value, and like dropna() it also accepts the axis and inplace parameters.

In [18]:
dff.fillna(value=5)         # returns a new frame in which every NaN is replaced by 5

Unnamed: 0,A,B,C
0,0.0,1.0,2.0
1,3.0,4.0,5.0
2,6.0,7.0,8.0
3,5.0,10.0,11.0
4,5.0,5.0,14.0
5,5.0,5.0,5.0
6,18.0,19.0,5.0


The final command is pd.isna(). It returns a boolean object of the same shape as its input (here a DataFrame) that is True wherever a value is missing.

In [19]:
pd.isna(dff)               # boolean frame of the same shape as dff: True where the value is missing

Unnamed: 0,A,B,C
0,False,False,False
1,False,False,False
2,False,False,False
3,True,False,False
4,True,True,False
5,True,True,True
6,False,False,True


Now we will explore combining two DataFrames. There are two different methods for this:
- CONCAT
- JOIN

The concat function simply stacks two or more DataFrames along an axis, and offers several advanced options. We will explore those parameters through examples.

In [20]:
# Two small frames with identical columns, used as inputs for the concat examples.
df1 = pd.DataFrame({"letter": ["a", "b"], "number": [1, 2]})
df2 = pd.DataFrame({"letter": ["c", "d"], "number": [3, 4]})

In [21]:
df1                                                      # display df1

Unnamed: 0,letter,number
0,a,1
1,b,2


In [22]:
df2                                                      # display df2

Unnamed: 0,letter,number
0,c,3
1,d,4


In [23]:
pd.concat([df1,df2])                                      # basic concat: stacks the rows of df2 under df1 and keeps each frame's own index labels (0, 1, 0, 1)

Unnamed: 0,letter,number
0,a,1
1,b,2
0,c,3
1,d,4


In [24]:
pd.concat([df1,df2],ignore_index=True)                       # ignore_index=True discards the original labels and renumbers the rows 0..n-1

Unnamed: 0,letter,number
0,a,1
1,b,2
2,c,3
3,d,4


In [25]:
pd.concat([df1,df2],axis=1)                                  # axis=1 places the frames side by side, aligned on the row index; duplicate column names are kept

Unnamed: 0,letter,number,letter.1,number.1
0,a,1,c,3
1,b,2,d,4


In [26]:
pd.concat([df1,df2],keys=["s1","s2"])                      # keys adds an outer index level (a MultiIndex) that labels which input frame each row came from

Unnamed: 0,Unnamed: 1,letter,number
s1,0,a,1
s1,1,b,2
s2,0,c,3
s2,1,d,4


In [27]:
pd.concat([df1,df2],keys=["s1","s2"],names=["main","Sub"])     # names gives a name to each level of the resulting MultiIndex

Unnamed: 0_level_0,Unnamed: 1_level_0,letter,number
main,Sub,Unnamed: 2_level_1,Unnamed: 3_level_1
s1,0,a,1
s1,1,b,2
s2,0,c,3
s2,1,d,4


In [28]:
# A third frame with one more row and an extra "animal" column, so its shape differs from df1.
df3 = pd.DataFrame(
    {
        "letter": ["c", "d", "e"],
        "number": [3, 4, 5],
        "animal": ["cat", "dog", "duck"],
    }
)
df3

Unnamed: 0,letter,number,animal
0,c,3,cat
1,d,4,dog
2,e,5,duck


In [29]:
pd.concat([df1,df3])                              # the frames have different columns: the result has the union of the columns, and df1's rows get NaN in "animal", which df1 lacks

Unnamed: 0,letter,number,animal
0,a,1,
1,b,2,
0,c,3,cat
1,d,4,dog
2,e,5,duck


In [30]:
pd.concat([df1,df3],axis=1)                       # side-by-side concat aligned on the index: df1 has no row 2, so its columns are NaN there (and its number column becomes float)

Unnamed: 0,letter,number,letter.1,number.1,animal
0,a,1.0,c,3,cat
1,b,2.0,d,4,dog
2,,,e,5,duck


In [31]:
df5=pd.concat([df1,df3],join='inner',ignore_index=True)           # join='inner' keeps only the columns common to both frames (letter, number); ignore_index renumbers the rows

In [32]:
df5                                 # display the inner-joined result

Unnamed: 0,letter,number
0,a,1
1,b,2
2,c,3
3,d,4
4,e,5


In [33]:
# Frame whose row labels (3, 4, 5) only partly overlap the labels of df5.
df6 = pd.DataFrame(
    {"letter": list("mno"), "number": [7, 8, 9]},
    index=[3, 4, 5],
)

In [34]:
df6                                 # display df6

Unnamed: 0,letter,number
3,m,7
4,n,8
5,o,9


In [35]:
pd.concat([df5,df6],axis=1)         # with axis=1 concat aligns rows by index label (outer join by default); a label missing from one frame gives NaN in that frame's columns

Unnamed: 0,letter,number,letter.1,number.1
0,a,1.0,,
1,b,2.0,,
2,c,3.0,,
3,d,4.0,m,7.0
4,e,5.0,n,8.0
5,,,o,9.0


In [36]:
pd.concat([df5,df6],axis=1,join="inner")    # join="inner" keeps only the row labels present in both frames (3 and 4)

Unnamed: 0,letter,number,letter.1,number.1
3,d,4,m,7
4,e,5,n,8


How does join="inner" work? With axis=0, all rows are kept and only the columns common to both frames are kept. With axis=1, all columns are kept and only the rows whose index label is common to both frames are kept.

In [37]:
# Same data as df6 but labelled 5, 6, 7, so no label is shared with df5.
df7 = pd.DataFrame(
    {"letter": list("mno"), "number": [7, 8, 9]},
    index=[5, 6, 7],
)
df7

Unnamed: 0,letter,number
5,m,7
6,n,8
7,o,9


In [38]:
pd.concat([df5,df7],axis=1,join="inner")    # df5 is labelled 0-4 and df7 is labelled 5-7, so the inner join on the index has no rows

Unnamed: 0,letter,number,letter.1,number.1


The two frames have no row label in common, so the result is an empty DataFrame (the columns are present, but there are no rows).