
## Intro to DataFrame: a DataFrame is a tabular data structure with m rows (axis=0) and n columns (axis=1), where every row and every column has a label

In [1]:
import pandas as pd
import numpy as np

In [47]:
# Constructing with no data yields a frame with no rows and no columns.
df = pd.DataFrame(data=None)
print(df)

Empty DataFrame
Columns: []
Index: []


## DataFrame from a list: the df has a single column (default label 0) and a default row index 0..n-1

In [2]:
# A flat list becomes one column labelled 0, with row labels 0..n-1.
data = list(range(1, 6))
df = pd.DataFrame(data=data)
print(df)

   0
0  1
1  2
2  3
3  4
4  5


## default columns labels are 0..n-1, Similar to row index

In [49]:
df.columns

RangeIndex(start=0, stop=1, step=1)

In [50]:
df.index

RangeIndex(start=0, stop=5, step=1)

In [51]:
df[0]

0    1
1    2
2    3
3    4
4    5
Name: 0, dtype: int64

## default column names / row index can be ambiguous and may reduce clarity

In [52]:
# A bare slice inside [] picks rows (here the first two), not columns.
df[:2]

Unnamed: 0,0
0,1
1,2


In [14]:
df[0]

0    1
1    2
2    3
3    4
4    5
Name: 0, dtype: int64

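## with the default index, the integers 0..n-1 are row labels: loc treats them as labels (slice end is included) while iloc treats them as positions (slice end is excluded), so use them carefully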

In [53]:
# Positional slice: rows at positions 0 and 1 (end excluded) of the first column.
df.iloc[:2, 0]

0    1
1    2
Name: 0, dtype: int64

In [54]:
# Label slice on column 0: rows labelled 0 through 2, end label included.
df[0].loc[0:2]

0    1
1    2
2    3
Name: 0, dtype: int64

#data frame of two columns from 2D-list

In [55]:
# Each inner [name, age] pair is one row; `columns` supplies the column labels.
data = [
    ["Alia", 10],
    ["Bob", 12],
    ["Clarke", 13],
    ["xxx", 20],
]
df = pd.DataFrame(data=data, columns=["Name", "Age"])
print(df)

     Name  Age
0    Alia   10
1     Bob   12
2  Clarke   13
3     xxx   20


In [18]:
df['Name']

0      Alia
1       Bob
2    Clarke
3       xxx
Name: Name, dtype: object

In [57]:
df.loc[1:,'Name']

1       Bob
2    Clarke
3       xxx
Name: Name, dtype: object

In [58]:
df[['Name','Age']]

Unnamed: 0,Name,Age
0,Alia,10
1,Bob,12
2,Clarke,13
3,xxx,20


In [None]:
# df['Name','Age'] passes ONE tuple key ('Name', 'Age'), so pandas looks for a single
# column with that label and raises KeyError on this frame. Selecting several
# columns needs a list of labels, i.e. df[['Name','Age']].
# The error is caught and printed so the notebook still runs top to bottom.
try:
    df['Name','Age']
except KeyError as err:
    print("KeyError:", err)

## data frame using dictionary and column names are that of keys and rows numbered from 0..m-1

In [60]:
# Dict keys become column labels; rows get the default labels 0..m-1.
data = dict(
    Name=["Tom", "Jack", "Steve", "Ricky"],
    Age=[28, 34, 29, 32],
)
df = pd.DataFrame(data=data)
print(df)

    Name  Age
0    Tom   28
1   Jack   34
2  Steve   29
3  Ricky   32


In [62]:
df.size

8

## Explicitly naming rows by giving user-defined index values

In [2]:
# Same dict-of-lists layout, but the rows are labelled with our own index values.
data = dict(
    Name=["Tom", "Jack", "Steve", "Ricky"],
    Age=[28, 34, 29, 42],
)
row_labels = ["I", "II", "III", "IV"]
df = pd.DataFrame(data=data, index=row_labels)
del row_labels  # keep the notebook namespace identical to the original cell
print(df)

      Name  Age
I      Tom   28
II    Jack   34
III  Steve   29
IV   Ricky   42


In [64]:
df.index.name='sno'

In [65]:
df

Unnamed: 0_level_0,Name,Age
sno,Unnamed: 1_level_1,Unnamed: 2_level_1
I,Tom,28
II,Jack,34
III,Steve,29
IV,Ricky,42


In [66]:
df.columns

Index(['Name', 'Age'], dtype='object')

In [71]:
df.loc[['II','IV'],'Name']

sno
II     Jack
IV    Ricky
Name: Name, dtype: object

In [72]:
# A Series holding 0..3, labelled with the same row names as the frame above.
ds1 = pd.Series(data=[0, 1, 2, 3], index=["I", "II", "III", "IV"])
ds1

I      0
II     1
III    2
IV     3
dtype: int64

In [76]:
ds1['I']


0

In [77]:
ds1.I

0

## A column in a DataFrame can be retrieved as a Series either by dict-like notation or by attribute notation
Note that attribute notation only works when the column name is a valid Python identifier (and does not clash with an existing DataFrame attribute or method)


In [78]:
# Each column comes back as a Series: one via attribute access, one via [] lookup.
print(df.Name)
print(df["Age"])

sno
I        Tom
II      Jack
III    Steve
IV     Ricky
Name: Name, dtype: object
sno
I      28
II     34
III    29
IV     42
Name: Age, dtype: int64


In [None]:
df[['Name','Age']]

Unnamed: 0,Name,Age
I,Tom,28
II,Jack,34
III,Steve,29
IV,Ricky,42


In [80]:
# Select several columns with a LIST of labels. A set is unordered, so the column
# order would be arbitrary; set indexers were deprecated in pandas 1.5 and are
# rejected by pandas 2.0.
df[['Name','Age']]

Unnamed: 0_level_0,Name,Age
sno,Unnamed: 1_level_1,Unnamed: 2_level_1
I,Tom,28
II,Jack,34
III,Steve,29
IV,Ricky,42


## Accessing a particular row or a subset of rows: by row label using loc[]
iloc: integer location, i.e. the 0-based position of the row regardless of its label

In [81]:
# Only the cell's last expression is echoed: the label lookup on the first line is
# evaluated and discarded, so the output shows just the third row (position 2).
df.loc['II']
df.iloc[2]


Name    Steve
Age        29
Name: III, dtype: object

In [None]:
# Print the row labelled 'II', then the row at position 2, one after the other.
print(df.loc['II'], df.iloc[2], sep='\n')

Name    Jack
Age       34
Name: II, dtype: object
Name    Steve
Age        29
Name: III, dtype: object


In [82]:
display(df)

Unnamed: 0_level_0,Name,Age
sno,Unnamed: 1_level_1,Unnamed: 2_level_1
I,Tom,28
II,Jack,34
III,Steve,29
IV,Ricky,42


In [33]:
# First two rows by position (an integer slice inside [] is positional).
df.iloc[:2]

Unnamed: 0_level_0,Name,Age
sno,Unnamed: 1_level_1,Unnamed: 2_level_1
I,Tom,28
II,Jack,34


## creating a new dataframe using series and then adding a new column in data frame

In [3]:
# Two Series with different labels: the frame's index is the union of both label
# sets, and a label missing from one Series shows up as NaN in that column.
d = dict(
    one=pd.Series([1, 2, 3], index=list("aec")),
    two=pd.Series([1, 2, 3, 4], index=list("abcd")),
)

df = pd.DataFrame(data=d)
print(df)

   one  two
a  1.0  1.0
b  NaN  2.0
c  3.0  3.0
d  NaN  4.0
e  2.0  NaN


In [13]:
df.values

array([[ 1.,  1.],
       [nan,  2.],
       [ 3.,  3.],
       [nan,  4.],
       [ 2., nan]])

## Adding a new column to an existing DataFrame object with column label by passing new series. New column is added at the end of DF
## while adding a new column, only values with existing row index will be added. Non-specified index value will take NaN

In [4]:
# The new Series is aligned on row labels; rows it does not mention get NaN.
print("Adding a new column by passing as Series:")
df["three"] = pd.Series(data=[10, 20, 30], index=list("abc"))
print(df)


Adding a new column by passing as Series:
   one  two  three
a  1.0  1.0   10.0
b  NaN  2.0   20.0
c  3.0  3.0   30.0
d  NaN  4.0    NaN
e  2.0  NaN    NaN


## Adding a new column using a calculation on existing columns of the DataFrame

In [89]:
# Element-wise sum of two columns; a NaN in either operand gives NaN in the result.
df["four"] = df["one"].add(df["three"])

print(df)

   one  two  three  four
a  1.0  1.0   10.0  11.0
b  NaN  2.0   20.0   NaN
c  3.0  3.0   30.0  33.0
d  NaN  4.0    NaN   NaN
e  2.0  NaN    NaN   NaN


## Guess what?

In [90]:
# Assignment aligns on df's existing row labels: 'f' is not a row of df so its
# value is dropped, and rows absent from the Series receive NaN.
df["five"] = pd.Series({"e": 10, "f": 20, "c": 30})
print(df)

   one  two  three  four  five
a  1.0  1.0   10.0  11.0   NaN
b  NaN  2.0   20.0   NaN   NaN
c  3.0  3.0   30.0  33.0  30.0
d  NaN  4.0    NaN   NaN   NaN
e  2.0  NaN    NaN   NaN  10.0


In [91]:
# Assigning a Series to an empty frame: the frame takes over the Series' labels.
s1 = pd.Series({"e": 10, "f": 20, "c": 30})
df1 = pd.DataFrame()
df1["first"] = s1
df1

Unnamed: 0,first
e,10
f,20
c,30


In [92]:
# df1 already has rows e, f, c: label 'd' is ignored and row 'c' gets NaN.
s2 = pd.Series({"e": 100, "f": 200, "d": 300})
df1["second"] = s2
df1

Unnamed: 0,first,second
e,10,100.0
f,20,200.0
c,30,


## overwriting existing column values

In [39]:
df

Unnamed: 0,one,two,three,four
a,1.0,1.0,10.0,11.0
b,,2.0,20.0,
c,3.0,3.0,30.0,33.0
d,,4.0,,
e,2.0,,,


In [93]:
# Assigning a scalar replaces the whole column: the value is repeated for every row.
df['five']=0
print(df)


   one  two  three  four  five
a  1.0  1.0   10.0  11.0     0
b  NaN  2.0   20.0   NaN     0
c  3.0  3.0   30.0  33.0     0
d  NaN  4.0    NaN   NaN     0
e  2.0  NaN    NaN   NaN     0


#multiple ways of setting values of a column

In [58]:
df['seven']=7

In [94]:
# Boolean column from an element-wise comparison; NaN compares as False.
df["six"] = df["one"].gt(1)
print(df)

   one  two  three  four  five    six
a  1.0  1.0   10.0  11.0     0  False
b  NaN  2.0   20.0   NaN     0  False
c  3.0  3.0   30.0  33.0     0   True
d  NaN  4.0    NaN   NaN     0  False
e  2.0  NaN    NaN   NaN     0   True


## insert a new column at the specified location

In [95]:
# Place the new column at position 1 (second from the left); df is changed in place.
df.insert(loc=1, column="NEW", value=[11, 22, 33, 44, 55])

In [96]:
df

Unnamed: 0,one,NEW,two,three,four,five,six
a,1.0,11,1.0,10.0,11.0,0,False
b,,22,2.0,20.0,,0,False
c,3.0,33,3.0,30.0,33.0,0,True
d,,44,4.0,,,0,False
e,2.0,55,,,,0,True


## Three ways to remove a column 
1.  del statement: in-place change, removes a single column
2.   drop method: needs axis (or index=/columns=) to choose rows or columns; returns a new DataFrame, no in-place change by default
3.   pop(): removes the named column in place and returns it as a Series


In [97]:
# `del` removes the column from df itself (in place); nothing is returned.
del df['five']
print(df)

   one  NEW  two  three  four    six
a  1.0   11  1.0   10.0  11.0  False
b  NaN   22  2.0   20.0   NaN  False
c  3.0   33  3.0   30.0  33.0   True
d  NaN   44  4.0    NaN   NaN  False
e  2.0   55  NaN    NaN   NaN   True


In [None]:
# drop() has to be told what to remove (labels, index= or columns=). Calling it
# with no arguments raises ValueError; the error is caught and printed so the
# notebook still runs top to bottom.
try:
    df.drop()
except ValueError as err:
    print("ValueError:", err)

In [100]:
# drop() hands back a new frame without column 'one'; df itself is unchanged,
# which the second displayed table confirms.
d1 = df.drop(columns="one")
display(d1, df)

Unnamed: 0,NEW,two,three,four,six
a,11,1.0,10.0,11.0,False
b,22,2.0,20.0,,False
c,33,3.0,30.0,33.0,True
d,44,4.0,,,False
e,55,,,,True


Unnamed: 0,one,NEW,two,three,four,six
a,1.0,11,1.0,10.0,11.0,False
b,,22,2.0,20.0,,False
c,3.0,33,3.0,30.0,33.0,True
d,,44,4.0,,,False
e,2.0,55,,,,True


#to remove rows using drop()

In [101]:
# Remove rows by label; the result is a new frame stored in d1.
d1 = df.drop(index=["a", "b"])
d1

Unnamed: 0,one,NEW,two,three,four,six
c,3.0,33,3.0,30.0,33.0,True
d,,44,4.0,,,False
e,2.0,55,,,,True


#by default axis=0

In [102]:
# axis defaults to 0, so the labels refer to rows. The returned frame is not
# assigned to anything, so df below still contains rows 'a' and 'b'.
df.drop(labels=["a", "b"])
df

Unnamed: 0,one,NEW,two,three,four,six
a,1.0,11,1.0,10.0,11.0,False
b,,22,2.0,20.0,,False
c,3.0,33,3.0,30.0,33.0,True
d,,44,4.0,,,False
e,2.0,55,,,,True


In [72]:
df

Unnamed: 0,one,two,three,four,seven,six
a,1.0,1.0,10.0,11.0,7,False
b,,2.0,20.0,,7,False
c,3.0,3.0,30.0,33.0,7,True
d,,4.0,,,7,False
e,2.0,,,,7,True


#deleting a column using pop()

In [103]:
# pop() removes column 'two' from df in place; the popped column is its return
# value, which this cell does not keep.
df.pop('two')
print(df)

   one  NEW  three  four    six
a  1.0   11   10.0  11.0  False
b  NaN   22   20.0   NaN  False
c  3.0   33   30.0  33.0   True
d  NaN   44    NaN   NaN  False
e  2.0   55    NaN   NaN   True


## accessing a single row using loc[] gives output as series with name of series as name of row

In [None]:
# Use lists for both row labels and column labels. A set is unordered (and is
# rejected as an indexer by pandas 2.0), and a tuple of column names is treated
# as a single key rather than as two separate columns.
df.loc[['a','e'],['seven','six']]

#difference between loc and iloc: loc takes labels and iloc takes integer positions, for both rows and columns

In [78]:
df

Unnamed: 0,one,three,four,seven,six
a,1.0,10.0,11.0,7,False
b,,20.0,,7,False
c,3.0,30.0,33.0,7,True
d,,,,7,False
e,2.0,,,7,True


In [None]:
print(df.loc[['a','d'],['one','three']])
df

In [80]:
df.iloc[2:,:2]

Unnamed: 0,one,three
c,3.0,30.0
d,,
e,2.0,


In [None]:
df.loc[['d','e'],['one','three']]

Unnamed: 0,one,three
d,,
e,2.0,


In [105]:
df

Unnamed: 0,one,NEW,three,four,six
a,1.0,11,10.0,11.0,False
b,,22,20.0,,False
c,3.0,33,30.0,33.0,True
d,,44,,,False
e,2.0,55,,,True


In [81]:
# Row selection by a LIST of labels; set indexers are unordered and are rejected
# by pandas 2.0.
print(df.loc[['a','b']])
print("another way")
# Row selection by integer position (the rows at positions 2 and 3).
print(df.iloc[[2,3]])

   one  three  four  seven    six
a  1.0   10.0  11.0      7  False
b  NaN   20.0   NaN      7  False
another way
   one  three  four  seven    six
c  3.0   30.0  33.0      7   True
d  NaN    NaN   NaN      7  False


In [None]:
df.loc[['a','b']]


Unnamed: 0,one,three,four,seven,six
a,1.0,10.0,11.0,7,False
b,,20.0,,7,False


## transpose a data frame

In [106]:
df

Unnamed: 0,one,NEW,three,four,six
a,1.0,11,10.0,11.0,False
b,,22,20.0,,False
c,3.0,33,30.0,33.0,True
d,,44,,,False
e,2.0,55,,,True


In [107]:
# Swap rows and columns; this yields a new frame, df itself is unchanged.
df.transpose()

Unnamed: 0,a,b,c,d,e
one,1.0,,3.0,,2.0
NEW,11,22,33,44,55
three,10.0,20.0,30.0,,
four,11.0,,33.0,,
six,False,False,True,False,True


In [108]:
df

Unnamed: 0,one,NEW,three,four,six
a,1.0,11,10.0,11.0,False
b,,22,20.0,,False
c,3.0,33,30.0,33.0,True
d,,44,,,False
e,2.0,55,,,True


#more examples on creating dataframe thru dictionaries where keys in dictionaries are not matched

In [3]:
# The Series carry different label sets; the frame uses the union of the labels
# and fills the gaps with NaN.
d = {
    "one": pd.Series(data=[1, 2, 3], index=["a", "e", "c"]),
    "two": pd.Series(data=[1, 2, 3, 4], index=["a", "b", "c", "d"]),
}

df1 = pd.DataFrame(data=d)
print(df1)

   one  two
a  1.0  1.0
b  NaN  2.0
c  3.0  3.0
d  NaN  4.0
e  2.0  NaN


#if nested dictionary is passed then inner key is used as row name

In [109]:
# Outer keys become columns, inner keys become row labels; a (row, column) pair
# that is missing from the nested dict is NaN.
pop = {
    "Nevada": {2001: 2.4, 2007: 2.9},
    "Ohio": {2002: 1.5, 2007: 1.7, 2000: 3.6},
}
df = pd.DataFrame(data=pop)
print(df)


      Nevada  Ohio
2001     2.4   NaN
2007     2.9   1.7
2002     NaN   1.5
2000     NaN   3.6


#naming index and columns object

In [110]:
# Give the row axis and the column axis their own names; both appear in the
# printed header.
df.index.name = "year"
df.columns.name = "state"
print(df)

state  Nevada  Ohio
year               
2001      2.4   NaN
2007      2.9   1.7
2002      NaN   1.5
2000      NaN   3.6


In [111]:
df

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,
2007,2.9,1.7
2002,,1.5
2000,,3.6


#As with Series, the values attribute returns the data contained in the DataFrame but as two-dimensional ndarray

In [90]:
df.values

array([[2.4, nan],
       [2.9, 1.7],
       [nan, 1.5],
       [nan, 3.6]])

In [88]:
df.values.dtype

dtype('float64')

In [112]:
df

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,
2007,2.9,1.7
2002,,1.5
2000,,3.6


#If the DataFrame’s columns are different dtypes, the dtype of the values array will be chosen to accommodate all of the columns

In [113]:
df['description']='good'


In [114]:
df

state,Nevada,Ohio,description
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2001,2.4,,good
2007,2.9,1.7,good
2002,,1.5,good
2000,,3.6,good


In [115]:
df.columns

Index(['Nevada', 'Ohio', 'description'], dtype='object', name='state')

In [97]:
df['description']

2001    good
2007    good
2002    good
2000    good
Name: description, dtype: object

In [116]:
# Third column, selected by position instead of by name.
df.iloc[:, 2]

year
2001    good
2007    good
2002    good
2000    good
Name: description, dtype: object

In [94]:
df.values


array([[2.4, nan, 'good'],
       [2.9, 1.7, 'good'],
       [nan, 1.5, 'good'],
       [nan, 3.6, 'good']], dtype=object)

#accessing all index labels and the label at a specific index position

In [5]:
df.index


Int64Index([2001, 2007, 2002, 2000], dtype='int64')

In [117]:
df.index[1]

2007

In [6]:
df.columns

Index(['Nevada', 'Ohio'], dtype='object')

#changing columns labels needs same size list as in the dataframe

In [102]:
df

Unnamed: 0,Nevada,Ohio,description
2001,2.4,,good
2007,2.9,1.7,good
2002,,1.5,good
2000,,3.6,good


In [118]:
df.columns=['X','Y','Z']

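#renaming a single column by assigning to an element of df.columns is not allowed (an Index is immutable); use df.rename(columns={...}) or assign a whole new list of labels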

In [121]:
# An Index cannot be modified element by element, so this assignment raises
# TypeError. The error is the point of the cell; it is caught and printed so the
# notebook still runs top to bottom.
try:
    df.columns[1]='zz'
except TypeError as err:
    print("TypeError:", err)

TypeError: Index does not support mutable operations

In [122]:
df

Unnamed: 0_level_0,X,Y,Z
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2001,2.4,,good
2007,2.9,1.7,good
2002,,1.5,good
2000,,3.6,good


## index object is immutable: individual labels cannot be changed, but the whole index can be replaced

In [123]:
df.index

Int64Index([2001, 2007, 2002, 2000], dtype='int64', name='year')

In [124]:
df.index=[1900,2000,2010,2020]
df

Unnamed: 0,X,Y,Z
1900,2.4,,good
2000,2.9,1.7,good
2010,,1.5,good
2020,,3.6,good


#relabelling a particular row index is not allowed: immutable

In [None]:
# Same restriction for the row index: assigning to one position of an Index
# raises TypeError. Caught and printed so the notebook still runs top to bottom.
try:
    df.index[1]=2010
except TypeError as err:
    print("TypeError:", err)


# creating index object explicitly and use in many data structures

In [9]:
# A standalone Index holding 0, 1, 2, kept in a variable for later use.
labels = pd.Index(data=np.arange(0, 3))
labels

Int64Index([0, 1, 2], dtype='int64')

# reusing the created index object

---



In [10]:
# Three [sno, name, age] records, labelled with the previously built `labels` Index.
l = [
    [1, "a", 30],
    [2, "b", 25],
    [4, "c", 22],
]
df1 = pd.DataFrame(data=l, index=labels, columns=["sno", "name", "age"])
print(df1)

   sno name  age
0    1    a   30
1    2    b   25
2    4    c   22


In [12]:
df1.index is labels

True

## Access df contents (records/rows) using index values or row names or index labels

In [7]:
df1

Unnamed: 0,sno,name,age
0,1,a,30
1,2,b,25
2,4,c,22


In [8]:
# The slice 2:4 runs past the last row; only the rows that exist are returned.
print(df1[2:4])
# Number of rows.
print(df1.shape[0])
print("total cols=", df1.shape[1])
print("total rows=", df1.shape[0])
print(df1.count(axis=1))  # non-missing values in each row
print(df1.count(axis=0))  # non-missing values in each column


   sno name  age
2    4    c   22
3
total cols= 3
total rows= 3
0    3
1    3
2    3
dtype: int64
sno     3
name    3
age     3
dtype: int64


#accessing shape of the data frame

In [17]:
df1

Unnamed: 0,one,two
a,1.0,1.0
b,,2.0
c,3.0,3.0
d,,4.0
e,2.0,


In [9]:
# Row count, column count and the total number of cells.
row, col = len(df1.index), len(df1.columns)
print(row, col, df1.size)

3 3 9


# Problem 1: create a data frame from a nested data dictionary and add a new column 'new' having sum of any two columns in the data frame. Check for data compatibility for adding/concatenation. also rename the  columns labels

In [16]:
df

Unnamed: 0,one,two,three
a,1.0,1.0,10.0
b,,2.0,20.0
c,3.0,3.0,30.0
d,,4.0,
e,2.0,,


In [15]:
df.columns

Index(['one', 'two', 'three'], dtype='object')

In [21]:
df['X']

1900    2.4
2000    2.9
2010    NaN
2020    NaN
Name: X, dtype: float64

In [None]:
str(df.loc[2000,'X'])

## adding a new row

In [18]:
df.loc[2006]=[2,None,4]
df

Unnamed: 0,one,two,three
a,1.0,1.0,10.0
b,,2.0,20.0
c,3.0,3.0,30.0
d,,4.0,
e,2.0,,
2006,2.0,,4.0


## replacing an existing row

In [19]:
df.iloc[3]=[34,45,56]
df

Unnamed: 0,one,two,three
a,1.0,1.0,10.0
b,,2.0,20.0
c,3.0,3.0,30.0
d,34.0,45.0,56.0
e,2.0,,
2006,2.0,,4.0


In [20]:
df.columns

Index(['one', 'two', 'three'], dtype='object')

In [21]:
# DataFrame.append was removed in pandas 2.0. Turn the named Series into a
# one-row frame (its name 'g' becomes the row label) and concatenate it below df.
# A new frame is returned; df itself is not modified.
pd.concat([df, pd.Series([1,2,3],index=df.columns,name='g').to_frame().T])

Unnamed: 0,one,two,three
a,1.0,1.0,10.0
b,,2.0,20.0
c,3.0,3.0,30.0
d,34.0,45.0,56.0
e,2.0,,
2006,2.0,,4.0
g,1.0,2.0,3.0


## without an explicit index, the Series gets default labels 0..k-1; these do not match df's column names, so they become k new columns (holding the Series values in the new row) while the existing columns are NaN in that row

In [27]:
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
# The Series' default labels 0, 1, 2 are not columns of df, so the result gains
# three new columns and the new row 2091 is NaN in df's original columns.
pd.concat([df, pd.Series([1,2,3],name=2091).to_frame().T])

Unnamed: 0,X,Y,Z,0,1,2
1900,2.4,,good,,,
2000,2.9,1.7,good,,,
2010,,1.5,good,,,
2020,34.0,45.0,56,,,
2091,,,,1.0,2.0,3.0


In [25]:
# DataFrame.append was removed in pandas 2.0. Reindexing with one extra label
# returns a new frame whose added row 'h' is all NaN; df itself is not modified.
# (reindex needs df's row labels to be unique.)
df.reindex([*df.index, 'h'])

Unnamed: 0,one,two,three
a,1.0,1.0,10.0
b,,2.0,20.0
c,3.0,3.0,30.0
d,34.0,45.0,56.0
e,2.0,,
2006,2.0,,4.0
h,,,


In [14]:
df

Unnamed: 0,one,two,three
a,1.0,1.0,10.0
b,,2.0,20.0
c,3.0,3.0,30.0
d,,4.0,
e,2.0,,


## boolean mask of valid (non-NaN) cells; to extract rows having at least one valid value use df[df.notna().any(axis=1)] or df.dropna(how='all')

In [15]:
# True where a cell holds a value, False where it is NaN.
df.notna()

Unnamed: 0,one,two,three
a,True,True,True
b,False,True,True
c,True,True,True
d,False,True,False
e,True,False,False


In [32]:
df['stud_avg']=df.mean(axis=1)

In [33]:
df

Unnamed: 0,one,two,three,stud_avg
a,1.0,1.0,10.0,4.0
b,,2.0,20.0,11.0
c,3.0,3.0,30.0,12.0
d,,4.0,,4.0
e,2.0,,,2.0


In [18]:
# Masking with a same-shaped boolean frame keeps cells where the mask is True and
# puts NaN elsewhere; with this mask the result looks exactly like df.
df.where(df.notna())

Unnamed: 0,one,two,three
a,1.0,1.0,10.0
b,,2.0,20.0
c,3.0,3.0,30.0
d,,4.0,
e,2.0,,


In [26]:
df.sum(axis=1)

a    12.0
b    22.0
c    36.0
d     4.0
e     2.0
dtype: float64

### what is being checked here?

In [23]:
# Keep only the rows in which 'one', 'two' and 'three' are all present (non-NaN).
df.dropna(subset=["one", "two", "three"])

Unnamed: 0,one,two,three
a,1.0,1.0,10.0
c,3.0,3.0,30.0


## find sum of values of rows with all valid values

In [27]:
# Drop every row containing a NaN, then add up each remaining row.
df.dropna().sum(axis=1)

a    12.0
c    36.0
dtype: float64