# Introducing Pandas Objects

At the very basic level, Pandas objects can be thought of as enhanced versions of
NumPy structured arrays in which the rows and columns are identified with labels
rather than simple integer indices. 

In [1]:
import numpy as np
import pandas as pd

## The Pandas Series Object


In [2]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [3]:
# values
data.values


array([0.25, 0.5 , 0.75, 1.  ])

In [4]:
# indexes
data.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
# creating series using python List

data = pd.Series([1, 2, 3, 4],index=['a', 'b', 'c', 'd'])
data


a    1
b    2
c    3
d    4
dtype: int64

In [6]:
data['a']

1

### Series as specialized dictionary


In [7]:
# State name -> population figure. The dict's insertion order fixes the row order of the Series.
population_dict = dict(zip(
    ('California', 'Texas', 'New York', 'Florida', 'Illinois'),
    (38332521, 26448193, 19651127, 19552860, 12882135),
))
# A Series built from a dict uses the keys as its index and the values as its data.
population = pd.Series(data=population_dict)
population


California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [8]:
population['California']


38332521

In [9]:
population['California':'Illinois']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

### Constructing Series objects

In [10]:
pd.Series([2, 4, 6])

0    2
1    4
2    6
dtype: int64

In [11]:
# Data can be a scalar, which is repeated to fill the specified index
pd.Series(5, index=[100, 200, 300])


100    5
200    5
300    5
dtype: int64

In [12]:
# Data can be a dictionary; the index is taken from the dictionary keys in insertion
# order (the keys are not sorted, as the 2, 1, 3 order of the output shows)
pd.Series({2:'a', 1:'b', 3:'c'})

2    a
1    b
3    c
dtype: object

## The Pandas DataFrame Object


### DataFrame as a generalized NumPy array



In [13]:
# Per-state figures keyed by state name; both mappings list the states in the same order.
population_dict = dict([
    ('California', 38332521),
    ('Texas', 26448193),
    ('New York', 19651127),
    ('Florida', 19552860),
    ('Illinois', 12882135),
])
area_dict = dict([
    ('California', 423967),
    ('Texas', 695662),
    ('New York', 141297),
    ('Florida', 170312),
    ('Illinois', 149995),
])

# Wrap each mapping in a Series whose index is the state names.
populationSeries, areaSeries = pd.Series(population_dict), pd.Series(area_dict)

# Each Series becomes one named column; rows are matched up on the shared state index.
statesDF = pd.DataFrame(dict(population=populationSeries, area=areaSeries))
statesDF

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [14]:
# Indexes 

statesDF.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [15]:
# Columns (column Indexes)

statesDF.columns

Index(['population', 'area'], dtype='object')

In [16]:
# all Axes 

statesDF.axes

[Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object'),
 Index(['population', 'area'], dtype='object')]

### DataFrame as specialized dictionary


In [17]:
# Dictionary-style lookup by column name returns that column as a Series.
areaSeries: pd.Series = statesDF['area']
areaSeries


California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

### Constructing DataFrame objects

A Pandas DataFrame can be constructed in a variety of ways. Here we’ll give several
examples.

In [18]:
pd.DataFrame(populationSeries, columns=['population'])

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [19]:
pd.DataFrame({'population': populationSeries,'area': areaSeries})


Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [20]:
pd.DataFrame(
data = np.random.rand(3, 2),
columns = ['col 1', 'col 2'],
index = ['a', 'b', 'c'])


Unnamed: 0,col 1,col 2
a,0.21739,0.448171
b,0.442318,0.070065
c,0.554665,0.609844


## The Pandas Index Object
We have seen here that both the Series and DataFrame objects contain an explicit
index that lets you reference and modify data. This Index object is an interesting
structure in itself, and it can be thought of either as an immutable array or as an
ordered set (technically a multiset, as Index objects may contain repeated values).
Those views have some interesting consequences in the operations available on Index
objects. As a simple example, let’s construct an Index from a list of integers:

In [21]:
ind = pd.Index([2, 3, 5, 7, 11])
ind # One difference between Index objects and NumPy arrays is that indices are immutable


Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [22]:
## Index as ordered set


In [23]:
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])


In [24]:
indA.intersection(indB) # intersection; since pandas 2.0 the & operator on an Index is element-wise, not a set operation

Int64Index([3, 5, 7], dtype='int64')

In [25]:
indA.union(indB) # union; since pandas 2.0 the | operator on an Index is element-wise, not a set operation

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [26]:
indA.symmetric_difference(indB) # symmetric difference; since pandas 2.0 the ^ operator on an Index is element-wise, not a set operation

Int64Index([1, 2, 9, 11], dtype='int64')

## Data Indexing and Selection


In [27]:
# Area figures, labelled by state name.
area = pd.Series(
    data=[423967, 695662, 141297, 170312, 149995],
    index=['California', 'Texas', 'New York', 'Florida', 'Illinois'],
)

# Population figures, labelled by the same state names in the same order.
pop = pd.Series(
    data=[38332521, 26448193, 19651127, 19552860, 12882135],
    index=['California', 'Texas', 'New York', 'Florida', 'Illinois'],
)

# Combine the two Series into a two-column DataFrame: 'area' first, then 'pop'.
data = pd.DataFrame(dict(area=area, pop=pop))
data


Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [28]:
# The individual Series that make up the columns of the DataFrame can be accessed
# via dictionary-style indexing of the column name

data['area']


California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [29]:
data.area # attribute-style access; does not work for column names containing spaces or names that clash with DataFrame attributes/methods


California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [30]:
data.area is data['area']

True

In [31]:
# DataFrame has a pop() method
data.pop is data['pop']


False

In [32]:
# adding new column 

data['density'] = data['pop'] / data['area']
data



Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [33]:
#another option using .insert()
data.insert(2,'density_insert' , value = data['pop'] / data['area'])
data

Unnamed: 0,area,pop,density_insert,density
California,423967,38332521,90.413926,90.413926
Texas,695662,26448193,38.01874,38.01874
New York,141297,19651127,139.076746,139.076746
Florida,170312,19552860,114.806121,114.806121
Illinois,149995,12882135,85.883763,85.883763


### DataFrame as two-dimensional array


In [34]:
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01, 8.58837628e+01]])

In [35]:
# Transpose 
data.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967.0,695662.0,141297.0,170312.0,149995.0
pop,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
density_insert,90.41393,38.01874,139.0767,114.8061,85.88376
density,90.41393,38.01874,139.0767,114.8061,85.88376


In [36]:
data.values[0] #0 row

array([4.23967000e+05, 3.83325210e+07, 9.04139261e+01, 9.04139261e+01])

In [37]:
data.values[0][1] #0 row, #1 col

38332521.0

In [38]:
data['area'] # series!


California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

### loc and iloc indexers (ix was removed in pandas 1.0)

In [39]:
# row, col
data.iloc[0:3, 0:2]

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [40]:
data.loc[:'Illinois', :'pop']


Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [41]:
data.loc[data.density > 100, ['pop', 'density']]




Unnamed: 0,pop,density
New York,19651127,139.076746
Florida,19552860,114.806121


In [42]:
# Any of these indexing conventions may also be used to set or modify values

data.iloc[0, 2] = 90
data


Unnamed: 0,area,pop,density_insert,density
California,423967,38332521,90.0,90.413926
Texas,695662,26448193,38.01874,38.01874
New York,141297,19651127,139.076746,139.076746
Florida,170312,19552860,114.806121,114.806121
Illinois,149995,12882135,85.883763,85.883763


### Additional indexing conventions


In [43]:
data['Florida':'Illinois']


Unnamed: 0,area,pop,density_insert,density
Florida,170312,19552860,114.806121,114.806121
Illinois,149995,12882135,85.883763,85.883763


In [44]:
data[1:3]

Unnamed: 0,area,pop,density_insert,density
Texas,695662,26448193,38.01874,38.01874
New York,141297,19651127,139.076746,139.076746


In [45]:
data[data.density > 100]


Unnamed: 0,area,pop,density_insert,density
New York,141297,19651127,139.076746,139.076746
Florida,170312,19552860,114.806121,114.806121


## Operating on Data in Pandas

### UFuncs: Index Alignment

For binary operations on two Series or DataFrame objects, Pandas will align indices
in the process of performing the operation. This is very convenient when you are
working with incomplete data.


In [46]:
area = pd.Series(
    {'Alaska': 1723337,
     'Texas': 695662,
     'California': 423967
    }, name='area')

population = pd.Series(
    {'California': 38332521,
     'Texas': 26448193,
     'New York': 19651127
    }, name='population')

population / area # The result is indexed by the union of the two input indices, which can be
                  # computed explicitly with the Index set method:
                  #   area.index.union(population.index)
                  # Labels present in only one of the operands yield NaN.



Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [47]:
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
A + B


0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [48]:
# Use the method form with fill_value=0 so that a label missing from one operand is
# treated as 0 instead of producing NaN
A.add(B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

### Index alignment in DataFrame


In [49]:
A = pd.DataFrame(np.random.randint(0, 20, (2, 2)),columns=list('AB'))
A

Unnamed: 0,A,B
0,19,13
1,10,3


In [50]:
B = pd.DataFrame(np.random.randint(0, 10, (3, 3)),columns=list('BAC'))
B


Unnamed: 0,B,A,C
0,9,4,1
1,2,6,6
2,4,5,2


In [51]:
A + B

Unnamed: 0,A,B,C
0,23.0,22.0,
1,16.0,5.0,
2,,,


### Ufuncs: Operations Between DataFrame and Series


In [52]:
A = np.random.randint(10, size=(3, 4))
A

array([[5, 1, 8, 7],
       [6, 9, 7, 1],
       [9, 9, 1, 4]])

In [53]:
A - A[0]

array([[ 0,  0,  0,  0],
       [ 1,  8, -1, -6],
       [ 4,  8, -7, -3]])

In [54]:
# In Pandas, the convention similarly operates row-wise by default:
df = pd.DataFrame(A, columns=list('QRST'))
df

Unnamed: 0,Q,R,S,T
0,5,1,8,7
1,6,9,7,1
2,9,9,1,4


In [55]:
print(f'  df.iloc[0] -> \n{df.iloc[0]}'  )
df - df.iloc[0]


  df.iloc[0] -> 
Q    5
R    1
S    8
T    7
Name: 0, dtype: int64


Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,1,8,-1,-6
2,4,8,-7,-3


## Handling Missing Data


### **None**: Pythonic missing data
The first sentinel value used by Pandas is None, a Python singleton object that is often
used for missing data in Python code. Because None is a Python object, it cannot be
used in any arbitrary NumPy/Pandas array, but only in arrays with data type
'object' (i.e., arrays of Python objects):

In [56]:
vals1 = np.array([1, None, 3, 4])
vals1


array([1, None, 3, 4], dtype=object)

In [58]:
# Will Cause ERROR!
# vals1.sum()


### **NaN**: Missing numerical data
The other missing data representation, NaN (acronym for Not a Number), is different;
it is a special floating-point value recognized by all systems that use the standard
IEEE floating-point representation:

In [59]:
vals2 = np.array([1, np.nan, 3, 4])
vals2.dtype

dtype('float64')

In [61]:
# Aggregations over an array that contains NaN return NaN
vals2.sum(), vals2.min(), vals2.max()

(nan, nan, nan)

In [63]:
# Therefore special NaN safe functions
np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)

(8.0, 1.0, 4.0)

### NaN and None in Pandas
NaN and None both have their place, and Pandas is built to handle the two of them
nearly interchangeably, converting between them where appropriate:


In [64]:
pd.Series([1, np.nan, 2, None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [65]:
x = pd.Series(range(2), dtype=int)
x

0    0
1    1
dtype: int64

In [66]:
x[0] = None
x

0    NaN
1    1.0
dtype: float64

## Operating on Null Values


In [67]:
# isnull()
# Generate a Boolean mask indicating missing values

# notnull()
# Opposite of isnull()

# dropna()
# Return a filtered version of the data

# fillna()
# Return a copy of the data with missing values filled or imputed

In [68]:
data = pd.Series([1, np.nan, 'hello', None])


In [69]:
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [70]:
data[data.notnull()]

0        1
2    hello
dtype: object

In [71]:
# Returns a new Series with the null entries (NaN and None) removed
data.dropna() 


0        1
2    hello
dtype: object

In [72]:
df = pd.DataFrame([[1, np.nan, 2],[2, 3, 5],[np.nan, 4, 6]])
df


Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [73]:
# By default, dropna() will drop all rows in which any null value is present:
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [77]:
# The default is how='any', such that any row or column (depending on the axis
# keyword) containing a null value will be dropped. You can also specify how='all',
# which will only drop rows/columns that are all null values:

df[3] = np.nan # Create a new column labelled 3 in which every value is NaN
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [78]:
# Drop along axis 1 or 'column', only if ALL elements are NaN
df.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [80]:
df.iloc[1,1] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,,5,
2,,4.0,6,


In [95]:
# Keep only the rows that contain at least 2 non-null values; rows with fewer are dropped.
# (Every row of df shown above has at least 2 non-null values, so nothing is removed here.)
# 'index' is the documented name for axis 0.
df.dropna(axis='index', thresh=2)

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,,5,
2,,4.0,6,


In [97]:
# Keep only the columns that contain at least 2 non-null values; columns with fewer are dropped.
df.dropna(axis = 'columns', thresh=2)

Unnamed: 0,0,2
0,1.0,2
1,2.0,5
2,,6


In [82]:
# Filling null values
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [83]:
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [85]:
# forward-fill: propagate the last valid value forward into the following nulls
# (fillna(method='ffill') is deprecated; ffill() is the direct equivalent)
data.ffill()

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [86]:
# back-fill: propagate the next valid value backward into the preceding nulls
# (fillna(method='bfill') is deprecated; bfill() is the direct equivalent)
data.bfill()

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [87]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,,5,
2,,4.0,6,


In [88]:
# Forward-fill along each row (axis=1): a null takes the value from the column to its left.
# A null in the first column has nothing to its left and stays NaN.
# (fillna(method='ffill') is deprecated; ffill() is the direct equivalent)
df.ffill(axis=1)


Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,2.0,5.0,5.0
2,,4.0,6.0,6.0
