# Introducing Pandas Objects

At the most basic level, Pandas objects can be thought of as enhanced versions of
NumPy structured arrays in which the rows and columns are identified with labels
rather than simple integer indices.

In [112]:
import numpy as np
import pandas as pd

## The Pandas Series Object


In [113]:
# A Series built from a bare list of values gets the default integer index.
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [114]:
# .values exposes the Series data as a NumPy array
data.values


array([0.25, 0.5 , 0.75, 1.  ])

In [115]:
# .index holds the labels; with no explicit index it is a RangeIndex
data.index

RangeIndex(start=0, stop=4, step=1)

In [116]:
# Build a Series from a Python list, supplying explicit string labels as the index

data = pd.Series(
    data=[1, 2, 3, 4],
    index=list('abcd'),
)
data


a    1
b    2
c    3
d    4
dtype: int64

In [117]:
# Look up a single value by its index label
data['a']

1

### Series as specialized dictionary


In [118]:
# State name -> population. Passing the dict to pd.Series turns its keys into
# the index and its values into the data.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
population = pd.Series(data=population_dict)
population


California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [119]:
# Dictionary-style access: look up a value by its key (index label)
population['California']


38332521

In [120]:
# Unlike a dictionary, a Series also supports slicing by label; the end label is included
population['California':'Illinois']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

### Constructing Series objects

In [121]:
# Data can be a list, in which case the index defaults to an integer sequence
pd.Series([2, 4, 6])

0    2
1    4
2    6
dtype: int64

In [122]:
# A scalar is broadcast: it is repeated once for every label in the given index
pd.Series(data=5, index=[100, 200, 300])


100    5
200    5
300    5
dtype: int64

In [123]:
# Data can be a dictionary, in which case the index defaults to the dictionary keys
# in their insertion order (they are not sorted, as the output below shows)
pd.Series({2:'a', 1:'b', 3:'c'})

2    a
1    b
3    c
dtype: object

## The Pandas DataFrame Object


### DataFrame as a generalized NumPy array



In [124]:
# Population and land area for the same five states, keyed by state name.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}

# Wrap each dict in a Series; the state names become the index of both.
populationSeries = pd.Series(data=population_dict)
areaSeries = pd.Series(data=area_dict)

# A DataFrame built from a dict of Series: one column per dict key.
statesDF = pd.DataFrame(
    data={
        'population': populationSeries,
        'area': areaSeries,
    }
)
statesDF

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [125]:
# Row labels of the DataFrame (the index shared by both column Series)

statesDF.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [126]:
# Column labels; these are themselves stored as an Index object

statesDF.columns

Index(['population', 'area'], dtype='object')

In [127]:
# Both axes as a list: [row index, column index]

statesDF.axes

[Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object'),
 Index(['population', 'area'], dtype='object')]

### DataFrame as specialized dictionary


In [128]:
# Dictionary-style access: a column name maps to the Series holding that column.
areaSeries: pd.Series = statesDF['area']
areaSeries


California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

### Constructing DataFrame objects

A Pandas DataFrame can be constructed in a variety of ways. Here we’ll give several
examples.

In [129]:
# From a single Series: the result is a one-column DataFrame with the given column name
pd.DataFrame(data=populationSeries, columns=['population'])

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [130]:
# From a dictionary of Series: each key becomes a column name
pd.DataFrame(
    data={
        'population': populationSeries,
        'area': areaSeries,
    }
)


Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [131]:
# From a two-dimensional NumPy array, with explicit column and row labels.
# A seeded generator is used so the displayed values are the same on every
# Restart & Run All; re-run the cell to refresh any previously stored output.
pd.DataFrame(
    data=np.random.default_rng(seed=0).random((3, 2)),
    columns=['col 1', 'col 2'],
    index=['a', 'b', 'c'],
)


Unnamed: 0,col 1,col 2
a,0.659068,0.04657
b,0.432974,0.979673
c,0.519916,0.321273


## The Pandas Index Object
We have seen here that both the Series and DataFrame objects contain an explicit
index that lets you reference and modify data. This Index object is an interesting
structure in itself, and it can be thought of either as an immutable array or as an
ordered set (technically a multiset, as Index objects may contain repeated values).
These views have some interesting consequences in the operations available on Index
objects. As a simple example, let’s construct an Index from a list of integers:

In [132]:
# One difference between Index objects and NumPy arrays is that indices are immutable
ind = pd.Index(data=[2, 3, 5, 7, 11])
ind


Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [133]:
# Index as ordered set: Index objects support set operations such as intersection, union, and symmetric difference


In [134]:
# Two overlapping integer indexes used to demonstrate the set operations
indA = pd.Index(data=[1, 3, 5, 7, 9])
indB = pd.Index(data=[2, 3, 5, 7, 11])


In [135]:
# Intersection: the labels present in both indexes. The explicit method is used
# because the `&` operator on Index objects is deprecated as a set operation and
# acts as an element-wise logical AND in pandas 2.0+.
indA.intersection(indB)

Int64Index([3, 5, 7], dtype='int64')

In [136]:
# Union: the labels present in either index. The explicit method is used because
# the `|` operator on Index objects is deprecated as a set operation and acts as
# an element-wise logical OR in pandas 2.0+.
indA.union(indB)

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [137]:
# Symmetric difference: the labels present in exactly one of the two indexes. The
# explicit method is used because the `^` operator on Index objects is deprecated
# as a set operation and acts as an element-wise logical XOR in pandas 2.0+.
indA.symmetric_difference(indB)

Int64Index([1, 2, 9, 11], dtype='int64')

## Data Indexing and Selection


In [138]:
# Land area per state, indexed by state name.
area = pd.Series(
    data={
        'California': 423967,
        'Texas': 695662,
        'New York': 141297,
        'Florida': 170312,
        'Illinois': 149995,
    }
)

# Population per state, indexed by the same state names.
pop = pd.Series(
    data={
        'California': 38332521,
        'Texas': 26448193,
        'New York': 19651127,
        'Florida': 19552860,
        'Illinois': 12882135,
    }
)

# Combine the two Series into one DataFrame with columns 'area' and 'pop'.
data = pd.DataFrame(data={'area': area, 'pop': pop})
data


Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [139]:
# Each column of the DataFrame is a Series, which can be accessed
# via dictionary-style indexing with the column name

data['area']


California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [140]:
# Attribute-style access gives the same column, but it does not work for column
# names containing spaces or for names that conflict with DataFrame attributes/methods
data.area


California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [141]:
# Check whether attribute-style and dictionary-style access return the same object
data.area is data['area']

True

In [142]:
# DataFrame has a pop() method, so data.pop refers to that method
# rather than to the 'pop' column
data.pop is data['pop']


False

In [143]:
# Add a new column with dictionary-style assignment: population divided by area

data['density'] = data['pop'].div(data['area'])
data



Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [144]:
# Another option: DataFrame.insert() places the new column at a chosen position
# (here position 2). insert() modifies `data` in place and raises ValueError when
# the column already exists, so the call is guarded to keep this cell safe to re-run.
if 'density_insert' not in data.columns:
    data.insert(2, 'density_insert', value=data['pop'] / data['area'])
data

Unnamed: 0,area,pop,density_insert,density
California,423967,38332521,90.413926,90.413926
Texas,695662,26448193,38.01874,38.01874
New York,141297,19651127,139.076746,139.076746
Florida,170312,19552860,114.806121,114.806121
Illinois,149995,12882135,85.883763,85.883763


### DataFrame as two-dimensional array


In [145]:
# The DataFrame's contents as a two-dimensional NumPy array (one row per state)
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01, 8.58837628e+01]])

In [146]:
# Transpose: swap rows and columns, so states become columns
data.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967.0,695662.0,141297.0,170312.0,149995.0
pop,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
density_insert,90.41393,38.01874,139.0767,114.8061,85.88376
density,90.41393,38.01874,139.0767,114.8061,85.88376


In [147]:
# A single index on the underlying array selects a row (here row 0)
data.values[0]

array([4.23967000e+05, 3.83325210e+07, 9.04139261e+01, 9.04139261e+01])

In [148]:
# Pick one element of the underlying array: row 0, column 1
data.values[0, 1]

38332521.0

In [149]:
# A single index on the DataFrame itself selects a column and returns a Series
data['area']


California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

### loc and iloc indexers (the ix indexer has been removed from pandas)

In [152]:
# iloc slices by integer position (row, col) with an exclusive end:
# the first three rows and the first two columns
data.iloc[:3, :2]

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [151]:
# loc slices by label (row, col); the end labels 'Illinois' and 'pop' are included
data.loc[:'Illinois', :'pop']


Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135
