## Pandas Library

DataFrames -> essentially multidimensonal arrays with attached row and column labels, and often with heterogeneous types and/or missing data.

In [1]:
# Importing libraries
import numpy as np
import pandas as pd

Pandas Series Object:

In [2]:
# Basically a one-dimensional array of indexed data
# Can be created from a list or array

data = pd.Series([0.25, 0.5, 0.75, 1.0])
print(data)

# Values
print(data.values)

# Indexes
print(data.index)

# Acesing data
data[1:3]

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64
[0.25 0.5  0.75 1.  ]
RangeIndex(start=0, stop=4, step=1)


1    0.50
2    0.75
dtype: float64

In [3]:
# The indexes in pandas are explicit and does not need to be an integer

data = pd.Series([0.25, 0.5, 0.75, 1.0],
                 index=['a', 'b', 'c', 'd'])
data['b']

0.5

In [4]:
# Series-as-dictionary analogy

population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}

population = pd.Series(population_dict)
population

# Typical dictionary-style item access cn be performed
population['California']

# Slicing operation
population['California':'New York']

California    38332521
Texas         26448193
New York      19651127
dtype: int64

Pandas DataFrame Object:

In [5]:
# DataFrame as a generalized NumPy array

area_dict = {'California': 423967, 'Texas': 695662,
             'New York': 141297, 'Florida': 170312,
             'Illinois': 149995}

area = pd.Series(area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [6]:
# A two dimensional object with population and area Series

states = pd.DataFrame({'population': population,
                       'area': area})
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [7]:
# Index and columns atributtes

print(states.index)

print(states.columns)


Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
Index(['population', 'area'], dtype='object')


In [8]:
# DataFrame as specialized dictionary

states['area']


California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

Pandas Index Object

In [9]:
ind = pd.Index([2, 3, 5, 7, 11])
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [10]:
# Some attributes of index
print(ind.size, ind.shape, ind.ndim, ind.dtype)

# OBS:
# Index cannot be modified via normal operations

# Index as ordered set
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])

# Index Intersection
print(indA.intersection(indB))

# Index union
print(indA.union(indB))

# Index symmetric difference
indA.symmetric_difference(indB)

5 (5,) 1 int64
Int64Index([3, 5, 7], dtype='int64')
Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')


Int64Index([1, 2, 9, 11], dtype='int64')

# Data Selection in Series

In [12]:
# Series as dictionary

data = pd.Series([0.25, 0.5, 0.75, 1.0],
                 index=['a', 'b', 'c', 'd'])

print(data['b'])

# Examining keys/indices and values
print('a' in data)

print(data.keys())

list(data.items())

# Extending a Series by assigning a new index value
data['e'] = 1.25

data

0.5
True
Index(['a', 'b', 'c', 'd'], dtype='object')


a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [21]:
# Series as one-dimesional array

# Slicing by explicit index
data['a':'c']

# Slicing by implicit integer index
data[0:2]

# Masking
data[(data > 0.3) & (data < 0.8)]

# Fancy indexing
data[['a', 'e']]

a    0.25
e    1.25
dtype: float64

# Data Selection in DataFrame

In [38]:
# DataFrame as dictionary
states

# Acessing columns via dictionary-style
states['area']

# Adding a new column
states['density'] = states['population'] / states['area']
states

Unnamed: 0,population,area,density
California,38332521,423967,90.413926
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


In [41]:
# DataFrame as two-dimensional array

# Examine the raw data
print(states.values)

# Transpose the full DataFrame
print(states.T)




[[3.83325210e+07 4.23967000e+05 9.04139261e+01]
 [2.64481930e+07 6.95662000e+05 3.80187404e+01]
 [1.96511270e+07 1.41297000e+05 1.39076746e+02]
 [1.95528600e+07 1.70312000e+05 1.14806121e+02]
 [1.28821350e+07 1.49995000e+05 8.58837628e+01]]


Unnamed: 0,California,Texas,New York,Florida,Illinois
population,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
area,423967.0,695662.0,141297.0,170312.0,149995.0
density,90.41393,38.01874,139.0767,114.8061,85.88376


## Combining Datasets: Concat and Append