In [18]:
import pandas as pd
import numpy as np

In [14]:
# A DataFrame is an analog of a two-dimensional NumPy array with both flexible
# row indices and flexible column names. It can also be thought of as a
# dictionary of Series structures that all share the same index.


In [16]:
# A list of dictionaries converts directly into a DataFrame: each dict is one row.
# Keys absent from a given dict show up as NaN in that row.

records = [
    {'a': 1, 'b': 2},
    {'b': 3, 'c': 4},
]
data = pd.DataFrame(records)
data

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [19]:
# Build a DataFrame from a two-dimensional NumPy array,
# supplying the row labels and column names explicitly

values = np.random.rand(3, 2)
data = pd.DataFrame(values, index=['a', 'b', 'c'], columns=['foo', 'bar'])

data

Unnamed: 0,foo,bar
a,0.235275,0.812488
b,0.030859,0.578347
c,0.270735,0.334705


In [20]:
# Index objects follow many conventions of Python's built-in set data structure,
# so unions, intersections and differences can be computed in a familiar way

indA, indB = pd.Index([1, 3, 5, 7, 9]), pd.Index([2, 3, 5, 7, 11])

In [21]:
# Intersection: the labels present in both indA and indB.
# The explicit method is used because `&` on an Index is deprecated as a set
# operation and is applied element-wise in pandas >= 2.0.
indA.intersection(indB)

Int64Index([3, 5, 7], dtype='int64')

In [23]:
# Union: the labels present in either indA or indB.
# The explicit method is used because `|` on an Index is deprecated as a set
# operation and is applied element-wise in pandas >= 2.0.
indA.union(indB)

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [24]:
# Symmetric difference: the labels that appear in exactly one of the two indexes
# (a one-sided set difference would be indA.difference(indB)).
# The explicit method is used because `^` on an Index is deprecated as a set
# operation and is applied element-wise in pandas >= 2.0.
indA.symmetric_difference(indB)

Int64Index([1, 2, 9, 11], dtype='int64')

In [30]:
# The DataFrame's underlying data as a two-dimensional NumPy array
data.values

array([[0.23527547, 0.81248754],
       [0.03085866, 0.57834742],
       [0.27073483, 0.33470525]])

In [74]:
# Column labels of the DataFrame, held in an Index object
data.columns

Index(['foo', 'bar'], dtype='object')

In [34]:
# Materialize the (column label, Series) pairs yielded by DataFrame.items()
[(label, column) for label, column in data.items()]

[('foo',
  a    0.235275
  b    0.030859
  c    0.270735
  Name: foo, dtype: float64),
 ('bar',
  a    0.812488
  b    0.578347
  c    0.334705
  Name: bar, dtype: float64)]

In [40]:
# Slicing
# Notice that when you slice with an explicit index (e.g., data['a':'c']),
# the final index is included in the slice, whereas when you slice with an implicit
# index (e.g., data[0:2]), the final index is excluded from the slice.
# A single .loc call selects the row labels and the column together, which avoids
# the chained lookup data['bar']['a':'b'].
data.loc['a':'b', 'bar']

a    0.812488
b    0.578347
Name: bar, dtype: float64

In [45]:
# Transpose: swap the rows and the columns
data.transpose()

Unnamed: 0,a,b,c
foo,0.235275,0.030859,0.270735
bar,0.812488,0.578347,0.334705


In [50]:
# Array-style indexing with iloc: rows and columns are selected by integer position
data.iloc[:2, :1]

Unnamed: 0,foo
a,0.235275
b,0.030859


In [51]:
# Array-style indexing with loc to access rows and columns by their labels
data.loc[:'b', :'foo']

Unnamed: 0,foo
a,0.235275
b,0.030859


In [53]:
# Two Series that share the same labels
area = pd.Series({
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
})
pop = pd.Series({
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
})

# Combine both Series into one frame and derive density = pop / area
data = pd.DataFrame({'area': area, 'pop': pop}).assign(
    density=lambda frame: frame['pop'] / frame['area']
)
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [54]:
# .loc takes a row selector (here a boolean mask) followed by the column names
data.loc[data['density'] > 100, ['pop', 'density']]

Unnamed: 0,pop,density
New York,19651127,139.076746
Florida,19552860,114.806121


In [59]:
# 'pop' values of the rows whose density exceeds 100, returned as a Series.
# One .loc call replaces the chained lookup data[mask]['pop'].
data.loc[data['density'] > 100, 'pop']

New York    19651127
Florida     19552860
Name: pop, dtype: int64

In [71]:
# Show the first n rows (here n=1)
data.head(n=1)

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
