# Chapter 5: Pandas

In [4]:
import pandas as pd
from pandas import Series, DataFrame

## Series

In [5]:
obj = pd.Series([4,7,-5,3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [6]:
obj.values

array([ 4,  7, -5,  3])

In [7]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [9]:
obj2 = pd.Series([4,7,-5,3], index = ['a', 'b', 'c', 'd'])
obj2.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [10]:
obj2['b']

7

In [11]:
obj2[['a','c']]

a    4
c   -5
dtype: int64

In [12]:
obj2[['a','c']].values

array([ 4, -5])

In [13]:
obj2[obj2>0]

a    4
b    7
d    3
dtype: int64

In [14]:
obj2>0

a     True
b     True
c    False
d     True
dtype: bool

In [15]:
'c' in obj2

True

In [16]:
'e' in obj2

False

In [18]:
#convert dictionary to seris
sdata = {'MA': 3500, 'OH': 200, 'WA': 50}
obj3 = pd.Series(sdata)
obj3

MA    3500
OH     200
WA      50
dtype: int64

In [19]:
state = ['OH', 'CA', 'MA']
obj4 = pd.Series(sdata, index = state)
obj4

OH     200.0
CA       NaN
MA    3500.0
dtype: float64

In [21]:
pd.isnull(obj4)

OH    False
CA     True
MA    False
dtype: bool

In [22]:
obj4.isnull()

OH    False
CA     True
MA    False
dtype: bool

In [23]:
obj3

MA    3500
OH     200
WA      50
dtype: int64

In [24]:
obj4

OH     200.0
CA       NaN
MA    3500.0
dtype: float64

In [25]:
obj3 + obj4

CA       NaN
MA    7000.0
OH     400.0
WA       NaN
dtype: float64

In [26]:
obj4.name = 'population'
obj4.index.name = 'state'
obj4

state
OH     200.0
CA       NaN
MA    3500.0
Name: population, dtype: float64

In [27]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [28]:
obj.index = ['bob', 'steve', 'jeff', 'ryan']
obj

bob      4
steve    7
jeff    -5
ryan     3
dtype: int64

## Dataframe

In [29]:
data = {'state': ['ohio', 'ohio', 'ohio', 'nevada', 'nevada', 'nevada'],
        'year': [2000, 2001, 2002, 2001,2002, 2003],
       'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

frame = pd.DataFrame(data)
frame

Unnamed: 0,pop,state,year
0,1.5,ohio,2000
1,1.7,ohio,2001
2,3.6,ohio,2002
3,2.4,nevada,2001
4,2.9,nevada,2002
5,3.2,nevada,2003


In [30]:
frame.head(2)

Unnamed: 0,pop,state,year
0,1.5,ohio,2000
1,1.7,ohio,2001


In [31]:
frame2 = pd.DataFrame(data, columns = ['year', 'state', 'pop'])
frame2

Unnamed: 0,year,state,pop
0,2000,ohio,1.5
1,2001,ohio,1.7
2,2002,ohio,3.6
3,2001,nevada,2.4
4,2002,nevada,2.9
5,2003,nevada,3.2


In [34]:
frame3 = pd.DataFrame(data, columns = ['year', 'state', 'pop', 'debt'], index = ['one', 'two', 'three', 'four', 'five', 'six'])
frame3

Unnamed: 0,year,state,pop,debt
one,2000,ohio,1.5,
two,2001,ohio,1.7,
three,2002,ohio,3.6,
four,2001,nevada,2.4,
five,2002,nevada,2.9,
six,2003,nevada,3.2,


In [36]:
frame3.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [38]:
list(frame3.columns)

['year', 'state', 'pop', 'debt']

In [40]:
frame3['state']

one        ohio
two        ohio
three      ohio
four     nevada
five     nevada
six      nevada
Name: state, dtype: object

In [42]:
frame3.loc['three']

year     2002
state    ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [43]:
frame3['debt'] = 16.5
frame3

Unnamed: 0,year,state,pop,debt
one,2000,ohio,1.5,16.5
two,2001,ohio,1.7,16.5
three,2002,ohio,3.6,16.5
four,2001,nevada,2.4,16.5
five,2002,nevada,2.9,16.5
six,2003,nevada,3.2,16.5


In [44]:
frame3['debt'] = list(range(6))
frame3

Unnamed: 0,year,state,pop,debt
one,2000,ohio,1.5,0
two,2001,ohio,1.7,1
three,2002,ohio,3.6,2
four,2001,nevada,2.4,3
five,2002,nevada,2.9,4
six,2003,nevada,3.2,5


In [45]:
val = pd.Series([-1.2, -1.5, -1.7], index= ['two', 'four', 'five'])
frame3['debt'] = val
frame3

Unnamed: 0,year,state,pop,debt
one,2000,ohio,1.5,
two,2001,ohio,1.7,-1.2
three,2002,ohio,3.6,
four,2001,nevada,2.4,-1.5
five,2002,nevada,2.9,-1.7
six,2003,nevada,3.2,


In [46]:
frame3['bool'] = frame3['state'] == 'Ohio'
frame3

Unnamed: 0,year,state,pop,debt,bool
one,2000,ohio,1.5,,False
two,2001,ohio,1.7,-1.2,False
three,2002,ohio,3.6,,False
four,2001,nevada,2.4,-1.5,False
five,2002,nevada,2.9,-1.7,False
six,2003,nevada,3.2,,False


In [47]:
del frame3['bool']
frame3

Unnamed: 0,year,state,pop,debt
one,2000,ohio,1.5,
two,2001,ohio,1.7,-1.2
three,2002,ohio,3.6,
four,2001,nevada,2.4,-1.5
five,2002,nevada,2.9,-1.7
six,2003,nevada,3.2,


In [48]:
#for nested dict, pandas will interpret the outer dict keys as the columns and the inner keys as the row indecs
pop = {'nevada': {2001: 2.4, 2002: 2.9},
      'ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame4 = pd.DataFrame(pop)
frame4

Unnamed: 0,nevada,ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [49]:
frame4.T

Unnamed: 0,2000,2001,2002
nevada,,2.4,2.9
ohio,1.5,1.7,3.6


In [50]:
frame4.ohio[:-1]

2000    1.5
2001    1.7
Name: ohio, dtype: float64

In [53]:
frame4.index.name = 'year'
frame4.columns.name = 'state'
frame4

state,nevada,ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [54]:
#The value attributes returns the data contained in the df as a two dimensional ndarray
frame4.values

array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])