In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Pandas Series
- Series is a one-dimensional labeled array capable of holding any data type
- The axis labels are collectively referred to as the index.


In [3]:
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
print(s)

print(s.index)

a   -0.546608
b    0.286914
c   -0.356206
d    0.224963
e   -1.226267
dtype: float64
Index([u'a', u'b', u'c', u'd', u'e'], dtype='object')


In [4]:
d = {'a' : 0., 'b' : 1., 'c' : 2.}
pd.Series(d)

a    0.0
b    1.0
c    2.0
dtype: float64

In [5]:
pd.Series(5., index=['a', 'b', 'c', 'd', 'e'])

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

In [6]:
# access element
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
print(s)

print(s['a'])

a   -1.156020
b   -0.837068
c   -0.526796
d   -0.550375
e   -1.064490
dtype: float64
-1.15601961514


In [9]:
# we can perform nupmy operations on series
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
print(s)

print(np.sum(s))

a   -0.725443
b    0.140700
c   -1.168238
d    0.385949
e    1.073938
dtype: float64
-0.293093370756


## DataFrame
- DataFrame is a 2-dimensional labeled data structure with columns of potentially different types.
- You can think of it like a spreadsheet or SQL table, or a dict of Series objects.
- we can create pandas from 
    - From dict of Series
    - From dict of dict
    - From dict of ndarrays / lists
    - From structured or record array
    - From a list of dicts
    - From a dict of tuples
    - 

In [16]:
# create dict of series

d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
     'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}

print(d)

# convert dict to df
df = pd.DataFrame(d)
print(df)


# choose specific index
df = pd.DataFrame(d, index=['d', 'b', 'a'])
print(df)

# choose specific columns
df = pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])
print(df)

{'two': a    1.0
b    2.0
c    3.0
d    4.0
dtype: float64, 'one': a    1.0
b    2.0
c    3.0
dtype: float64}
   one  two
a  1.0  1.0
b  2.0  2.0
c  3.0  3.0
d  NaN  4.0
   one  two
d  NaN  4.0
b  2.0  2.0
a  1.0  1.0
   two three
d  4.0   NaN
b  2.0   NaN
a  1.0   NaN


In [17]:
d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
     'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}

df = pd.DataFrame(d)

print(df.index)
print(df.columns)

Index([u'a', u'b', u'c', u'd'], dtype='object')
Index([u'one', u'two'], dtype='object')


## Column selection, addition, deletion

In [23]:
d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
     'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}

df = pd.DataFrame(d)

# column selection
print(df['two'])
print(df['two']['b'])

# column addition
df['three'] = df['one'] * df['two']
print(df)

# deleting column
del df['two']
print(df)

a    1.0
b    2.0
c    3.0
d    4.0
Name: two, dtype: float64
2.0
   one  two  three
a  1.0  1.0    1.0
b  2.0  2.0    4.0
c  3.0  3.0    9.0
d  NaN  4.0    NaN
   one  three
a  1.0    1.0
b  2.0    4.0
c  3.0    9.0
d  NaN    NaN


## pandas read csv

In [30]:
iris = pd.read_csv('IRIS.csv')
iris.head()

Unnamed: 0,5.1,0.222222222,3.5,0.625,1.4,0.06779661,0.2,0.041666667,setosa
0,4.9,0.166667,3.0,0.416667,1.4,0.067797,0.2,0.041667,setosa
1,4.7,0.111111,3.2,0.5,1.3,0.050847,0.2,0.041667,setosa
2,4.6,0.083333,3.1,0.458333,1.5,0.084746,0.2,0.041667,setosa
3,5.0,0.194444,3.6,0.666667,1.4,0.067797,0.2,0.041667,setosa
4,5.4,0.305556,3.9,0.791667,1.7,0.118644,0.4,0.125,setosa


### Assigning New Columns in Method Chains

In [25]:
iris = pd.read_csv('iris.data')

iris.assign(sepal_ratio = iris['SepalWidth'] / iris['SepalLength'])

iris.head()

KeyError: 'SepalWidth'