In [2]:
# In this long introduction, we will import pandas (as pd) here and numpy (as np) a few cells below, and work
# extensively with different kinds of data structures (pandas Series and DataFrame).
import pandas as pd

In [3]:
# Three colour names labelled 0, 2 and 4 (labels 1, 3 and 5 are left out).
obj3 = pd.Series({0: "blue", 2: "purple", 4: "yellow"})

In [4]:
obj3

0      blue
2    purple
4    yellow
dtype: object

In [5]:
# Re-label onto 0..5; the new labels 1, 3 and 5 each take the value of the label just before them (forward fill).
obj3.reindex(index=range(6), method="ffill")

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [6]:
import numpy as np

In [7]:
# 3x3 table holding the integers 0..8; the row labels are a, c and d (there is no "b").
frame = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=list("acd"),
    columns=["Ohio", "Texas", "California"],
)

In [8]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [9]:
# Re-label the rows as a..d; "b" is not a row of `frame`, so it comes back as a row of missing values.
frame2 = frame.reindex(index=list("abcd"))

In [10]:
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [11]:
# Column labels to reindex against; none of them is a column of `frame` (Ohio, Texas, California).
states = [
    "New York",
    "New Hampshire",
    "New Jersey",
]

In [12]:
# Swap the column labels for `states`; no existing column matches, so every cell is missing.
frame2.reindex(states, axis="columns")

Unnamed: 0,New York,New Hampshire,New Jersey
a,,,
b,,,
c,,,
d,,,


In [13]:
# Same column reindex on the original three-row frame: rows a, c, d remain, all values missing.
frame.reindex(states, axis="columns")

Unnamed: 0,New York,New Hampshire,New Jersey
a,,,
c,,,
d,,,


In [14]:
# Floats 0.0..4.0 labelled a..e.
obj = pd.Series(np.arange(5, dtype=float), index=list("abcde"))

In [15]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [16]:
# Copy of `obj` without the entry labelled "c"; `obj` itself is left untouched.
new_obj = obj.drop(index="c")

In [17]:
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [18]:
# Several labels can be removed at once by passing a list.
obj.drop(index=["d", "b"])

a    0.0
c    2.0
e    4.0
dtype: float64

In [19]:
# 4x4 table of the integers 0..15: one row per state, columns labelled One..Four.
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=["Ohio", "Colorado", "Utah", "New York"],
    columns=["One", "Two", "Three", "Four"],
)

In [20]:
data

Unnamed: 0,One,Two,Three,Four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [21]:
# Remove a single column by label (the `columns=` keyword replaces the positional label plus axis=1).
data.drop(columns="Two")

Unnamed: 0,One,Three,Four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [22]:
# Remove two columns in one call.
data.drop(columns=["One", "Three"])

Unnamed: 0,Two,Four
Ohio,1,3
Colorado,5,7
Utah,9,11
New York,13,15


In [23]:
# inplace=True makes drop modify `obj` itself rather than hand back a new Series,
# which is why this cell displays nothing and why "c" is missing from `obj` afterwards.
obj.drop("c",inplace=True)

In [24]:
obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [25]:
# inplace=True modifies obj itself instead of returning a new object, so the dropped entry ("c") is gone from obj for good.

In [26]:
# let us create a dataset so that we can subset/loc/iloc data

In [27]:
# 4x4 table of the integers 0..15 used for the selection examples: state rows, lower-case column labels.
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=["Ohio", "New York", "New Jersey", "Massachusetts"],
    columns=["one", "two", "three", "four"],
)

In [28]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
New York,4,5,6,7
New Jersey,8,9,10,11
Massachusetts,12,13,14,15


In [29]:
# One column by label, returned as a Series (same result as data["two"]).
data.loc[:, "two"]

Ohio              1
New York          5
New Jersey        9
Massachusetts    13
Name: two, dtype: int64

In [30]:
# First two rows by position.
data.iloc[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
New York,4,5,6,7


In [31]:
# Two columns by label, returned in the order requested (three, then two).
data.loc[:, ["three", "two"]]

Unnamed: 0,three,two
Ohio,2,1
New York,6,5
New Jersey,10,9
Massachusetts,14,13


In [32]:
# Keep only the rows whose "three" value is greater than 5.
data.loc[data["three"] > 5]

Unnamed: 0,one,two,three,four
New York,4,5,6,7
New Jersey,8,9,10,11
Massachusetts,12,13,14,15


In [33]:
# Say we have data on the year-on-year growth rate of spending in four states. We manually create the dataset as follows:

In [43]:
# Draw the synthetic growth figures from a seeded generator so the table, and every
# result derived from it below, is identical on each Restart & Run All.
# The numbers in outputs saved from an earlier, unseeded run will be replaced on re-execution.
rng = np.random.default_rng(42)
states_growth = pd.DataFrame(
    rng.standard_normal((4, 3)),  # 4 states x 3 years of standard-normal draws
    index=["Utah", "Oregon", "Wisconsin", "Ohio"],
    columns=["2014", "2015", "2016"],
)

In [44]:
states_growth

Unnamed: 0,2014,2015,2016
Utah,0.005554,0.453342,0.126097
Oregon,-0.303922,-2.199613,0.491619
Wisconsin,1.219826,0.11028,0.781662
Ohio,-1.185433,-0.972075,0.507004


In [48]:
# Keep only the size of each growth figure and discard its sign.
states_growth.abs()

Unnamed: 0,2014,2015,2016
Utah,0.005554,0.453342,0.126097
Oregon,0.303922,2.199613,0.491619
Wisconsin,1.219826,0.11028,0.781662
Ohio,1.185433,0.972075,0.507004


In [37]:
# Sometimes we want to apply a function to each one-dimensional slice (each column or each row) of a DataFrame. This is what the apply method is for:

In [49]:
def f(x):
    """Return the spread of ``x``: its largest value minus its smallest.

    ``x`` is any object exposing ``.max()`` and ``.min()``, e.g. a pandas Series.
    This is the named-function form of ``lambda x: x.max() - x.min()``.
    """
    return x.max() - x.min()

In [51]:
# Call f once per column, giving one number for each year.
states_growth.apply(f, axis="index")

2014    2.405259
2015    2.652955
2016    0.655565
dtype: float64

In [52]:
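# Here apply(f) calls f once per column, so each result is that year's range: the gap between the highest and the lowest growth rate across the four states (e.g. 2014: 1.219826 - (-1.185433) = 2.405259). It measures how far apart the states were in a given year, not the average magnitude of growth/decline. Whether that is useful depends on the question, but at least the statistic has a clear interpretation.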

In [54]:
states_growth.iloc[[1,2]]  # select rows by integer position; counting starts at 0, so positions 1 and 2 are the 2nd and 3rd rows (Oregon and Wisconsin)

Unnamed: 0,2014,2015,2016
Oregon,-0.303922,-2.199613,0.491619
Wisconsin,1.219826,0.11028,0.781662


In [57]:
# Rows at positions 0 and 1 (Utah, Oregon); columns at positions 2, 0 and 1,
# returned in exactly that order (2016, 2014, 2015).
states_growth.iloc[[0,1],[2,0,1]]

Unnamed: 0,2016,2014,2015
Utah,0.126097,0.005554,0.453342
Oregon,0.491619,-0.303922,-2.199613


In [77]:
# Label-based slice: every row from the first one through "Oregon" (the end label is included),
# restricted to the single column "2015", so the result is a Series.
states_growth.loc[:"Oregon","2015"]

Utah      0.453342
Oregon   -2.199613
Name: 2015, dtype: float64

In [87]:
# Rows whose 2016 value exceeds 0.5, restricted to the first two columns (2014 and 2015).
# The row mask and the column selection go through a single .loc call instead of chaining
# .iloc[...] with a second [...] lookup, so the selection is one explicit indexing step.
states_growth.loc[states_growth["2016"] > 0.5, states_growth.columns[:2]]

Unnamed: 0,2014,2015
Wisconsin,1.219826,0.11028
Ohio,-1.185433,-0.972075


In [73]:
# The column labels as a plain Python list.
list(states_growth.columns)

['2014', '2015', '2016']

In [88]:
# All rows (":") of the single column labelled "2016"; selecting one column label gives back a Series.
states_growth.loc[:,"2016"]

Utah         0.126097
Oregon       0.491619
Wisconsin    0.781662
Ohio         0.507004
Name: 2016, dtype: float64