In [1]:
# In this long introduction, we willimport numpy (as np) and pandas (as pd), and work extensively with different
# kinds of data array (structures).
import pandas as pd

In [2]:
obj3 = pd.Series(["blue","purple","yellow"], index = [0,2,4])

In [3]:
obj3

0      blue
2    purple
4    yellow
dtype: object

In [4]:
obj3.reindex(range(6), method = "ffill")

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [5]:
import numpy as np

In [6]:
frame = pd.DataFrame(np.arange(9).reshape((3,3)),
                    index = ["a","c","d"],
                    columns = ["Ohio", "Texas", "California"] )

In [7]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [8]:
frame2 = frame.reindex(["a","b","c","d"])

In [9]:
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [10]:
states = ["New York","New Hampshire","New Jersey"]

In [11]:
frame2.reindex(columns=states)

Unnamed: 0,New York,New Hampshire,New Jersey
a,,,
b,,,
c,,,
d,,,


In [12]:
frame.reindex(columns=states)

Unnamed: 0,New York,New Hampshire,New Jersey
a,,,
c,,,
d,,,


In [13]:
obj = pd.Series(np.arange(5.), index=["a","b","c","d","e"])

In [14]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [15]:
new_obj = obj.drop("c")

In [16]:
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [17]:
obj.drop(["d","b"])

a    0.0
c    2.0
e    4.0
dtype: float64

In [18]:
data = pd.DataFrame(np.arange(16).reshape((4,4)),
                   index = ["Ohio","Colorado","Utah","New York"],
                   columns = ["One","Two","Three","Four"])

In [19]:
data

Unnamed: 0,One,Two,Three,Four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [20]:
data.drop("Two",axis=1)

Unnamed: 0,One,Three,Four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [21]:
data.drop(["One","Three"], axis="columns")

Unnamed: 0,Two,Four
Ohio,1,3
Colorado,5,7
Utah,9,11
New York,13,15


In [22]:
obj.drop("c",inplace=True)

In [23]:
obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [24]:
# inplace destroys any data that is dropped by this operation.

In [25]:
# let us create a dataset so that we can subset/loc/iloc data

In [26]:
data = pd.DataFrame(np.arange(16).reshape((4,4)),
                   index = ["Ohio", "New York", "New Jersey", "Massachusetts"],
                   columns = ["one","two","three","four"])

In [27]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
New York,4,5,6,7
New Jersey,8,9,10,11
Massachusetts,12,13,14,15


In [28]:
data["two"]

Ohio              1
New York          5
New Jersey        9
Massachusetts    13
Name: two, dtype: int64

In [29]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
New York,4,5,6,7


In [30]:
data[["three","two"]]

Unnamed: 0,three,two
Ohio,2,1
New York,6,5
New Jersey,10,9
Massachusetts,14,13


In [31]:
data[data["three"] > 5]

Unnamed: 0,one,two,three,four
New York,4,5,6,7
New Jersey,8,9,10,11
Massachusetts,12,13,14,15


In [32]:
# Say we have data on the year-on-year growth rate of spending in four states. We manually create the dataset as follows:

In [33]:
states_growth = pd.DataFrame(np.random.randn(4,3), columns=["2014","2015","2016"],
                            index=["Utah","Oregon","Wisconsin","Ohio"])

In [34]:
states_growth

Unnamed: 0,2014,2015,2016
Utah,-1.546371,0.519999,-1.68205
Oregon,1.336696,0.974357,0.651308
Wisconsin,-0.211011,-0.79654,-0.756254
Ohio,1.245458,-0.323144,0.498492


In [35]:
np.abs(states_growth)  # Say we are only interested in the magnitude and NOT the sign (for some reason)

Unnamed: 0,2014,2015,2016
Utah,1.546371,0.519999,1.68205
Oregon,1.336696,0.974357,0.651308
Wisconsin,0.211011,0.79654,0.756254
Ohio,1.245458,0.323144,0.498492


In [36]:
# Sometimes it is desired to apply operations to a one-dimensional array. In such cases, apply function kicks in:

In [37]:
f = lambda x: x.max() - x.min()  # remember lambda x: takes x as the argument

In [38]:
states_growth.apply(f)

2014    2.883067
2015    1.770897
2016    2.333358
dtype: float64

In [39]:
# It is then reasonable to understand the apply(f) in this case as an average measure of the magnitude of growth/decline. Is it useful? Who knows. But at least we have a reasonable interpretation of the statistic.

In [40]:
states_growth.iloc[[1,2]]  # selecting by rows: 1st (Oregon; remember Python starts counting from 0) and 2nd (Wisconsin)

Unnamed: 0,2014,2015,2016
Oregon,1.336696,0.974357,0.651308
Wisconsin,-0.211011,-0.79654,-0.756254


In [41]:
states_growth.iloc[[0,1],[2,0,1]]

Unnamed: 0,2016,2014,2015
Utah,-1.68205,-1.546371,0.519999
Oregon,0.651308,1.336696,0.974357


In [42]:
states_growth.loc[:"Oregon","2015"]

Utah      0.519999
Oregon    0.974357
Name: 2015, dtype: float64

In [43]:
states_growth.iloc[:,:2][states_growth["2016"] > 0.5]  # select the first two columns first; then, look for the states which achieved more than 0.5% growth in 2016.

Unnamed: 0,2014,2015
Oregon,1.336696,0.974357


In [44]:
states_growth.columns.tolist() # this is doable, though the motivation for doing so is unclear.

['2014', '2015', '2016']

In [45]:
states_growth.loc[:,"2016"]

Utah        -1.682050
Oregon       0.651308
Wisconsin   -0.756254
Ohio         0.498492
Name: 2016, dtype: float64

In [46]:
# We can also write functions and apply it termwise to the dataframe.

In [47]:
def f(x):
    return pd.Series([x.min(),x.max()],index=["min","max"])

In [48]:
states_growth.apply(f)

Unnamed: 0,2014,2015,2016
min,-1.546371,-0.79654,-1.68205
max,1.336696,0.974357,0.651308


In [49]:
# A similar applymap() function exists as well: this allows us to keep formatted string from each floating-point value in frame.  

In [50]:
format = lambda x: "%.2f" % x

In [51]:
states_growth.applymap(format) # Doing this turns EVERY entry into a string -- at which point 

Unnamed: 0,2014,2015,2016
Utah,-1.55,0.52,-1.68
Oregon,1.34,0.97,0.65
Wisconsin,-0.21,-0.8,-0.76
Ohio,1.25,-0.32,0.5


In [52]:
# If I'm only interested in 2015, I can do the following:

In [53]:
states_growth["2015"].map(format)

Utah          0.52
Oregon        0.97
Wisconsin    -0.80
Ohio         -0.32
Name: 2015, dtype: object

In [54]:
# And notice that applymap() automatically takes ["2014","2015","2016"] as the input -- then apply the map function!

In [55]:
# Remember that we can use the sorted() function in Python lists. In pandas there is another sort: by index.

In [56]:
states_growth.sort_index()

Unnamed: 0,2014,2015,2016
Ohio,1.245458,-0.323144,0.498492
Oregon,1.336696,0.974357,0.651308
Utah,-1.546371,0.519999,-1.68205
Wisconsin,-0.211011,-0.79654,-0.756254


In [57]:
# which organises by states in alphabetical (lexicographic) order.

In [58]:
# The axis = argument allows us to set sort by ROW or COLUMN.

In [59]:
states_growth = np.abs(states_growth)

In [60]:
states_growth = states_growth.applymap(format)

In [61]:
states_growth.sort_index(axis=1, ascending = True)

Unnamed: 0,2014,2015,2016
Utah,1.55,0.52,1.68
Oregon,1.34,0.97,0.65
Wisconsin,0.21,0.8,0.76
Ohio,1.25,0.32,0.5


In [62]:
states_growth.sort_values(by = "2015")

Unnamed: 0,2014,2015,2016
Ohio,1.25,0.32,0.5
Utah,1.55,0.52,1.68
Wisconsin,0.21,0.8,0.76
Oregon,1.34,0.97,0.65


In [63]:
# We can also assign rank to data entries.

In [64]:
sample = pd.Series([2,1,2,4,5,-5,-2,-7])

In [65]:
sample.rank()

0    5.5
1    4.0
2    5.5
3    7.0
4    8.0
5    2.0
6    3.0
7    1.0
dtype: float64

In [66]:
sample_rank = sample.rank(method="first")
# sample_rank.reindex(["place","rank"])

In [67]:
# as a tie-breaker for .5 values in the previous output!
type(sample)

pandas.core.series.Series

In [68]:
sample = pd.DataFrame({
    "number": sample,
    "rank": sample_rank},
    )
sample

Unnamed: 0,number,rank
0,2,5.0
1,1,4.0
2,2,6.0
3,4,7.0
4,5,8.0
5,-5,2.0
6,-2,3.0
7,-7,1.0


In [69]:
sample["rank"]

0    5.0
1    4.0
2    6.0
3    7.0
4    8.0
5    2.0
6    3.0
7    1.0
Name: rank, dtype: float64

In [70]:
sample.sort_values(by="rank")

Unnamed: 0,number,rank
7,-7,1.0
5,-5,2.0
6,-2,3.0
1,1,4.0
0,2,5.0
2,2,6.0
3,4,7.0
4,5,8.0


In [71]:
# sometimes index can be nonunique. the situation this presents is the following:

In [72]:
obj = pd.Series(range(6),index=["a","a","b","b","b","c"])

In [73]:
obj

a    0
a    1
b    2
b    3
b    4
c    5
dtype: int64

In [74]:
obj.index.is_unique

False

In [75]:
df = pd.DataFrame(np.random.randn(5,3), index=["a","a","a","b","b"])
df

Unnamed: 0,0,1,2
a,-0.684031,-0.37091,-0.128515
a,0.108311,-0.263303,-0.154045
a,-1.452716,1.032209,0.716566
b,-0.965295,-0.977148,0.416253
b,-0.474641,-0.133172,-0.796661


In [76]:
df.loc["a"]

Unnamed: 0,0,1,2
a,-0.684031,-0.37091,-0.128515
a,0.108311,-0.263303,-0.154045
a,-1.452716,1.032209,0.716566


In [77]:
# descriptive stats:

In [78]:
sample["number"].describe()

count    8.00000
mean     0.00000
std      4.27618
min     -7.00000
25%     -2.75000
50%      1.50000
75%      2.50000
max      5.00000
Name: number, dtype: float64

In [79]:
sample["number"].idxmax()

4

In [80]:
sample["number"].cumsum()

0     2
1     3
2     5
3     9
4    14
5     9
6     7
7     0
Name: number, dtype: int64

In [81]:
sample["number"].cumprod()

0       2
1       2
2       4
3      16
4      80
5    -400
6     800
7   -5600
Name: number, dtype: int64

In [82]:
sample["number"].skew()

-0.6577132125188568

In [83]:
sample["number"].kurt()

-0.8435546875000002

In [84]:
sample["number"].pct_change()

0     NaN
1   -0.50
2    1.00
3    1.00
4    0.25
5   -2.00
6   -0.60
7    2.50
Name: number, dtype: float64

In [85]:
# Membership is a boolean type object!

In [86]:
trial = pd.Series(["a","a","a","a","b","c","d","d"])

In [87]:
mask = trial.isin(["b","d"])

In [88]:
mask

0    False
1    False
2    False
3    False
4     True
5    False
6     True
7     True
dtype: bool

In [89]:
# the matching game

In [90]:
to_match= pd.Series(["b","c","b","a","b","c"])

In [91]:
unique_vals = pd.Series(["b","c"])

In [92]:
pd.Index(unique_vals).get_indexer(to_match)

array([ 0,  1,  0, -1,  0,  1])