In [1]:
import string
from typing import List
import logging
import numpy as np
import pandas as pd

In [2]:
#This is how we interact with the data in a pandas object: a DataFrame or a Series. Let's look at some of the
#more commonly-used pandas methods.
#1) Reindexing: create a new object, realigning the values to match the new index
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d','b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [3]:
#Let's rearrange the data according to a new index in alphabetical order.
#Labels that are missing from the original index (here 'e') are added with NaN values.

obj2 = obj.reindex(['a','b', 'c', 'd', 'e'])
print(obj2)
#The f-string interpolates np.isnan(...), so it prints whether obj2["e"] is NaN (True), not the value itself
print(f'The value of obj2["e"] is {np.isnan(obj2["e"])}')

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64
The value of obj2["e"] is True


In [4]:
#For ordered data such as time series we might want to fill in the values that reindexing leaves empty.
#Build a Series with gaps in its integer index (0, 2, 4) so the following cells can fill them.
obj3 = pd.Series(['blue', 'yellow', 'purple'], index=[0, 2, 4])
obj3

0      blue
2    yellow
4    purple
dtype: object

In [5]:
#Reindex obj3 onto np.arange(6); the new labels 1, 3 and 5 have no data of their own.
#method='ffill' forward-fills each of them with the last preceding value.
obj3.reindex(np.arange(6), method='ffill')

0      blue
1      blue
2    yellow
3    yellow
4    purple
5    purple
dtype: object

In [6]:
#Same reindex, but method='bfill' back-fills each new label with the next following value.
#Label 5 has no later value to copy from, so it stays NaN.
obj3.reindex(np.arange(6), method='bfill')

0      blue
1    yellow
2    yellow
3    purple
4    purple
5       NaN
dtype: object

In [7]:
#Building blocks for the DataFrame.reindex examples: start with a flat NumPy array of 0..8.
#Note that `frame` is still a plain ndarray here, not a DataFrame.
frame = np.arange(9)
frame

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [8]:
#Reshape the flat array into 3x3 by passing the dimensions as separate arguments.
frame = np.arange(9).reshape(3, 3) #dimensions passed as two positional ints, no tuple
frame

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [9]:
#Reshape to 3x3 again, this time passing the shape as a single tuple.
frame = np.arange(9).reshape((3, 3)) #shape passed as one tuple; same result as reshape(3, 3)
frame

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [10]:
#Unpacking the tuple with * turns it back into separate positional arguments.
frame = np.arange(9).reshape(*(3, 3)) #same result as the two previous cells
frame

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [11]:
#Wrap the 3x3 array in a DataFrame; with no labels given, rows and columns get default integer labels.
frame = pd.DataFrame(np.arange(9).reshape(3, 3))
frame

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8


In [12]:
#Same data, now with explicit row labels passed through `index`.
frame = pd.DataFrame(np.arange(9).reshape(3, 3),
                    index=['a', 'c', 'd']) #row labels (note there is no 'b')
frame

Unnamed: 0,0,1,2
a,0,1,2
c,3,4,5
d,6,7,8


In [13]:
#Same data, now with explicit column labels passed through `columns`.
frame = pd.DataFrame(np.arange(9).reshape(3, 3),
                    columns=['Ohio', 'Texas', 'California']) #column labels
frame

Unnamed: 0,Ohio,Texas,California
0,0,1,2
1,3,4,5
2,6,7,8


In [14]:
#Final version of `frame` for the reindex examples: both axes are labelled.
#Rows are 'a', 'c', 'd' (no 'b'); columns are three state names.
frame = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=list('acd'),
    columns=['Ohio', 'Texas', 'California'],
)
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [15]:
#Now let's reindex the rows of frame: 'b' is not an existing label, so a new row is inserted for it
frame2 = frame.reindex(index=['a', 'b', 'c', 'd'])
frame2
#Notice the added row of NaNs in the DataFrame; the other values are now displayed as floats (0.0, 1.0, ...)

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [16]:
#Build the same row labels programmatically: the first four lowercase ASCII letters.
#list() on a str already yields its characters, so no comprehension is needed.
ascii_index0: List[str] = list(string.ascii_lowercase[:4])
frame2 = frame.reindex(index=ascii_index0)
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [17]:
#We can reindex columns using the columns keyword
states : List[str] = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)
#Ohio is dropped because it is not in the list of states; Utah is new, so it is added as a column of NaNs

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [18]:
#We can also reindex by passing the labels as a positional argument together with an explicit axis:
#axis=0 or axis='rows' for rows and axis=1 or axis='columns' for columns
frame.reindex(states, axis='columns')

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [19]:
frame.reindex(states, axis=1) #axis=1 is the integer spelling of axis='columns'; same result as above

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [20]:
#We can also select and reorder existing rows with the df.loc[] indexer.
#Unlike reindex, the .loc operator cannot create new rows or columns.
frame.loc[['a', 'd', 'c']]

Unnamed: 0,Ohio,Texas,California
a,0,1,2
d,6,7,8
c,3,4,5


In [21]:
#.loc takes a second selector for the columns, so rows and columns can be reordered in one call.
frame.loc[['a', 'd', 'c'], ['California', 'Texas']]

Unnamed: 0,California,Texas
a,2,1
d,8,7
c,5,4


In [22]:
#A single row label returns that row as a Series
frame.loc['a']

Ohio          0
Texas         1
California    2
Name: a, dtype: int64

In [23]:
#A list of row labels returns a DataFrame
frame.loc[['a', 'd']]

Unnamed: 0,Ohio,Texas,California
a,0,1,2
d,6,7,8


In [24]:
#Lists for both rows and columns return a DataFrame restricted to those labels
frame.loc[['a', 'd'], ['Ohio', 'California']]

Unnamed: 0,Ohio,California
a,0,2
d,6,8


In [25]:
#Dropping entries from an axis
#We can use drop() to remove entries from an axis without needing to use reindex or .loc
#Start from a Series with the default integer index
obj = pd.Series(np.arange(5.))
obj

0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

In [26]:
#Same five float values, now labelled 'a' through 'e' so entries can be dropped by label
obj = pd.Series(
    np.arange(5.),
    index=list('abcde'),
)
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [27]:
#Drop a single label; the result is stored under a new name
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [28]:
#Pass a list to drop several labels at once
obj.drop(['d', 'c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [29]:
#Building up a DataFrame for the indexing examples: start from a flat array
np.arange(16) #1-D numpy array with 16 elements, shape (16,)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])

In [30]:
#Reshape the flat array into two dimensions
np.arange(16).reshape((4, 4)) #two dimensional numpy array 4x4

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [31]:
#Wrap the 4x4 array in a DataFrame
pd.DataFrame(np.arange(16).reshape((4, 4))) #Pandas DataFrame with default integer labels on both axes

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [32]:
#Add column labels
pd.DataFrame(np.arange(16).reshape((4, 4)),
            columns=['one', 'two', 'three', 'four']) #column labels

Unnamed: 0,one,two,three,four
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [33]:
#Add row labels instead
pd.DataFrame(np.arange(16).reshape((4, 4)),
            index=['Ohio', 'Colorado', 'Utah', 'New York']) #row labels (the index), not columns

Unnamed: 0,0,1,2,3
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [34]:
#Label both axes at once
pd.DataFrame(np.arange(16).reshape((4, 4)),
             columns=['one', 'two', 'three', 'four'],
            index=['Ohio', 'Colorado', 'Utah', 'New York']) #columns and row labels together

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [35]:
#Keep the fully labelled 4x4 frame as `data`; the indexing examples below all use it
data = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],  #row labels
    columns=['one', 'two', 'three', 'four'],         #column labels
)

In [36]:
#Display the frame we just built
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [37]:
#Indexing with a single column label returns that column as a Series
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [38]:
#A list of column labels returns a DataFrame with the columns in the requested order
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [39]:
#Another column subset, again in the order given
data[['two', 'one']]

Unnamed: 0,two,one
Ohio,1,0
Colorado,5,4
Utah,9,8
New York,13,12


In [40]:
#Any combination of existing column labels works
data[['three', 'two']]

Unnamed: 0,three,two
Ohio,2,1
Colorado,6,5
Utah,10,9
New York,14,13


In [41]:
#A slice inside [] selects rows by position: here the first two rows
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [42]:
data.loc[['Colorado'], ['four']] #.loc accepts a (rows, columns) pair of label lists

Unnamed: 0,four
Colorado,7


In [43]:
#Plain [] does not accept a (rows, columns) pair the way .loc does, so this lookup fails
try:
    data[['Colorado'], ['four']] #doesn't work; data.loc[['Colorado'], ['four']] is the working form
except Exception as e:
    #Print the error message instead of letting the expected failure stop the notebook
    print(e)

(['Colorado'], ['four'])


In [44]:
#Same positional row slice as before
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [45]:
data[1:3] #row selection by slice is a convenience: rows at positions 1 and 2

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11


In [46]:
#A boolean Series inside [] keeps only the rows where the condition is True
data[data["three"] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [47]:
#Work on a copy of data; reset_index moves the state labels out of the index into a regular column
data2 = data.copy()
data3 = data2.reset_index()

In [48]:
#Give the former index column a meaningful name; the result is only displayed, not assigned back to data3
data3.rename(columns={'index': 'state'})

Unnamed: 0,state,one,two,three,four
0,Ohio,0,1,2,3
1,Colorado,4,5,6,7
2,Utah,8,9,10,11
3,New York,12,13,14,15


In [49]:
#Indexing with a boolean DataFrame: a comparison on the whole frame yields a boolean DataFrame of the same shape
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [50]:
#We can use this boolean DataFrame as a mask to assign a value to each location where it is True.
#Note that this modifies `data` itself, so all later cells see the zeroed values.
data[data < 5] = 0

In [51]:
#Display the frame after the masked assignment
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [52]:
#pd.DataFrame has special .loc and .iloc indexing attributes
#.loc : selection by axis labels
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [53]:
#A single row label returns the row as a Series
data.loc['Colorado']

one      0
two      5
three    6
four     7
Name: Colorado, dtype: int64

In [54]:
#With two arguments .loc reads them as (row label, column label); 'New York' is a row label,
#not a column, so this lookup raises KeyError
try:
    data.loc['Colorado', 'New York']
except KeyError as e:
    #Log the expected error instead of aborting the notebook
    logging.critical(e)

CRITICAL:root:'New York'


In [55]:
#A single row label selects the entire row
data.loc["Colorado"] #1) entire row selected, returned as a Series

one      0
two      5
three    6
four     7
Name: Colorado, dtype: int64

In [56]:
#A row label plus a column label selects one cell
data.loc['Colorado', 'two'] #2) single element selected, returned as a scalar

5

In [57]:
data.loc['Colorado', ['two']] #column given as a list: one-element Series indexed by the column label

two    5
Name: Colorado, dtype: int64

In [58]:
#pattern is df.loc[List[str], str] -> pd.Series indexed by the row label
data.loc[['Colorado'], 'two']

Colorado    5
Name: two, dtype: int64

In [59]:
#pattern is df.loc[List[str], List[str]] -> pd.DataFrame
data.loc[['Colorado'], ['two']]

Unnamed: 0,two
Colorado,5


In [60]:
#pattern is df.loc[str, List[str]] -> pd.Series indexed by the column labels
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int64

In [61]:
data.loc[['Ohio', 'New York']] #select two rows by label, all columns

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
New York,12,13,14,15


In [62]:
data.loc[['Ohio', 'New York'], 'one'] #select one column for two row labels -> Series

Ohio         0
New York    12
Name: one, dtype: int64

In [63]:
#iloc: select by integer position, regardless of the dtype of the index
data.iloc[2] #row at position 2 (Utah) as a Series

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

In [64]:
#A list of positions returns those rows in the order given
data.iloc[[2, 1]]

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
Colorado,0,5,6,7


In [65]:
#One row position plus a list of column positions -> Series
data.iloc[2, [3, 0, 1]]

four    11
one      8
two      9
Name: Utah, dtype: int64

In [66]:
#Lists of positions on both axes -> DataFrame
data.iloc[[1, 2], [3, 0, 1]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [67]:
#Same selection with the row positions swapped; the output rows follow the requested order
data.iloc[[2, 1], [3, 0, 1]]

Unnamed: 0,four,one,two
Utah,11,8,9
Colorado,7,0,5


In [68]:
#loc and iloc work with slices; a label slice with .loc includes its end label ('Utah' is in the result)
data.loc[:'Utah', 'two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int64

In [69]:
#Label slice up to and including 'Colorado', one column
data.loc[:'Colorado', 'three']

Ohio        0
Colorado    6
Name: three, dtype: int64

In [70]:
#First keep the first three columns by position, then filter the rows with a boolean mask
data.iloc[:, :3][data.three > 5]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


In [71]:
#Same pattern with the first two columns and a stricter condition
data.iloc[:, :2][data.three > 6]

Unnamed: 0,one,two
Utah,8,9
New York,12,13


In [72]:
#A boolean Series can be passed to .loc to filter rows.
#.iloc does not accept a boolean Series like this one (it only takes a plain boolean array), hence .loc here.
data.loc[data.three >= 2]

Unnamed: 0,one,two,three,four
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [73]:
#Another boolean Series mask passed to .loc: keep the rows where column 'one' is positive
data.loc[data.one > 0]

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [74]:
#Indexing options, 1) df[column]: first build a larger frame to practise on.
#100 rows x 10 columns of uniform random floats; no seed is set, so the values differ on every run.
new_df = pd.DataFrame(np.random.rand(100, 10))
new_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.015234,0.614680,0.094119,0.724865,0.252346,0.681320,0.785545,0.676769,0.596373,0.016578
1,0.335416,0.515941,0.186942,0.955771,0.664718,0.112681,0.196694,0.317194,0.348991,0.300739
2,0.218733,0.125230,0.764109,0.504150,0.929893,0.831557,0.879268,0.358746,0.643843,0.074766
3,0.200675,0.078089,0.570345,0.316122,0.275168,0.316959,0.108317,0.860687,0.370462,0.408807
4,0.123360,0.291419,0.439563,0.006244,0.113255,0.056500,0.382728,0.726872,0.471981,0.067688
...,...,...,...,...,...,...,...,...,...,...
95,0.934398,0.718260,0.306785,0.210732,0.764171,0.026424,0.206805,0.887879,0.650235,0.667357
96,0.689794,0.168788,0.216082,0.100747,0.646257,0.946168,0.322470,0.154298,0.898130,0.727247
97,0.603558,0.836171,0.974774,0.688296,0.056819,0.054225,0.908624,0.329671,0.622767,0.303808
98,0.664285,0.359747,0.638044,0.598179,0.728643,0.541945,0.375180,0.421264,0.714171,0.550802


In [75]:
#Sample labels for the indexing examples.
#states_index holds the row labels; several state names repeat (e.g. 'Iowa', 'Arkansas'),
#so the index is not unique. columnNames holds the ten column labels.
states_index = pd.Index(['New Jersey', 'New Mexico', 'Mississippi', 'Hawaii', 'Wyoming', 'Iowa',
       'Virginia', 'Arkansas', 'Alaska', 'Florida', 'Idaho', 'Iowa',
       'Colorado', 'Oregon', 'Wyoming', 'Washington', 'Oregon', 'Connecticut',
       'Nevada', 'Massachusetts', 'Arkansas', 'Arkansas', 'Vermont',
       'New Hampshire', 'Colorado', 'Iowa', 'Maine', 'Utah', 'Kentucky',
       'Alabama', 'Missouri', 'Connecticut', 'Ohio', 'Tennessee', 'Oregon',
       'Colorado', 'Kansas', 'Alabama', 'New Hampshire', 'Arkansas', 'Idaho',
       'Montana', 'South Carolina', 'Nebraska', 'Rhode Island', 'Tennessee',
       'Alaska', 'Louisiana', 'Utah', 'Oregon', 'Idaho', 'Virginia',
       'Louisiana', 'Colorado', 'Oregon', 'Massachusetts', 'North Carolina',
       'South Dakota', 'Alabama', 'Rhode Island', 'Oregon', 'Utah', 'Alaska',
       'Alabama', 'New Mexico', 'New York', 'Illinois', 'Idaho', 'Connecticut',
       'Nebraska', 'Nebraska', 'Arkansas', 'North Carolina', 'West Virginia',
       'Arkansas', 'Texas', 'Maine', 'Maryland', 'Virginia', 'Nebraska',
       'North Carolina', 'Wyoming', 'Iowa', 'Connecticut', 'Arizona',
       'Arizona', 'Tennessee', 'Alabama', 'Nebraska', 'Oklahoma', 'Arizona',
       'Rhode Island', 'Florida', 'Montana', 'Vermont', 'Minnesota',
       'Kentucky', 'Arizona', 'Rhode Island', 'Texas'])
columnNames = pd.Index(['get', 'only', 'chair', 'physical', 'late', 'TV', 'body', 'meeting',
       'present', 'left'])

In [76]:
#Rebuild the random frame (fresh values), this time labelling the rows with the state names
new_df = pd.DataFrame(np.random.rand(1000).reshape((100, 10)),
                     index=states_index)
new_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
New Jersey,0.089962,0.253493,0.560304,0.998534,0.351585,0.531570,0.722990,0.167673,0.040699,0.552183
New Mexico,0.890930,0.438101,0.305986,0.905596,0.679948,0.601332,0.428960,0.251604,0.632793,0.711301
Mississippi,0.431548,0.563409,0.579122,0.385428,0.450794,0.857920,0.652428,0.295824,0.312362,0.666154
Hawaii,0.457274,0.453423,0.527059,0.408607,0.162598,0.132851,0.997274,0.105518,0.056441,0.778627
Wyoming,0.730493,0.891542,0.732029,0.672168,0.366740,0.098111,0.421794,0.389237,0.356621,0.611456
...,...,...,...,...,...,...,...,...,...,...
Minnesota,0.675578,0.907319,0.152148,0.577032,0.129874,0.201901,0.297193,0.977492,0.360527,0.965855
Kentucky,0.511570,0.926413,0.773386,0.226022,0.205440,0.589056,0.006523,0.915199,0.477279,0.190170
Arizona,0.260930,0.213917,0.595806,0.310035,0.776801,0.655326,0.634777,0.389874,0.634821,0.407561
Rhode Island,0.567139,0.979703,0.074262,0.629006,0.599250,0.289242,0.942496,0.344496,0.802657,0.708872


In [77]:
#Rebuild the random frame again (fresh values) with both row labels and column labels

new_df = pd.DataFrame(np.random.rand(1000).reshape((100, 10)),
                     index=states_index,
                     columns=columnNames)
new_df

Unnamed: 0,get,only,chair,physical,late,TV,body,meeting,present,left
New Jersey,0.456886,0.256962,0.902719,0.588741,0.955217,0.154738,0.986542,0.093513,0.246107,0.192697
New Mexico,0.921158,0.568884,0.443063,0.428384,0.082564,0.874698,0.637541,0.524961,0.421273,0.469845
Mississippi,0.167868,0.283899,0.840879,0.681860,0.136192,0.181402,0.466912,0.365477,0.616962,0.394703
Hawaii,0.352177,0.170223,0.084919,0.613104,0.407250,0.966276,0.770265,0.400759,0.975953,0.902678
Wyoming,0.805518,0.010542,0.187317,0.071196,0.604164,0.140993,0.750671,0.281944,0.402476,0.697836
...,...,...,...,...,...,...,...,...,...,...
Minnesota,0.580978,0.199826,0.405198,0.019082,0.309861,0.695654,0.003341,0.358154,0.128086,0.597393
Kentucky,0.907892,0.835342,0.217811,0.991489,0.051192,0.626988,0.033630,0.378730,0.335739,0.131851
Arizona,0.429982,0.373175,0.612522,0.263103,0.461718,0.330648,0.690474,0.270138,0.723792,0.966059
Rhode Island,0.101214,0.673243,0.045420,0.902907,0.171952,0.177155,0.905293,0.073142,0.729249,0.440137


In [78]:
#Rebuild the frame once more (fresh random values); this is the new_df used by the cells below.
#Then display it with the columns in sorted order; the reindexed frame is only displayed, not assigned.
new_df = pd.DataFrame(np.random.rand(1000).reshape((100, 10)),
                     index=states_index,
                     columns=columnNames)
new_df.reindex(sorted(columnNames), axis=1) #sorted() puts the uppercase 'TV' before the lowercase names

Unnamed: 0,TV,body,chair,get,late,left,meeting,only,physical,present
New Jersey,0.891934,0.857982,0.695081,0.657985,0.976793,0.356825,0.561040,0.982943,0.212636,0.855713
New Mexico,0.004083,0.415173,0.742531,0.537690,0.685327,0.409860,0.498082,0.859793,0.386159,0.197923
Mississippi,0.094022,0.437172,0.724468,0.493269,0.619632,0.733510,0.634802,0.240760,0.736518,0.526760
Hawaii,0.665872,0.010202,0.369409,0.165724,0.403677,0.770041,0.694037,0.892010,0.912080,0.591518
Wyoming,0.683302,0.576557,0.044858,0.234081,0.084439,0.270449,0.114205,0.863667,0.170244,0.374354
...,...,...,...,...,...,...,...,...,...,...
Minnesota,0.874646,0.767367,0.536507,0.413960,0.317540,0.385611,0.154874,0.598265,0.061944,0.667251
Kentucky,0.737247,0.966442,0.579385,0.810519,0.313185,0.417742,0.633720,0.625720,0.692504,0.049450
Arizona,0.269428,0.710996,0.145045,0.918427,0.624265,0.925460,0.527895,0.212745,0.036845,0.729117
Rhode Island,0.025934,0.697926,0.740020,0.616330,0.943233,0.078846,0.385350,0.145199,0.684049,0.344951


In [79]:
#Same column ordering as the previous cell, obtained from the Index object itself
new_index = columnNames.sort_values() #we can call the sort_values method of the pd.Index object
new_df.reindex(new_index, axis=1)

Unnamed: 0,TV,body,chair,get,late,left,meeting,only,physical,present
New Jersey,0.891934,0.857982,0.695081,0.657985,0.976793,0.356825,0.561040,0.982943,0.212636,0.855713
New Mexico,0.004083,0.415173,0.742531,0.537690,0.685327,0.409860,0.498082,0.859793,0.386159,0.197923
Mississippi,0.094022,0.437172,0.724468,0.493269,0.619632,0.733510,0.634802,0.240760,0.736518,0.526760
Hawaii,0.665872,0.010202,0.369409,0.165724,0.403677,0.770041,0.694037,0.892010,0.912080,0.591518
Wyoming,0.683302,0.576557,0.044858,0.234081,0.084439,0.270449,0.114205,0.863667,0.170244,0.374354
...,...,...,...,...,...,...,...,...,...,...
Minnesota,0.874646,0.767367,0.536507,0.413960,0.317540,0.385611,0.154874,0.598265,0.061944,0.667251
Kentucky,0.737247,0.966442,0.579385,0.810519,0.313185,0.417742,0.633720,0.625720,0.692504,0.049450
Arizona,0.269428,0.710996,0.145045,0.918427,0.624265,0.925460,0.527895,0.212745,0.036845,0.729117
Rhode Island,0.025934,0.697926,0.740020,0.616330,0.943233,0.078846,0.385350,0.145199,0.684049,0.344951


In [80]:
#'Montana' appears more than once in the index, so a single label returns several rows (a DataFrame)
new_df.loc['Montana']

Unnamed: 0,get,only,chair,physical,late,TV,body,meeting,present,left
Montana,0.250366,0.318648,0.861528,0.18879,0.712968,0.25154,0.355893,0.57956,0.105577,0.584092
Montana,0.682804,0.903597,0.890277,0.906357,0.745009,0.124435,0.229015,0.077205,0.697047,0.061864


In [81]:
#Inspect the row labels; note the repeated state names
new_df.index

Index(['New Jersey', 'New Mexico', 'Mississippi', 'Hawaii', 'Wyoming', 'Iowa',
       'Virginia', 'Arkansas', 'Alaska', 'Florida', 'Idaho', 'Iowa',
       'Colorado', 'Oregon', 'Wyoming', 'Washington', 'Oregon', 'Connecticut',
       'Nevada', 'Massachusetts', 'Arkansas', 'Arkansas', 'Vermont',
       'New Hampshire', 'Colorado', 'Iowa', 'Maine', 'Utah', 'Kentucky',
       'Alabama', 'Missouri', 'Connecticut', 'Ohio', 'Tennessee', 'Oregon',
       'Colorado', 'Kansas', 'Alabama', 'New Hampshire', 'Arkansas', 'Idaho',
       'Montana', 'South Carolina', 'Nebraska', 'Rhode Island', 'Tennessee',
       'Alaska', 'Louisiana', 'Utah', 'Oregon', 'Idaho', 'Virginia',
       'Louisiana', 'Colorado', 'Oregon', 'Massachusetts', 'North Carolina',
       'South Dakota', 'Alabama', 'Rhode Island', 'Oregon', 'Utah', 'Alaska',
       'Alabama', 'New Mexico', 'New York', 'Illinois', 'Idaho', 'Connecticut',
       'Nebraska', 'Nebraska', 'Arkansas', 'North Carolina', 'West Virginia',
       'Arkansas',

In [82]:
#df[column]: select a single column by label, returned as a Series
new_df['get']

New Jersey      0.657985
New Mexico      0.537690
Mississippi     0.493269
Hawaii          0.165724
Wyoming         0.234081
                  ...   
Minnesota       0.413960
Kentucky        0.810519
Arizona         0.918427
Rhode Island    0.616330
Texas           0.804623
Name: get, Length: 100, dtype: float64

In [83]:
#pattern is df.loc[List[str], str] -> Series with one entry per matching row
new_df.loc[['Montana'], 'chair']

Montana    0.861528
Montana    0.890277
Name: chair, dtype: float64

In [84]:
new_df.loc[['Montana'], ['body']] #using a list for the columns creates a DataFrame rather than a Series

Unnamed: 0,body
Montana,0.355893
Montana,0.229015


In [85]:
new_df.loc['Montana', ['physical']] #a plain string row label instead of a single-item list still gives a
#DataFrame here, because 'Montana' matches more than one row

Unnamed: 0,physical
Montana,0.18879
Montana,0.906357


In [86]:
#df.loc[rows]: select a single row or a subset of rows from the DataFrame by label
new_df.loc['Montana']

Unnamed: 0,get,only,chair,physical,late,TV,body,meeting,present,left
Montana,0.250366,0.318648,0.861528,0.18879,0.712968,0.25154,0.355893,0.57956,0.105577,0.584092
Montana,0.682804,0.903597,0.890277,0.906357,0.745009,0.124435,0.229015,0.077205,0.697047,0.061864


In [87]:
#df.loc[rows] with a list of labels: every row matching any of the labels is returned
new_df.loc[['Montana', 'Virginia']]

Unnamed: 0,get,only,chair,physical,late,TV,body,meeting,present,left
Montana,0.250366,0.318648,0.861528,0.18879,0.712968,0.25154,0.355893,0.57956,0.105577,0.584092
Montana,0.682804,0.903597,0.890277,0.906357,0.745009,0.124435,0.229015,0.077205,0.697047,0.061864
Virginia,0.196828,0.494246,0.28443,0.610643,0.148994,0.590165,0.719782,0.685751,0.103747,0.467951
Virginia,0.472613,0.891464,0.640033,0.350344,0.177277,0.819383,0.544063,0.924594,0.118953,0.165664
Virginia,0.791953,0.765742,0.980966,0.553481,0.767909,0.03152,0.492839,0.332497,0.331979,0.681911


In [88]:
#df.loc[rows, cols] : select both rows and columns by label.
#'Rhode Island' is a repeated label, so even two plain strings return a Series rather than a scalar.
new_df.loc['Rhode Island', 'only']

Rhode Island    0.895712
Rhode Island    0.906608
Rhode Island    0.857795
Rhode Island    0.145199
Name: only, dtype: float64

In [89]:
#df.loc[rows, cols] with a list of columns -> DataFrame
new_df.loc['Rhode Island', ['only', 'present']] 

Unnamed: 0,only,present
Rhode Island,0.895712,0.538717
Rhode Island,0.906608,0.934779
Rhode Island,0.857795,0.528081
Rhode Island,0.145199,0.344951


In [90]:
#df.loc[rows, cols] with lists on both sides
new_df.loc[['Rhode Island'], ['only', 'present']] #same result as above

Unnamed: 0,only,present
Rhode Island,0.895712,0.538717
Rhode Island,0.906608,0.934779
Rhode Island,0.857795,0.528081
Rhode Island,0.145199,0.344951


In [91]:
#df.loc[rows, cols]: two row labels, one column in a list -> single-column DataFrame
new_df.loc[['Rhode Island', 'Virginia'], ['only']]

Unnamed: 0,only
Rhode Island,0.895712
Rhode Island,0.906608
Rhode Island,0.857795
Rhode Island,0.145199
Virginia,0.494246
Virginia,0.891464
Virginia,0.765742


In [92]:
#df.loc[rows, cols]: same rows, column given as a plain string -> Series
new_df.loc[['Rhode Island', 'Virginia'], 'only']

Rhode Island    0.895712
Rhode Island    0.906608
Rhode Island    0.857795
Rhode Island    0.145199
Virginia        0.494246
Virginia        0.891464
Virginia        0.765742
Name: only, dtype: float64

In [93]:
#df.loc[rows, cols]: several row labels and several columns -> DataFrame
new_df.loc[['Rhode Island', 'Virginia'], ['physical', 'present']]

Unnamed: 0,physical,present
Rhode Island,0.335112,0.538717
Rhode Island,0.968646,0.934779
Rhode Island,0.881872,0.528081
Rhode Island,0.684049,0.344951
Virginia,0.610643,0.103747
Virginia,0.350344,0.118953
Virginia,0.553481,0.331979


In [94]:
#df.iloc[rows]: a single integer position returns that row as a Series
new_df.iloc[0]

get         0.657985
only        0.982943
chair       0.695081
physical    0.212636
late        0.976793
TV          0.891934
body        0.857982
meeting     0.561040
present     0.855713
left        0.356825
Name: New Jersey, dtype: float64

In [95]:
#Positional slice: every row from position 2 onwards
new_df.iloc[2:]

Unnamed: 0,get,only,chair,physical,late,TV,body,meeting,present,left
Mississippi,0.493269,0.240760,0.724468,0.736518,0.619632,0.094022,0.437172,0.634802,0.526760,0.733510
Hawaii,0.165724,0.892010,0.369409,0.912080,0.403677,0.665872,0.010202,0.694037,0.591518,0.770041
Wyoming,0.234081,0.863667,0.044858,0.170244,0.084439,0.683302,0.576557,0.114205,0.374354,0.270449
Iowa,0.980859,0.850547,0.183592,0.100376,0.283960,0.071804,0.725892,0.929272,0.181433,0.886533
Virginia,0.196828,0.494246,0.284430,0.610643,0.148994,0.590165,0.719782,0.685751,0.103747,0.467951
...,...,...,...,...,...,...,...,...,...,...
Minnesota,0.413960,0.598265,0.536507,0.061944,0.317540,0.874646,0.767367,0.154874,0.667251,0.385611
Kentucky,0.810519,0.625720,0.579385,0.692504,0.313185,0.737247,0.966442,0.633720,0.049450,0.417742
Arizona,0.918427,0.212745,0.145045,0.036845,0.624265,0.269428,0.710996,0.527895,0.729117,0.925460
Rhode Island,0.616330,0.145199,0.740020,0.684049,0.943233,0.025934,0.697926,0.385350,0.344951,0.078846


In [96]:
#df.iloc[rows]: positional slice of rows 25 through 28 (the end position 29 is excluded)
new_df.iloc[25:29]

Unnamed: 0,get,only,chair,physical,late,TV,body,meeting,present,left
Iowa,0.142096,0.133166,0.335752,0.492786,0.314069,0.340812,0.862644,0.98984,0.449121,0.442443
Maine,0.082513,0.808971,0.807678,0.878597,0.544952,0.658202,0.986306,0.120496,0.678237,0.186978
Utah,0.139938,0.915125,0.871024,0.821246,0.3034,0.571518,0.446783,0.81939,0.896324,0.407153
Kentucky,0.653018,0.50902,0.08412,0.189963,0.625314,0.498573,0.363762,0.718204,0.266353,0.555758


In [97]:
#df.iloc[:, cols]: all rows, the column at position 1 ('only')
new_df.iloc[:, 1]

New Jersey      0.982943
New Mexico      0.859793
Mississippi     0.240760
Hawaii          0.892010
Wyoming         0.863667
                  ...   
Minnesota       0.598265
Kentucky        0.625720
Arizona         0.212745
Rhode Island    0.145199
Texas           0.130347
Name: only, Length: 100, dtype: float64

In [98]:
#new_df.iloc[:, columns]: negative positions count from the end, so -1 is the last column ('left')
new_df.iloc[:, -1]

New Jersey      0.356825
New Mexico      0.409860
Mississippi     0.733510
Hawaii          0.770041
Wyoming         0.270449
                  ...   
Minnesota       0.385611
Kentucky        0.417742
Arizona         0.925460
Rhode Island    0.078846
Texas           0.149296
Name: left, Length: 100, dtype: float64

In [99]:
#new_df.iloc[:, columns]: a slice of the last two columns -> DataFrame
new_df.iloc[:, -2:]

Unnamed: 0,present,left
New Jersey,0.855713,0.356825
New Mexico,0.197923,0.409860
Mississippi,0.526760,0.733510
Hawaii,0.591518,0.770041
Wyoming,0.374354,0.270449
...,...,...
Minnesota,0.667251,0.385611
Kentucky,0.049450,0.417742
Arizona,0.729117,0.925460
Rhode Island,0.344951,0.078846


In [100]:
#df.iloc[rows, cols] with two integers
new_df.iloc[2, 4] #one item as a scalar

0.619631802803102

In [101]:
#pattern is df.iloc[int, List[int]] -> Series
new_df.iloc[2, [4]] #the result is indexed by the column label when the second item is a list

late    0.619632
Name: Mississippi, dtype: float64

In [102]:
#pattern is df.iloc[List[int], int] -> Series
new_df.iloc[[2], 4] #same value, but now the result is indexed by the row label

Mississippi    0.619632
Name: late, dtype: float64

In [103]:
#pattern is df.iloc[List[int], List[int]] -> DataFrame
new_df.iloc[[2], [4]] #same value again, this time as a 1x1 DataFrame

Unnamed: 0,late
Mississippi,0.619632


In [104]:
#pattern is df.iloc[List[int], List[int]] with two row positions
new_df.iloc[[2, 3], [4]] #DataFrame with two rows and one column

Unnamed: 0,late
Mississippi,0.619632
Hawaii,0.403677


In [105]:
new_df.iloc[2, [4, 5]] #two items from one row, returned as a Series

late    0.619632
TV      0.094022
Name: Mississippi, dtype: float64

In [106]:
new_df.iloc[[2], [4, 5]] #DataFrame with one row and two columns

Unnamed: 0,late,TV
Mississippi,0.619632,0.094022


In [107]:
new_df.iloc[[2, 1], [4, 5]] #DataFrame with two rows (in the requested order) and two columns

Unnamed: 0,late,TV
Mississippi,0.619632,0.094022
New Mexico,0.685327,0.004083


In [108]:
#new_df.iloc[:, columns]
new_df.iloc[:, [3]] #giving the column position as a list makes a single-column DataFrame

Unnamed: 0,physical
New Jersey,0.212636
New Mexico,0.386159
Mississippi,0.736518
Hawaii,0.912080
Wyoming,0.170244
...,...
Minnesota,0.061944
Kentucky,0.692504
Arizona,0.036845
Rhode Island,0.684049


In [109]:
#new_df.iloc[:, columns]
#pattern is new_df.iloc[..., List[int]]: the Ellipsis stands in for the row selector
new_df.iloc[..., [3]] #same result as the previous cell

Unnamed: 0,physical
New Jersey,0.212636
New Mexico,0.386159
Mississippi,0.736518
Hawaii,0.912080
Wyoming,0.170244
...,...
Minnesota,0.061944
Kentucky,0.692504
Arizona,0.036845
Rhode Island,0.684049


In [110]:
#df.at[row, col]: select a single scalar value by row and column label
new_df.at['New Jersey', 'physical']

0.21263566961045233

In [111]:
#df.at takes individual labels only, not lists
new_df.at['New Mexico', 'physical'] #with a repeated row label, every matching row comes back as a Series

New Mexico    0.386159
New Mexico    0.529167
Name: physical, dtype: float64

In [112]:
#df.iat[row, column]: select a single scalar value by integer position
new_df.iat[0, 0]

0.6579846743407672

In [113]:
#df.iat[row, column]: negative positions count from the end (last row, last column)
new_df.iat[-1, -1]

0.14929648528180683

In [114]:
#df.reindex method : select either rows or columns by their labels.
#First list the available column labels.
print('columns', new_df.columns)

columns Index(['get', 'only', 'chair', 'physical', 'late', 'TV', 'body', 'meeting',
       'present', 'left'],
      dtype='object')


In [115]:
#Keep only three of the columns by reindexing along axis=1
new_df.reindex(['get', 'only', 'chair'], axis=1)

Unnamed: 0,get,only,chair
New Jersey,0.657985,0.982943,0.695081
New Mexico,0.537690,0.859793,0.742531
Mississippi,0.493269,0.240760,0.724468
Hawaii,0.165724,0.892010,0.369409
Wyoming,0.234081,0.863667,0.044858
...,...,...,...
Minnesota,0.413960,0.598265,0.536507
Kentucky,0.810519,0.625720,0.579385
Arizona,0.918427,0.212745,0.145045
Rhode Island,0.616330,0.145199,0.740020


In [116]:
#Repeat of the previous cell: same column subset, same result
new_df.reindex(['get', 'only', 'chair'], axis=1)

Unnamed: 0,get,only,chair
New Jersey,0.657985,0.982943,0.695081
New Mexico,0.537690,0.859793,0.742531
Mississippi,0.493269,0.240760,0.724468
Hawaii,0.165724,0.892010,0.369409
Wyoming,0.234081,0.863667,0.044858
...,...,...,...
Minnesota,0.413960,0.598265,0.536507
Kentucky,0.810519,0.625720,0.579385
Arizona,0.918427,0.212745,0.145045
Rhode Island,0.616330,0.145199,0.740020


In [117]:
#Same selection with the labels stored in a variable and the axis spelled by name
first_three_columns = ['get', 'only', 'chair']
new_df.reindex(first_three_columns, axis='columns') #use reindex to effectively drop the other columns

Unnamed: 0,get,only,chair
New Jersey,0.657985,0.982943,0.695081
New Mexico,0.537690,0.859793,0.742531
Mississippi,0.493269,0.240760,0.724468
Hawaii,0.165724,0.892010,0.369409
Wyoming,0.234081,0.863667,0.044858
...,...,...,...
Minnesota,0.413960,0.598265,0.536507
Kentucky,0.810519,0.625720,0.579385
Arizona,0.918427,0.212745,0.145045
Rhode Island,0.616330,0.145199,0.740020


In [118]:
#axis=1 is equivalent to axis='columns'
first_three_columns = ['get', 'only', 'chair']
new_df.reindex(first_three_columns, axis=1)

Unnamed: 0,get,only,chair
New Jersey,0.657985,0.982943,0.695081
New Mexico,0.537690,0.859793,0.742531
Mississippi,0.493269,0.240760,0.724468
Hawaii,0.165724,0.892010,0.369409
Wyoming,0.234081,0.863667,0.044858
...,...,...,...
Minnesota,0.413960,0.598265,0.536507
Kentucky,0.810519,0.625720,0.579385
Arizona,0.918427,0.212745,0.145045
Rhode Island,0.616330,0.145199,0.740020


In [119]:
#if we have an index with duplicate labels, then we cannot reindex:
#pandas raises a ValueError, which is caught and logged so the notebook keeps running
first_three_states = ['Alabama', 'Alaska', 'Arizona']
try:
    new_df.reindex(first_three_states, axis=0)
except ValueError as err:
    logging.critical(err)

  new_df.reindex(first_three_states, axis=0)
CRITICAL:root:cannot reindex on an axis with duplicate labels


In [120]:
#integer indexing pitfalls and issues: with an integer index pandas cannot tell whether an
#integer key is meant as a label or as a position, so it treats integer keys as labels to be safe.
#Here we only build a float Series with the default integer index; the failing lookup is in the next cell.
ser = pd.Series(np.arange(3, dtype='float64'))
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [121]:
#ser has an integer index, so the key -1 is looked up as a label rather than a position;
#there is no label -1, so pandas raises a KeyError, which is logged here
try:
    ser[-1]
except KeyError as e:
    logging.critical(e)

CRITICAL:root:-1


In [122]:
ser #the failed lookup left ser unchanged; pandas doesn't want to guess if it is label or integer based indexing

0    0.0
1    1.0
2    2.0
dtype: float64

In [123]:
#a non-integer index does not have this ambiguity: the labels are strings,
#so an integer key can never be mistaken for a label
ser2 = pd.Series(np.arange(3, dtype='float64'), index=list('abc'))
ser2

a    0.0
b    1.0
c    2.0
dtype: float64

In [124]:
#this won't have this problem: ser2 has string labels, so the integer key -1 is
#taken as a position and the last element is returned
#NOTE(review): recent pandas releases deprecate this positional fallback for Series[int];
#confirm against the installed version. ser2.iloc[-1] is the unambiguous spelling.
ser2[-1]

2.0

In [125]:
#use loc (labels) or iloc (positions) to index an axis with integers;
#iloc[-1] always means the last position, so it works even on the integer-indexed ser
ser.iloc[-1]

2.0

In [126]:
#slicing with integers is always position-oriented: ser[:2] returns the first two elements
ser[:2]
#AVOID AMBIGUITY: Use .loc and .iloc

0    0.0
1    1.0
dtype: float64

In [127]:
#Chained indexing pitfalls:
#1a) View the DataFrame before any modification
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [128]:
#Note: data['one'] accesses the column directly by its label.
data['one']

Ohio         0
Colorado     0
Utah         8
New York    12
Name: one, dtype: int64

In [129]:
#1b) View column 'one' in its original form; .loc[:, 'one'] means all rows, column 'one'.
data.loc[:, 'one']

Ohio         0
Colorado     0
Utah         8
New York    12
Name: one, dtype: int64

In [130]:
#1c) Assign the value 1 to every item in column "one" (all rows, that single column).
data.loc[:, 'one'] = 1

In [131]:
#1d) View modified data
data #every item in column 'one' is now 1; the other columns are unchanged

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,1,5,6,7
Utah,1,9,10,11
New York,1,13,14,15


In [132]:
#2) Here we set each value in row number 2 (counting from 0; labelled "Utah") to 5.
#2a) View that row before the assignment
data.iloc[2]

one       1
two       9
three    10
four     11
Name: Utah, dtype: int64

In [133]:
#2) Set the value of each item in row 2 (counting from 0; labelled "Utah") to 5.
#2b) Assign the scalar 5 to every column of that row, in place
data.iloc[2] = 5

In [134]:
#2c) Just as a note, we can get the exact same view of row 2 by using its label: data.loc['Utah'].
data.loc['Utah']

one      5
two      5
three    5
four     5
Name: Utah, dtype: int64

In [135]:
#3a) To illustrate what is happening, let's view the boolean array we use for indexing
data['four'] > 5 #take the column labelled "four" and return a boolean Series: True if the row value is
#greater than five, False if it is not (a value of exactly 5, as in Utah, gives False).


Ohio        False
Colorado     True
Utah        False
New York     True
Name: four, dtype: bool

In [136]:
#3b) To further illustrate what is happening, let's get the rows where the value of column 'four' is > 5
data.loc[data['four'] > 5] #notice we get no row for Ohio (four == 0) or Utah (four == 5): neither value is greater than 5

Unnamed: 0,one,two,three,four
Colorado,1,5,6,7
New York,1,13,14,15


In [137]:
#3c) Finally, run the assignment: every row whose value in column 'four' is greater than 5
#has ALL of its columns set to 3 (the boolean mask selects whole rows, not just column 'four').
data.loc[data['four'] > 5] = 3

In [138]:
#3d) View the result: every column of the Colorado and New York rows is now 3
data

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,3,3,3,3
Utah,5,5,5,5
New York,3,3,3,3


In [139]:
#some more examples: index row 'Ohio' using .loc syntax, requesting columns 'four' and 'three' in that order
data.loc['Ohio', ['four', 'three']]

four     0
three    0
Name: Ohio, dtype: int64

In [140]:
#same row with the columns requested in the opposite order; the result follows the requested order
data.loc['Ohio', ['three', 'four']]

three    0
four     0
Name: Ohio, dtype: int64

In [141]:
#a list of row labels combined with a single column label returns a Series for column 'one'
data.loc[['Ohio', 'New York'], 'one']

Ohio        1
New York    3
Name: one, dtype: int64

In [142]:
#let's use iloc: row position 0 with column positions 3 and 2 gives the same values as
#data.loc['Ohio', ['four', 'three']]
data.iloc[0, [3, 2]]

four     0
three    0
Name: Ohio, dtype: int64

In [143]:
#common gotcha: chaining selections when assigning
#data.loc[data.three == 5] is evaluated first and its result is then indexed with ['three'],
#so the assignment lands on that intermediate result; pandas warns that a value is being set on a copy.
#This cell demonstrates the pitfall on purpose.
data.loc[data.three == 5]['three'] = 6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[data.three == 5]['three'] = 6


In [144]:
#we are inadvertently modifying a temporary value
#the data here is not modified: Utah's value in column three is still 5
data

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,3,3,3,3
Utah,5,5,5,5
New York,3,3,3,3


In [145]:
#Solution: use a single .loc operation, so that no temporary object is created and then modified.
#1a) Display the whole row where column three equals 5
data.loc[data.three == 5]

Unnamed: 0,one,two,three,four
Utah,5,5,5,5


In [146]:
#1b) Display the value of row 2 (indexed from zero) in column three,
#selected in one .loc call with a row mask and a column label
data.loc[data.three == 5, 'three']

Utah    5
Name: three, dtype: int64

In [147]:
#1c) Set the value of row 2 (indexed from zero) in column three to 6
data.loc[data.three == 5, 'three'] = 6

In [148]:
data #notice we modified the value: Utah's entry in column three is now 6

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,3,3,3,3
Utah,5,5,6,5
New York,3,3,3,3


In [151]:
#1d) Let's modify the value to 7 using .iloc syntax
#a) Copy the DataFrame as it stands now, and work on the copy (new_data)
new_data = data.copy()

In [154]:
#b) Get column position 2 (named 'three') for all rows
new_data.iloc[:, 2]

Ohio        0
Colorado    3
Utah        6
New York    3
Name: three, dtype: int64

In [155]:
#get the single value at row position 2, column position 2 (Utah, 'three')
new_data.iloc[2, 2]

6

In [156]:
#set the single value at row position 2, column position 2 to 7
new_data.iloc[2, 2] = 7

In [157]:
#view the modified copy: Utah's entry in column three is now 7
new_data

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,3,3,3,3
Utah,5,5,7,5
New York,3,3,3,3


In [158]:
#Let's try it with lists: list indexers on both axes return a one-cell DataFrame
new_data.iloc[[2], [2]]

Unnamed: 0,three
Utah,7


In [159]:
#let's try it with only the row indexer as a list
new_data.iloc[[2], 2] #returns a Series: its index is the row label ('Utah'), its name is the column label ('three')

Utah    7
Name: three, dtype: int64

In [161]:
#let's try it with only the column indexer as a list: returns a Series whose
#index is the column label ('three') and whose name is the row label ('Utah')
new_data.iloc[2, [2]]

three    7
Name: Utah, dtype: int64