In [1]:
import string
from typing import List
import logging
import numpy as np
import pandas as pd

In [2]:
#This is how we interact with the data in a pandas object: a DataFrame or a Series. Let's look at some of the
#more commonly-used pandas methods.
#1) Reindexing: crease a new object, realigning the values to match the new index
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d','b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [3]:
#Let's rearrange the data according to a new index in alphabetical order
#if we are using some types of data aka time series we might want to fill in some empty values.

obj2 = obj.reindex(['a','b', 'c', 'd', 'e'])
print(obj2)
print(f'The value of obj2["e"] is {np.isnan(obj2["e"])}')

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64
The value of obj2["e"] is True


In [4]:
#if we are using some types of data aka time series we might want to fill in some empty values.
#Here we use ffill in this same manner. We forward fill blank values
obj3 = pd.Series(['blue', 'yellow', 'purple'], index=[0, 2, 4])
obj3

0      blue
2    yellow
4    purple
dtype: object

In [5]:
#Let's replace the index for obj3 with an np.arange index which will create some missing values
obj3.reindex(np.arange(6), method='ffill')

0      blue
1      blue
2    yellow
3    yellow
4    purple
5    purple
dtype: object

In [6]:
#Let's replace the index for obj3 with an np.arange index which will create some missing values
obj3.reindex(np.arange(6), method='bfill')

0      blue
1    yellow
2    yellow
3    purple
4    purple
5       NaN
dtype: object

In [7]:
#df.reindex for DataFrames: we can modify the rows and/or the columns. When we pass it
#a sequence, it reindexes the rows
frame = np.arange(9)
frame

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [8]:
#df.reindex for DataFrames: we can modify the rows and/or the columns. When we pass it
#a sequence, it reindexes the rows
frame = np.arange(9).reshape(3, 3) #no parenthesis inside bracket
frame

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [9]:
#df.reindex for DataFrames: we can modify the rows and/or the columns. When we pass it
#a sequence, it reindexes the rows
frame = np.arange(9).reshape((3, 3)) #same with parenthesis inside the brackets
frame

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [10]:
#df.reindex for DataFrames: we can modify the rows and/or the columns. When we pass it
#a sequence, it reindexes the rows
frame = np.arange(9).reshape(*(3, 3)) #same as the above two
frame

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [11]:
#df.reindex for DataFrames: we can modify the rows and/or the columns. When we pass it
#a sequence, it reindexes the rows
frame = pd.DataFrame(np.arange(9).reshape(3, 3)) #no parenthesis inside bracket
frame

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8


In [12]:
#df.reindex for DataFrames: we can modify the rows and/or the columns. When we pass it
#a sequence, it reindexes the rows
#here we pass it an index for the rows
frame = pd.DataFrame(np.arange(9).reshape(3, 3),
                    index=['a', 'c', 'd']) #no parenthesis inside bracket
frame

Unnamed: 0,0,1,2
a,0,1,2
c,3,4,5
d,6,7,8


In [13]:
#df.reindex for DataFrames: we can modify the rows and/or the columns. When we pass it
#a sequence, it reindexes the rows
#here we pass it an index for the columns
frame = pd.DataFrame(np.arange(9).reshape(3, 3),
                    columns=['Ohio', 'Texas', 'California']) #no parenthesis inside bracket
frame

Unnamed: 0,Ohio,Texas,California
0,0,1,2
1,3,4,5
2,6,7,8


In [14]:
#df.reindex for DataFrames: we can modify the rows and/or the columns. When we pass it
#a sequence, it reindexes the rows
#here we pass it an index for the columns
frame = pd.DataFrame(np.arange(9).reshape(3, 3),
                     index=['a', 'c', 'd'],
                    columns=['Ohio', 'Texas', 'California']) #no parenthesis inside bracket
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [15]:
#now let's reindex frame
frame2 = frame.reindex(index=['a', 'b', 'c', 'd'])
frame2
#notice how we added a row of nans in the DataFrame

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [16]:
ascii_index0 : List[str] = [letter for letter in string.ascii_lowercase[:4]]
frame2 = frame.reindex(index=ascii_index0)
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [17]:
#we can reindex columns using the columns keyword
states : List[str] = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)
#Ohio is dropped because it is not in the list of states

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [18]:
#we can also reindex by passing a sequence as a positional argument, and then giving a specific axis.
#axis=0 or axis='rows' for rows and axis=1 or axis='columns' for columns
frame.reindex(states, axis='columns')

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [19]:
frame.reindex(states, axis=1) #same as above

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [20]:
#We can reindex using the df.loc[] operator. The .loc operator cannot create new
#rows or columns.
frame.loc[['a', 'd', 'c']]

Unnamed: 0,Ohio,Texas,California
a,0,1,2
d,6,7,8
c,3,4,5


In [21]:
#We can reindex using the df.loc[] operator. The .loc operator cannot create new
#rows or columns.
frame.loc[['a', 'd', 'c'], ['California', 'Texas']]

Unnamed: 0,California,Texas
a,2,1
d,8,7
c,5,4


In [22]:
frame.loc['a']

Ohio          0
Texas         1
California    2
Name: a, dtype: int64

In [23]:
frame.loc[['a', 'd']]

Unnamed: 0,Ohio,Texas,California
a,0,1,2
d,6,7,8


In [24]:
frame.loc[['a', 'd'], ['Ohio', 'California']]

Unnamed: 0,Ohio,California
a,0,2
d,6,8


In [25]:
#dropping entries from axis
#we can use df.drop() to drop entries from an axis without needing to use reindex or .loc
obj = pd.Series(np.arange(5.))
obj

0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

In [26]:
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [27]:
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [28]:
obj.drop(['d', 'c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [29]:
#DataFrame indexes: we can delete a value from either axis
np.arange(16) #1d numpy array 16x1

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])

In [30]:
#DataFrame indexes: we can delete a value from either axis
np.arange(16).reshape((4, 4)) #two dimensional numpy array 4x4

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [31]:
#DataFrame indexes: we can delete a value from either axis
pd.DataFrame(np.arange(16).reshape((4, 4))) #Pandas DataFrame with default indices

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [32]:
#DataFrame indexes: we can delete a value from either axis
pd.DataFrame(np.arange(16).reshape((4, 4)),
            columns=['one', 'two', 'three', 'four']) #Columns

Unnamed: 0,one,two,three,four
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [33]:
#DataFrame indexes: we can delete a value from either axis
pd.DataFrame(np.arange(16).reshape((4, 4)),
            index=['Ohio', 'Colorado', 'Utah', 'New York']) #Columns

Unnamed: 0,0,1,2,3
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [34]:
#DataFrame indexes: we can delete a value from either axis
pd.DataFrame(np.arange(16).reshape((4, 4)),
             columns=['one', 'two', 'three', 'four'],
            index=['Ohio', 'Colorado', 'Utah', 'New York']) #Columns

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [35]:
#DataFrame indexes: we can delete a value from either axis
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
             columns=['one', 'two', 'three', 'four'],
            index=['Ohio', 'Colorado', 'Utah', 'New York']) #Columns

In [36]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [37]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [38]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [39]:
data[['two', 'one']]

Unnamed: 0,two,one
Ohio,1,0
Colorado,5,4
Utah,9,8
New York,13,12


In [40]:
data[['three', 'two']]

Unnamed: 0,three,two
Ohio,2,1
Colorado,6,5
Utah,10,9
New York,14,13


In [41]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [42]:
data.loc[['Colorado'], ['four']] #works with .loc

Unnamed: 0,four
Colorado,7


In [43]:
try:
    data[['Colorado'], ['four']] #doesn't work
except Exception as e:
    print(e)

(['Colorado'], ['four'])


In [44]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [45]:
data[1:3] #row selection syntax is a convenience

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11


In [46]:
data[data["three"] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [47]:
data2 = data.copy()
data3 = data2.reset_index()

In [48]:
data3.rename(columns={'index': 'state'})

Unnamed: 0,state,one,two,three,four
0,Ohio,0,1,2,3
1,Colorado,4,5,6,7
2,Utah,8,9,10,11
3,New York,12,13,14,15


In [49]:
#indexing a boolean DataFrame
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [50]:
#we can use this DataFrame to assign a value to each location with the value "True"
data[data < 5] = 0

In [51]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [52]:
#pd.DataFrame: Has special .loc and .iloc attributes
#.loc : (axis labels)
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [53]:
data.loc['Colorado']

one      0
two      5
three    6
four     7
Name: Colorado, dtype: int64

In [54]:
try:
    data.loc['Colorado', 'New York']
except KeyError as e:
    logging.critical(e)

CRITICAL:root:'New York'


In [55]:
#pass a sequence of labels to select multiple rows
data.loc["Colorado"] #1) entire row selected

one      0
two      5
three    6
four     7
Name: Colorado, dtype: int64

In [56]:
#pass a sequence of labels to select multiple rows
data.loc['Colorado', 'two'] #1) single element selected

5

In [57]:
data.loc['Colorado', ['two']] #single element selected with index

two    5
Name: Colorado, dtype: int64

In [58]:
#pattern is df.loc[List[str], str] -> pd.Series
data.loc[['Colorado'], 'two']

Colorado    5
Name: two, dtype: int64

In [59]:
#pattern is df.loc[List[str], List[str]]
data.loc[['Colorado'], ['two']]

Unnamed: 0,two
Colorado,5


In [60]:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int64

In [61]:
data.loc[['Ohio', 'New York']] #select two index items

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
New York,12,13,14,15


In [62]:
data.loc[['Ohio', 'New York'], 'one'] #select one column with two index items

Ohio         0
New York    12
Name: one, dtype: int64

In [63]:
#iloc: perform integer selections, regardless of the dtype of the index
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

In [64]:
data.iloc[[2, 1]]

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
Colorado,0,5,6,7


In [65]:
data.iloc[2, [3, 0, 1]]

four    11
one      8
two      9
Name: Utah, dtype: int64

In [66]:
data.iloc[[1, 2], [3, 0, 1]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [67]:
data.iloc[[2, 1], [3, 0, 1]]

Unnamed: 0,four,one,two
Utah,11,8,9
Colorado,7,0,5


In [68]:
#loc and iloc work with slices
data.loc[:'Utah', 'two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int64

In [69]:
data.loc[:'Colorado', 'three']

Ohio        0
Colorado    6
Name: three, dtype: int64

In [70]:
data.iloc[:, :3][data.three > 5]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


In [71]:
data.iloc[:, :2][data.three > 6]

Unnamed: 0,one,two
Utah,8,9
New York,12,13


In [72]:
#boolean arrays work only with loc not iloc
data.loc[data.three >= 2]

Unnamed: 0,one,two,three,four
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [73]:
#boolean arrays work only with loc not iloc
data.loc[data.one > 0]

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [74]:
#indexing options:
#1) df[column]
new_df = pd.DataFrame(np.random.rand(1000).reshape((100, 10)))
new_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.414377,0.555749,0.748522,0.884688,0.830449,0.901752,0.365742,0.442807,0.745304,0.961479
1,0.112700,0.933878,0.793611,0.660924,0.916428,0.340284,0.125017,0.100929,0.672417,0.814755
2,0.838191,0.095266,0.406325,0.907652,0.316747,0.030372,0.073815,0.107944,0.624129,0.996248
3,0.030878,0.589898,0.273993,0.896262,0.619521,0.599462,0.450595,0.770240,0.959331,0.291015
4,0.128559,0.415480,0.946269,0.521383,0.930401,0.012278,0.493302,0.681141,0.993159,0.818061
...,...,...,...,...,...,...,...,...,...,...
95,0.900390,0.245188,0.592092,0.215989,0.352397,0.991765,0.230160,0.443585,0.445864,0.797837
96,0.578240,0.007426,0.588635,0.470837,0.173583,0.793422,0.135719,0.549531,0.246505,0.521871
97,0.822159,0.336903,0.351078,0.463786,0.347371,0.138300,0.881411,0.474775,0.436305,0.131774
98,0.340632,0.774625,0.343953,0.950158,0.296042,0.652944,0.082281,0.534841,0.838607,0.533761


In [75]:
#indexing options sample indexes
states_index = pd.Index(['New Jersey', 'New Mexico', 'Mississippi', 'Hawaii', 'Wyoming', 'Iowa',
       'Virginia', 'Arkansas', 'Alaska', 'Florida', 'Idaho', 'Iowa',
       'Colorado', 'Oregon', 'Wyoming', 'Washington', 'Oregon', 'Connecticut',
       'Nevada', 'Massachusetts', 'Arkansas', 'Arkansas', 'Vermont',
       'New Hampshire', 'Colorado', 'Iowa', 'Maine', 'Utah', 'Kentucky',
       'Alabama', 'Missouri', 'Connecticut', 'Ohio', 'Tennessee', 'Oregon',
       'Colorado', 'Kansas', 'Alabama', 'New Hampshire', 'Arkansas', 'Idaho',
       'Montana', 'South Carolina', 'Nebraska', 'Rhode Island', 'Tennessee',
       'Alaska', 'Louisiana', 'Utah', 'Oregon', 'Idaho', 'Virginia',
       'Louisiana', 'Colorado', 'Oregon', 'Massachusetts', 'North Carolina',
       'South Dakota', 'Alabama', 'Rhode Island', 'Oregon', 'Utah', 'Alaska',
       'Alabama', 'New Mexico', 'New York', 'Illinois', 'Idaho', 'Connecticut',
       'Nebraska', 'Nebraska', 'Arkansas', 'North Carolina', 'West Virginia',
       'Arkansas', 'Texas', 'Maine', 'Maryland', 'Virginia', 'Nebraska',
       'North Carolina', 'Wyoming', 'Iowa', 'Connecticut', 'Arizona',
       'Arizona', 'Tennessee', 'Alabama', 'Nebraska', 'Oklahoma', 'Arizona',
       'Rhode Island', 'Florida', 'Montana', 'Vermont', 'Minnesota',
       'Kentucky', 'Arizona', 'Rhode Island', 'Texas'])
columnNames = pd.Index(['get', 'only', 'chair', 'physical', 'late', 'TV', 'body', 'meeting',
       'present', 'left'])

In [76]:
#indexing options:
#1) df[column]
new_df = pd.DataFrame(np.random.rand(1000).reshape((100, 10)),
                     index=states_index)
new_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
New Jersey,0.745053,0.529789,0.577715,0.133979,0.960705,0.158220,0.281996,0.964000,0.687259,0.871455
New Mexico,0.079040,0.872473,0.873590,0.909516,0.153908,0.423759,0.882077,0.336050,0.835434,0.228035
Mississippi,0.042239,0.332183,0.569570,0.899459,0.140887,0.508817,0.758790,0.470820,0.680529,0.006299
Hawaii,0.507256,0.875886,0.437920,0.344463,0.860147,0.281550,0.679554,0.241648,0.061419,0.144373
Wyoming,0.428548,0.130548,0.076651,0.089080,0.146424,0.344921,0.415549,0.674101,0.153288,0.592536
...,...,...,...,...,...,...,...,...,...,...
Minnesota,0.293082,0.788467,0.696858,0.184817,0.325530,0.226822,0.575910,0.351695,0.219618,0.841995
Kentucky,0.717538,0.836942,0.208035,0.047558,0.283102,0.197055,0.633819,0.353203,0.510266,0.111651
Arizona,0.165405,0.314710,0.362327,0.361745,0.211486,0.161434,0.415447,0.628322,0.065317,0.256080
Rhode Island,0.512781,0.446762,0.155805,0.783507,0.563850,0.846113,0.487403,0.667495,0.236400,0.109537


In [77]:
#indexing options:
#1) df[column]

new_df = pd.DataFrame(np.random.rand(1000).reshape((100, 10)),
                     index=states_index,
                     columns=columnNames)
new_df

Unnamed: 0,get,only,chair,physical,late,TV,body,meeting,present,left
New Jersey,0.339565,0.499765,0.926195,0.735082,0.974739,0.607088,0.903145,0.107577,0.579743,0.262691
New Mexico,0.758118,0.634514,0.448062,0.762712,0.588183,0.998799,0.774013,0.461178,0.108110,0.545073
Mississippi,0.679022,0.204745,0.354847,0.700278,0.466422,0.742710,0.312107,0.023848,0.564242,0.297306
Hawaii,0.636110,0.738194,0.392923,0.789221,0.105264,0.793427,0.647479,0.183210,0.094148,0.218792
Wyoming,0.897273,0.711441,0.642518,0.733755,0.129529,0.995305,0.780974,0.441416,0.698702,0.133675
...,...,...,...,...,...,...,...,...,...,...
Minnesota,0.732109,0.818258,0.932185,0.842470,0.073675,0.743180,0.126933,0.612257,0.267782,0.936029
Kentucky,0.865005,0.582337,0.464105,0.561105,0.726414,0.591874,0.587980,0.457044,0.671201,0.948019
Arizona,0.404710,0.434271,0.658362,0.114221,0.859926,0.499117,0.506861,0.897447,0.356258,0.877340
Rhode Island,0.370934,0.445707,0.292811,0.848313,0.571564,0.311003,0.206431,0.030653,0.736060,0.979797


In [78]:
#indexing options:
#1) df[column]
new_df = pd.DataFrame(np.random.rand(1000).reshape((100, 10)),
                     index=states_index,
                     columns=columnNames)
new_df.reindex(sorted(columnNames), axis=1)

Unnamed: 0,TV,body,chair,get,late,left,meeting,only,physical,present
New Jersey,0.041895,0.732314,0.991752,0.339223,0.089463,0.070682,0.392083,0.236549,0.043386,0.545052
New Mexico,0.438871,0.437597,0.511570,0.064989,0.782460,0.455824,0.458617,0.011366,0.477877,0.310156
Mississippi,0.481227,0.207586,0.382694,0.264330,0.958557,0.371627,0.847110,0.705405,0.281249,0.935774
Hawaii,0.858755,0.562991,0.855853,0.378321,0.665382,0.846548,0.552291,0.000860,0.093033,0.674770
Wyoming,0.341842,0.945804,0.671552,0.403087,0.732717,0.043786,0.991814,0.769777,0.969271,0.441960
...,...,...,...,...,...,...,...,...,...,...
Minnesota,0.974630,0.523065,0.596628,0.685359,0.187991,0.289683,0.399357,0.925600,0.451889,0.638746
Kentucky,0.624921,0.252234,0.993722,0.412090,0.185371,0.173664,0.714215,0.942167,0.882256,0.470535
Arizona,0.054764,0.348575,0.242528,0.826082,0.623486,0.720443,0.686582,0.481026,0.478267,0.217659
Rhode Island,0.464843,0.131868,0.837024,0.795805,0.810176,0.607680,0.837716,0.241980,0.237042,0.356058


In [79]:
new_index = columnNames.sort_values() #we can call the sort_values method of the pd.Index object
new_df.reindex(new_index, axis=1)

Unnamed: 0,TV,body,chair,get,late,left,meeting,only,physical,present
New Jersey,0.041895,0.732314,0.991752,0.339223,0.089463,0.070682,0.392083,0.236549,0.043386,0.545052
New Mexico,0.438871,0.437597,0.511570,0.064989,0.782460,0.455824,0.458617,0.011366,0.477877,0.310156
Mississippi,0.481227,0.207586,0.382694,0.264330,0.958557,0.371627,0.847110,0.705405,0.281249,0.935774
Hawaii,0.858755,0.562991,0.855853,0.378321,0.665382,0.846548,0.552291,0.000860,0.093033,0.674770
Wyoming,0.341842,0.945804,0.671552,0.403087,0.732717,0.043786,0.991814,0.769777,0.969271,0.441960
...,...,...,...,...,...,...,...,...,...,...
Minnesota,0.974630,0.523065,0.596628,0.685359,0.187991,0.289683,0.399357,0.925600,0.451889,0.638746
Kentucky,0.624921,0.252234,0.993722,0.412090,0.185371,0.173664,0.714215,0.942167,0.882256,0.470535
Arizona,0.054764,0.348575,0.242528,0.826082,0.623486,0.720443,0.686582,0.481026,0.478267,0.217659
Rhode Island,0.464843,0.131868,0.837024,0.795805,0.810176,0.607680,0.837716,0.241980,0.237042,0.356058


In [80]:
new_df.loc['Montana']

Unnamed: 0,get,only,chair,physical,late,TV,body,meeting,present,left
Montana,0.052584,0.858699,0.283633,0.956462,0.78192,0.680163,0.9134,0.817496,0.965756,0.959574
Montana,0.160692,0.820479,0.694909,0.143626,0.18627,0.058365,0.088567,0.660961,0.55117,0.970664


In [81]:
new_df.index

Index(['New Jersey', 'New Mexico', 'Mississippi', 'Hawaii', 'Wyoming', 'Iowa',
       'Virginia', 'Arkansas', 'Alaska', 'Florida', 'Idaho', 'Iowa',
       'Colorado', 'Oregon', 'Wyoming', 'Washington', 'Oregon', 'Connecticut',
       'Nevada', 'Massachusetts', 'Arkansas', 'Arkansas', 'Vermont',
       'New Hampshire', 'Colorado', 'Iowa', 'Maine', 'Utah', 'Kentucky',
       'Alabama', 'Missouri', 'Connecticut', 'Ohio', 'Tennessee', 'Oregon',
       'Colorado', 'Kansas', 'Alabama', 'New Hampshire', 'Arkansas', 'Idaho',
       'Montana', 'South Carolina', 'Nebraska', 'Rhode Island', 'Tennessee',
       'Alaska', 'Louisiana', 'Utah', 'Oregon', 'Idaho', 'Virginia',
       'Louisiana', 'Colorado', 'Oregon', 'Massachusetts', 'North Carolina',
       'South Dakota', 'Alabama', 'Rhode Island', 'Oregon', 'Utah', 'Alaska',
       'Alabama', 'New Mexico', 'New York', 'Illinois', 'Idaho', 'Connecticut',
       'Nebraska', 'Nebraska', 'Arkansas', 'North Carolina', 'West Virginia',
       'Arkansas',

In [82]:
#df[column]
new_df['get']

New Jersey      0.339223
New Mexico      0.064989
Mississippi     0.264330
Hawaii          0.378321
Wyoming         0.403087
                  ...   
Minnesota       0.685359
Kentucky        0.412090
Arizona         0.826082
Rhode Island    0.795805
Texas           0.454632
Name: get, Length: 100, dtype: float64

In [83]:
new_df.loc[['Montana'], 'chair']

Montana    0.283633
Montana    0.694909
Name: chair, dtype: float64

In [84]:
new_df.loc[['Montana'], ['body']] #using a sequence creates a DataFrame rather than a Series

Unnamed: 0,body
Montana,0.9134
Montana,0.088567


In [85]:
new_df.loc['Montana', ['physical']] #converting the first part into a string rather than a single-item list produces
#the same result

Unnamed: 0,physical
Montana,0.956462
Montana,0.143626


In [86]:
#df.loc[rows] select single row or subset of rows from the DataFrame by label
new_df.loc['Montana']

Unnamed: 0,get,only,chair,physical,late,TV,body,meeting,present,left
Montana,0.052584,0.858699,0.283633,0.956462,0.78192,0.680163,0.9134,0.817496,0.965756,0.959574
Montana,0.160692,0.820479,0.694909,0.143626,0.18627,0.058365,0.088567,0.660961,0.55117,0.970664


In [87]:
#df.loc[rows] select single row or subset of rows from the DataFrame by label
new_df.loc[['Montana', 'Virginia']]

Unnamed: 0,get,only,chair,physical,late,TV,body,meeting,present,left
Montana,0.052584,0.858699,0.283633,0.956462,0.78192,0.680163,0.9134,0.817496,0.965756,0.959574
Montana,0.160692,0.820479,0.694909,0.143626,0.18627,0.058365,0.088567,0.660961,0.55117,0.970664
Virginia,0.484601,0.961611,0.738785,0.31078,0.90833,0.045776,0.948943,0.188796,0.058048,0.427934
Virginia,0.127417,0.101614,0.978705,0.670355,0.266097,0.430786,0.594528,0.35917,0.161186,0.608026
Virginia,0.963268,0.335037,0.372593,0.745663,0.501691,0.230974,0.144117,0.872877,0.330392,0.255349


In [88]:
#df.loc[rows, cols] : select both rows and columns by label
new_df.loc['Rhode Island', 'only']

Rhode Island    0.203818
Rhode Island    0.287776
Rhode Island    0.218942
Rhode Island    0.241980
Name: only, dtype: float64

In [89]:
#df.loc[rows, cols]
new_df.loc['Rhode Island', ['only', 'present']] 

Unnamed: 0,only,present
Rhode Island,0.203818,0.404295
Rhode Island,0.287776,0.986014
Rhode Island,0.218942,0.087321
Rhode Island,0.24198,0.356058


In [90]:
#df.loc[rows, cols]
new_df.loc[['Rhode Island'], ['only', 'present']] #same as above

Unnamed: 0,only,present
Rhode Island,0.203818,0.404295
Rhode Island,0.287776,0.986014
Rhode Island,0.218942,0.087321
Rhode Island,0.24198,0.356058


In [91]:
#df.loc[rows, cols]
new_df.loc[['Rhode Island', 'Virginia'], ['only']]

Unnamed: 0,only
Rhode Island,0.203818
Rhode Island,0.287776
Rhode Island,0.218942
Rhode Island,0.24198
Virginia,0.961611
Virginia,0.101614
Virginia,0.335037


In [92]:
#df.loc[rows, cols]
new_df.loc[['Rhode Island', 'Virginia'], 'only']

Rhode Island    0.203818
Rhode Island    0.287776
Rhode Island    0.218942
Rhode Island    0.241980
Virginia        0.961611
Virginia        0.101614
Virginia        0.335037
Name: only, dtype: float64

In [93]:
#df.loc[rows, cols]
new_df.loc[['Rhode Island', 'Virginia'], ['physical', 'present']]

Unnamed: 0,physical,present
Rhode Island,0.080614,0.404295
Rhode Island,0.553281,0.986014
Rhode Island,0.543793,0.087321
Rhode Island,0.237042,0.356058
Virginia,0.31078,0.058048
Virginia,0.670355,0.161186
Virginia,0.745663,0.330392


In [94]:
#df.iloc[rows] 
new_df.iloc[0]

get         0.339223
only        0.236549
chair       0.991752
physical    0.043386
late        0.089463
TV          0.041895
body        0.732314
meeting     0.392083
present     0.545052
left        0.070682
Name: New Jersey, dtype: float64

In [95]:
new_df.iloc[2:]

Unnamed: 0,get,only,chair,physical,late,TV,body,meeting,present,left
Mississippi,0.264330,0.705405,0.382694,0.281249,0.958557,0.481227,0.207586,0.847110,0.935774,0.371627
Hawaii,0.378321,0.000860,0.855853,0.093033,0.665382,0.858755,0.562991,0.552291,0.674770,0.846548
Wyoming,0.403087,0.769777,0.671552,0.969271,0.732717,0.341842,0.945804,0.991814,0.441960,0.043786
Iowa,0.356546,0.479494,0.456726,0.618647,0.840504,0.382721,0.741692,0.939225,0.423777,0.039643
Virginia,0.484601,0.961611,0.738785,0.310780,0.908330,0.045776,0.948943,0.188796,0.058048,0.427934
...,...,...,...,...,...,...,...,...,...,...
Minnesota,0.685359,0.925600,0.596628,0.451889,0.187991,0.974630,0.523065,0.399357,0.638746,0.289683
Kentucky,0.412090,0.942167,0.993722,0.882256,0.185371,0.624921,0.252234,0.714215,0.470535,0.173664
Arizona,0.826082,0.481026,0.242528,0.478267,0.623486,0.054764,0.348575,0.686582,0.217659,0.720443
Rhode Island,0.795805,0.241980,0.837024,0.237042,0.810176,0.464843,0.131868,0.837716,0.356058,0.607680


In [96]:
#df.iloc[rows]
new_df.iloc[25:29]

Unnamed: 0,get,only,chair,physical,late,TV,body,meeting,present,left
Iowa,0.665775,0.731147,0.380001,0.932737,0.938612,0.644122,0.82995,0.366772,0.346656,0.592029
Maine,0.385322,0.139374,0.156711,0.528226,0.195244,0.02297,0.460666,0.667872,0.061425,0.370387
Utah,0.995141,0.206183,0.706966,0.42419,0.503415,0.91328,0.342759,0.224722,0.602184,0.002915
Kentucky,0.572812,0.574182,0.860961,0.688659,0.00555,0.937178,0.224805,0.304773,0.421769,0.61065


In [97]:
#df.iloc[:, cols]
new_df.iloc[:, 1]

New Jersey      0.236549
New Mexico      0.011366
Mississippi     0.705405
Hawaii          0.000860
Wyoming         0.769777
                  ...   
Minnesota       0.925600
Kentucky        0.942167
Arizona         0.481026
Rhode Island    0.241980
Texas           0.544460
Name: only, Length: 100, dtype: float64

In [98]:
#new_df.iloc[:, columns]
new_df.iloc[:, -1]

New Jersey      0.070682
New Mexico      0.455824
Mississippi     0.371627
Hawaii          0.846548
Wyoming         0.043786
                  ...   
Minnesota       0.289683
Kentucky        0.173664
Arizona         0.720443
Rhode Island    0.607680
Texas           0.385046
Name: left, Length: 100, dtype: float64

In [99]:
#new_df.iloc[:, columns]
new_df.iloc[:, -2:]

Unnamed: 0,present,left
New Jersey,0.545052,0.070682
New Mexico,0.310156,0.455824
Mississippi,0.935774,0.371627
Hawaii,0.674770,0.846548
Wyoming,0.441960,0.043786
...,...,...
Minnesota,0.638746,0.289683
Kentucky,0.470535,0.173664
Arizona,0.217659,0.720443
Rhode Island,0.356058,0.607680


In [100]:
#df.iloc[rows, cols]
new_df.iloc[2, 4] #one item as a scalar

0.9585565625018238

In [101]:
#pattern is df.iloc[int, List[int]]
new_df.iloc[2, [4]] #displays the index when we put the second item in a list

late    0.958557
Name: Mississippi, dtype: float64

In [102]:
#pattern is df.iloc[List[int], int]
new_df.iloc[[2], 4] #same as above 

Mississippi    0.958557
Name: late, dtype: float64

In [103]:
#pattern is df.iloc[List[int], List[int]]
new_df.iloc[[2], [4]] #same as above 

Unnamed: 0,late
Mississippi,0.958557


In [104]:
#pattern is df.iloc[List[int, int], List[int]]
new_df.iloc[[2, 3], [4]] #dataframe with two rows one column

Unnamed: 0,late
Mississippi,0.958557
Hawaii,0.665382


In [105]:
new_df.iloc[2, [4, 5]] #two items

late    0.958557
TV      0.481227
Name: Mississippi, dtype: float64

In [106]:
new_df.iloc[[2], [4, 5]] #DataFrame with two columns one row

Unnamed: 0,late,TV
Mississippi,0.958557,0.481227


In [107]:
new_df.iloc[[2, 1], [4, 5]] #DataFrame with two 

Unnamed: 0,late,TV
Mississippi,0.958557,0.481227
New Mexico,0.78246,0.438871


In [108]:
#new_df.iloc[:, columns]
new_df.iloc[:, [3]] #second item as a list makes a DataFrame

Unnamed: 0,physical
New Jersey,0.043386
New Mexico,0.477877
Mississippi,0.281249
Hawaii,0.093033
Wyoming,0.969271
...,...
Minnesota,0.451889
Kentucky,0.882256
Arizona,0.478267
Rhode Island,0.237042


In [109]:
#new_df.iloc[:, columns]
#pattern is new_df.iloc[..., List[int]]
new_df.iloc[..., [3]] #second item as a list makes a DataFrame

Unnamed: 0,physical
New Jersey,0.043386
New Mexico,0.477877
Mississippi,0.281249
Hawaii,0.093033
Wyoming,0.969271
...,...
Minnesota,0.451889
Kentucky,0.882256
Arizona,0.478267
Rhode Island,0.237042


In [110]:
#df.at[row, col] #select a single scalar value by row and column label
new_df.at['New Jersey', 'physical']

0.043386192656261535

In [111]:
#df.at doesn't work with lists, only individual values
new_df.at['New Mexico', 'physical'] #returns more than one row if we have more than onel

New Mexico    0.477877
New Mexico    0.803991
Name: physical, dtype: float64

In [112]:
#df.iat[row, column]
new_df.iat[0, 0]

0.33922340224421765

In [113]:
#df.iat[row, column]
new_df.iat[-1, -1]

0.3850457495389401

In [114]:
#df.reindex method : select either rows or columns by their labels
print('columns', new_df.columns)

columns Index(['get', 'only', 'chair', 'physical', 'late', 'TV', 'body', 'meeting',
       'present', 'left'],
      dtype='object')


In [115]:
new_df.reindex(['get', 'only', 'chair'], axis=1)

Unnamed: 0,get,only,chair
New Jersey,0.339223,0.236549,0.991752
New Mexico,0.064989,0.011366,0.511570
Mississippi,0.264330,0.705405,0.382694
Hawaii,0.378321,0.000860,0.855853
Wyoming,0.403087,0.769777,0.671552
...,...,...,...
Minnesota,0.685359,0.925600,0.596628
Kentucky,0.412090,0.942167,0.993722
Arizona,0.826082,0.481026,0.242528
Rhode Island,0.795805,0.241980,0.837024


In [116]:
new_df.reindex(['get', 'only', 'chair'], axis=1)

Unnamed: 0,get,only,chair
New Jersey,0.339223,0.236549,0.991752
New Mexico,0.064989,0.011366,0.511570
Mississippi,0.264330,0.705405,0.382694
Hawaii,0.378321,0.000860,0.855853
Wyoming,0.403087,0.769777,0.671552
...,...,...,...
Minnesota,0.685359,0.925600,0.596628
Kentucky,0.412090,0.942167,0.993722
Arizona,0.826082,0.481026,0.242528
Rhode Island,0.795805,0.241980,0.837024


In [117]:
first_three_columns = ['get', 'only', 'chair']
new_df.reindex(first_three_columns, axis='columns') #use reindex to effectively drop rows

Unnamed: 0,get,only,chair
New Jersey,0.339223,0.236549,0.991752
New Mexico,0.064989,0.011366,0.511570
Mississippi,0.264330,0.705405,0.382694
Hawaii,0.378321,0.000860,0.855853
Wyoming,0.403087,0.769777,0.671552
...,...,...,...
Minnesota,0.685359,0.925600,0.596628
Kentucky,0.412090,0.942167,0.993722
Arizona,0.826082,0.481026,0.242528
Rhode Island,0.795805,0.241980,0.837024


In [118]:
first_three_columns = ['get', 'only', 'chair']
new_df.reindex(first_three_columns, axis=1)

Unnamed: 0,get,only,chair
New Jersey,0.339223,0.236549,0.991752
New Mexico,0.064989,0.011366,0.511570
Mississippi,0.264330,0.705405,0.382694
Hawaii,0.378321,0.000860,0.855853
Wyoming,0.403087,0.769777,0.671552
...,...,...,...
Minnesota,0.685359,0.925600,0.596628
Kentucky,0.412090,0.942167,0.993722
Arizona,0.826082,0.481026,0.242528
Rhode Island,0.795805,0.241980,0.837024


In [119]:
#if we have an index with duplicate labels, than we cannot reindex
try:
    first_three_states = ['Alabama', 'Alaska', 'Arizona']
    new_df.reindex(first_three_states, axis=0)
except ValueError as e:
    logging.critical(e)

  new_df.reindex(first_three_states, axis=0)
CRITICAL:root:cannot reindex on an axis with duplicate labels


In [120]:
#integer indexing pitfalls and issues: Pandas cannot tell if an Integer Index is label-based
#or position-based. Therefore, Pandas treats integer indices as label-based to be safe
#the code below will raise an error
ser = pd.Series(np.arange(3.))
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [121]:
try:
    ser[-1]
except KeyError as e:
    logging.critical(e)

CRITICAL:root:-1


In [122]:
ser #pandas doesn't want to guess if it is label or integer based indexing

0    0.0
1    1.0
2    2.0
dtype: float64

In [123]:
#noninteger index does not have this ambiguity
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
ser2

a    0.0
b    1.0
c    2.0
dtype: float64

In [124]:
#this won't have this problem
ser2[-1]

2.0

In [125]:
#use loc or iloc to index an axis with integers
ser.iloc[-1]

2.0

In [126]:
#slicing with integers always is integer-oriented
ser[:2]
#AVOID AMBIGUITY: Use .loc and .iloc

0    0.0
1    1.0
dtype: float64

In [127]:
#Chained indexing pitfalls:
#1a) view DataFrame before modification
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [128]:
#Note: data['one'] access the column directly with a label.
data['one']

Ohio         0
Colorado     0
Utah         8
New York    12
Name: one, dtype: int64

In [129]:
#1b) View data in its original form.
data.loc[:, 'one']

Ohio         0
Colorado     0
Utah         8
New York    12
Name: one, dtype: int64

In [130]:
#1c) Assign every item in column "one" to the value of 1.
data.loc[:, 'one'] = 1

In [131]:
#1d) View modified data
data #every item in the DataFrame is set to one

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,1,5,6,7
Utah,1,9,10,11
New York,1,13,14,15


In [132]:
#2) Here we set each value in row number 2 (starting from 0, named "Utah") to 5. 
#2a) View data in its original form
data.iloc[2]

one       1
two       9
three    10
four     11
Name: Utah, dtype: int64

In [133]:
#2) Set the value of each item in row 2 (starting from 0, named "Utah") to 5.
#2b) Code that sets the value in place
data.iloc[2] = 5

In [134]:
#2b) Just as a note, we can get the exact same view of column three by using data.loc['Utah'].
data.loc['Utah']

one      5
two      5
three    5
four     5
Name: Utah, dtype: int64

In [135]:
#3a) To illustrate what is happening, let's view the boolean array we use for indexing
data['four'] > 5 #take the column labelled "four" and return a boolean series. True if the row value is
#greater than five, false if it is not. 


Ohio        False
Colorado     True
Utah        False
New York     True
Name: four, dtype: bool

In [136]:
#3b) To further illustrate what is happening, let's get the rows where the value of column 'four' is >= 5
data.loc[data['four'] > 5] #notice we do not get any results from Ohio because data.loc['Ohio', 'Four'] there is less than 5

Unnamed: 0,one,two,three,four
Colorado,1,5,6,7
New York,1,13,14,15


In [137]:
#3cß) Finally, let us use the code which assigns the value 3 to every item in column 4 that is larger than 3.
data.loc[data['four'] > 5] = 3

In [138]:
data

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,3,3,3,3
Utah,5,5,5,5
New York,3,3,3,3


In [139]:
#some more examples from indexing row 0 (Ohio) using .loc syntax
data.loc['Ohio', ['four', 'three']]

four     0
three    0
Name: Ohio, dtype: int64

In [140]:
data.loc['Ohio', ['three', 'four']]

three    0
four     0
Name: Ohio, dtype: int64

In [141]:
data.loc[['Ohio', 'New York'], 'one']

Ohio        1
New York    3
Name: one, dtype: int64

In [142]:
#let's use iloc
data.iloc[0, [3, 2]]

four     0
three    0
Name: Ohio, dtype: int64

In [143]:
#common gotcha: chaining selections when assigning
data.loc[data.three == 5]['three'] = 6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[data.three == 5]['three'] = 6


In [144]:
#we are inadvertantly modifying a temporary value
#the data here is not modified
data

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,3,3,3,3
Utah,5,5,5,5
New York,3,3,3,3


In [146]:
#Solution: Use .loc to create a single operation. Thus no temporary values that need modification.
#1a) Display the whole row
data.loc[data.three == 5]

Unnamed: 0,one,two,three,four
Utah,5,5,5,5


In [147]:
#1b) Display the value of row 5 in column three
data.loc[data.three == 5, 'three']

Utah    5
Name: three, dtype: int64

In [149]:
#1c) Set the value of row 5 in column three to 3
data.loc[data.three == 5, 'three'] = 6

In [150]:
data #notice we modified the value

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,3,3,3,3
Utah,5,5,6,5
New York,3,3,3,3


In [None]:
#1d) Let's modify the value to 7 using .iloc syntax
data.iloc[]