# Retrieving single columns and rows

In [1]:
import numpy as np
import pandas as pd

# Load the weather data and keep only the first five rows for the examples below.
df = pd.read_csv('weather.csv').head()
df

Unnamed: 0,MONTH,DAY,TIME,TEMP,PRESSURE
0,1,1,1,6.8,10207
1,1,1,2,5.8,10214
2,1,1,3,5.7,10220
3,1,1,4,6.0,10225
4,1,1,5,4.5,10230


In [2]:
# A DataFrame can be thought of as a collection of Series that share one index;
# each column of the DataFrame is one Series.

# Selecting a single column with square brackets returns a Series.
df['TEMP']

# A column label that does not exist raises a KeyError.
# Because a Series is returned, every Series method is available on the result.

0    6.8
1    5.8
2    5.7
3    6.0
4    4.5
Name: TEMP, dtype: float64

In [5]:
# Confirm that selecting a single column yields a Series object.
type(df['TEMP'])

pandas.core.series.Series

In [6]:
# A second set of brackets on the resulting Series retrieves a single row.
print(df['TEMP'][1])  # Selecting an individual cell: first extract the Series, then
# use another pair of square brackets to select the value by its index label.

# Very important: with this double-bracket syntax for a single cell, the column
# goes in the first set of brackets and the row label in the second.
# Example: df[column_label][row_label]
#
# Note that this order is different when using accessors such as loc.
#
# Another caveat: passing a list of columns in the brackets returns a DataFrame,
# not a Series. In that case df[[column_label1, column_label2]][row_label] will not
# work, because square brackets on a DataFrame expect a column label, not a row label.

# Performing operations on a Series object
print(df['TEMP'].sum())

5.8
28.8


In [8]:
# Don't use attribute access for columns -- it's bad practice. Prefer df['TEMP'].
# It does work, but only when the column name is a valid Python attribute name.
# It causes issues when column names contain spaces, special characters or dots.

# Note that this is the ordinary syntax for retrieving an attribute from a Python object.
# For convenience, a DataFrame exposes an attribute for each of its columns.
df.TEMP

# Another drawback of this syntax is that it cannot be used to add a new column, whereas
# the bracket syntax can. Example: df['newTemp'] = df['TEMP'] - 2 works, but df.newTemp
# on the left-hand side of an assignment does not create a column.

0    6.8
1    5.8
2    5.7
3    6.0
4    4.5
Name: TEMP, dtype: float64

In [9]:
# Transpose: swap rows and columns
dft = df.T  # T is an attribute; it returns a new DataFrame that is the transpose of the original.
dft
# In the transposed DataFrame the row labels are strings and the column labels are integers.

Unnamed: 0,0,1,2,3,4
MONTH,1.0,1.0,1.0,1.0,1.0
DAY,1.0,1.0,1.0,1.0,1.0
TIME,1.0,2.0,3.0,4.0,5.0
TEMP,6.8,5.8,5.7,6.0,4.5
PRESSURE,10207.0,10214.0,10220.0,10225.0,10230.0


In [19]:
# Inspect the column labels of the transposed frame.
dft.columns

RangeIndex(start=0, stop=5, step=1)

In [18]:
# Again, retrieve a column by its label -- here the column labels are ints --
# and then retrieve the row by its label.
dft[2]['TIME']

3.0

In [None]:
# Since the column label is numeric, we cannot use attribute syntax such as dft.2
# `dft.2` is not even valid Python: an attribute name cannot start with a digit.

In [7]:
# Important: rows can be retrieved by position as well as by label; columns selected
# with [] can only be retrieved by label.
#
# In the example below the first 2 is a column *label* (dft has integer column labels),
# not a position. The second 2 is a row *position*: the rows of dft are labelled with
# strings, so the position is requested explicitly through iloc rather than relying on
# the deprecated fallback in which an integer key on a Series is treated as positional.
dft[2].iloc[2]

3.0

In [8]:
# Build a one-column frame whose index deliberately repeats the label 4.
t = pd.DataFrame([[name] for name in ('John', 'Bob', 'Anne')], index=[4, 3, 4])
t
# When no column labels are supplied they default to integers (0, 1, 2, ...).

Unnamed: 0,0
4,John
3,Bob
4,Anne


In [9]:
# Two rows carry the label 4.
# The index has an integer dtype, so 4 is treated as a label, not a position.
t[0][4]  # t[0] extracts a Series because 0 is the column label.

4    John
4    Anne
Name: 0, dtype: object

# Indexing with lists and slices

In [10]:
df

Unnamed: 0,MONTH,DAY,TIME,TEMP,PRESSURE
0,1,1,1,6.8,10207
1,1,1,2,5.8,10214
2,1,1,3,5.7,10220
3,1,1,4,6.0,10225
4,1,1,5,4.5,10230


In [20]:
# Retrieve several columns at once, in any order.
# Note the double square brackets: the inner pair is a list of column labels.
df[['PRESSURE', 'TIME', 'TEMP']]  # The list may name the columns in any order.

Unnamed: 0,PRESSURE,TIME,TEMP
0,10207,1,6.8
1,10214,2,5.8
2,10220,3,5.7
3,10225,4,6.0
4,10230,5,4.5


In [21]:
# Retrieve several rows in any order.
# Note the double square brackets: the inner pair is a list of row labels.
df['TIME'][[3,1,4]]  # A list of row labels applied to the TIME Series.

3    4
1    2
4    5
Name: TIME, dtype: int64

In [13]:
# Slicing behaves differently: a slice in the first brackets always selects rows.
# A list in the second brackets then retrieves multiple columns.
# Note the double square brackets on the column selection.
df[2:4][['TEMP', 'PRESSURE']]

Unnamed: 0,TEMP,PRESSURE
2,5.7,10220
3,6.0,10225


In [14]:
# Similar operation on the transposed dataset.
dft[3:][[2,3]]  # Again, the slice is applied to the rows; the list selects column labels 2 and 3.

Unnamed: 0,2,3
TEMP,5.7,6.0
PRESSURE,10220.0,10225.0


In [15]:
# Retrieve a column, then use a slice to get rows.
df['PRESSURE'][:4]  # The slice applies to the rows of the Series.

0    10207
1    10214
2    10220
3    10225
Name: PRESSURE, dtype: int64

In [16]:
# Again, rows can be reached both by position and by label.
# This selects the first two rows even though the index holds strings.
dft[:2]

Unnamed: 0,0,1,2,3,4
MONTH,1.0,1.0,1.0,1.0,1.0
DAY,1.0,1.0,1.0,1.0,1.0


In [22]:
# Slices can also use strings when the index holds strings.
dft['TIME':'PRESSURE']  # With label slices the end label is included in the result.
# This differs from slicing by position,
# where the end position is not included in the result.

Unnamed: 0,0,1,2,3,4
TIME,1.0,2.0,3.0,4.0,5.0
TEMP,6.8,5.8,5.7,6.0,4.5
PRESSURE,10207.0,10214.0,10220.0,10225.0,10230.0


# Using loc and iloc

In [26]:
# Small demo frame indexed by country name. San Marino has no Percentage value, so it
# is given as an explicit np.nan instead of relying on pandas padding a short row.
capitals = pd.DataFrame(
    [
        ["Ngerulmud", 391, 1.87],
        ["Vatican City", 826, 100],
        ["Yaren", 1100, 10.91],
        ["Funafuti", 4492, 45.48],
        ["City of San Marino", 4493, np.nan],
    ],
    index=["Palau", "Vatican City", "Nauru", "Tuvalu", "San Marino"],  # row (index) labels
    columns=['Capital', 'Population', 'Percentage'],  # column labels
)

In [27]:
capitals

Unnamed: 0,Capital,Population,Percentage
Palau,Ngerulmud,391,1.87
Vatican City,Vatican City,826,100.0
Nauru,Yaren,1100,10.91
Tuvalu,Funafuti,4492,45.48
San Marino,City of San Marino,4493,


In [20]:
# loc does label-based indexing, rows first. Recall that [] indexes by column.
# It allows selecting both row and column in one operation.
capitals.loc['Nauru', 'Population']  # The first argument is the row label, the second the column label.

1100

In [28]:
# Getting the same value without loc
capitals['Population']['Nauru']  # The [] equivalent of the above, i.e. column-first indexing.
# Note that this takes two indexing operations, whereas loc needs only one.

1100

In [29]:
# loc works with lists and slices as well.
# Both arguments support lists and slices; slices, single labels and lists can be mixed.
capitals.loc['Palau':'Nauru', ['Population', 'Percentage']]

Unnamed: 0,Population,Percentage
Palau,391,1.87
Vatican City,826,100.0
Nauru,1100,10.91


In [23]:
# Leaving out the column argument selects all columns.
capitals.loc[['San Marino', 'Vatican City']]

Unnamed: 0,Capital,Population,Percentage
San Marino,City of San Marino,4493,
Vatican City,Vatican City,826,100.0


In [24]:
# iloc works like loc, but with positions instead of labels.
capitals.iloc[[4,1], 1:]
# loc selects data using index and column labels;
# iloc selects data using index and column positions.
# Note that iloc lets us address columns by position,
# which, as seen earlier, plain [] column indexing does not allow.

Unnamed: 0,Population,Percentage
San Marino,4493,
Vatican City,826,100.0


In [25]:
# iloc makes possible something we could not do before:
# retrieving a column by its position.
capitals.iloc[:,2]

Palau             1.87
Vatican City    100.00
Nauru            10.91
Tuvalu           45.48
San Marino         NaN
Name: Percentage, dtype: float64

In [None]:
# Note that loc and iloc are attributes and not methods. 

# Boolean Filtering

In [30]:
# Indexing with a 'regular' list of labels retrieves columns.
capitals[['Capital', 'Population']]

Unnamed: 0,Capital,Population
Palau,Ngerulmud,391
Vatican City,Vatican City,826
Nauru,Yaren,1100
Tuvalu,Funafuti,4492
San Marino,City of San Marino,4493


In [27]:
# Indexing with a list of booleans, however, retrieves rows.
# Condition: the list must contain exactly as many elements as there are rows.
capitals[[True, True, False, True, False]]

Unnamed: 0,Capital,Population,Percentage
Palau,Ngerulmud,391,1.87
Vatican City,Vatican City,826,100.0
Tuvalu,Funafuti,4492,45.48


In [32]:
# Comparing a Series to a value gives a Series of booleans (not a plain list).
print(type(capitals['Percentage'] > 25))
capitals['Percentage'] > 25

<class 'pandas.core.series.Series'>


Palau           False
Vatican City     True
Nauru           False
Tuvalu           True
San Marino      False
Name: Percentage, dtype: bool

In [29]:
# That boolean Series can be used as an index
# to retrieve only the rows for which the comparison is True.
capitals[capitals['Percentage'] > 25]

Unnamed: 0,Capital,Population,Percentage
Vatican City,Vatican City,826,100.0
Tuvalu,Funafuti,4492,45.48


In [33]:
# Two test scores per person; the names form the index.
grades = pd.DataFrame(
    {
        'test_1': [6, 7, 6, 6, 5],
        'test_2': [4, 8, 7, 5, 2],
    },
    index=['Mary', 'John', 'Ann', 'Pete', 'Laura'],
)
grades

Unnamed: 0,test_1,test_2
Mary,6,4
John,7,8
Ann,6,7
Pete,6,5
Laura,5,2


In [31]:
# Columns can be compared with each other; this also returns a Series of booleans.
grades['test_2'] <= grades['test_1']

Mary      True
John     False
Ann      False
Pete      True
Laura     True
dtype: bool

In [32]:
# And again, that boolean Series can be fed to the index operator to filter rows.
grades[grades['test_2'] <= grades['test_1']]

Unnamed: 0,test_1,test_2
Mary,6,4
Pete,6,5
Laura,5,2


In [36]:
# Lists/Series of booleans can be used with loc and iloc too,
# for both row and column selection.
# The only condition is that the number of booleans matches the number of rows or columns.

print(grades.mean())  # mean() on a DataFrame gives a Series holding the mean of each column.
# That Series can be turned into a boolean Series with a comparison.

print(grades.mean() > 5.5)  # Gives a Series of booleans.

grades.loc[:, grades.mean() > 5.5]  # Select only the columns whose mean is greater than 5.5.



test_1    6.0
test_2    5.2
dtype: float64
test_1     True
test_2    False
dtype: bool


Unnamed: 0,test_1
Mary,6
John,7
Ann,6
Pete,6
Laura,5


# Assigning values

In [34]:
grades

Unnamed: 0,test_1,test_2
Mary,6,4
John,7,8
Ann,6,7
Pete,6,5
Laura,5,2


In [35]:
# New values can be assigned to the data selected with an indexer.
grades.loc[['Laura', 'John'], 'test_2'] += 1
# Values can be changed through loc and iloc like this;
# the same could also be done with [] column indexing.

In [36]:
grades

Unnamed: 0,test_1,test_2
Mary,6,4
John,7,9
Ann,6,7
Pete,6,5
Laura,5,3


In [37]:
# Updating an entire column works the same way.
grades['test_1'] += .5  # Adds 0.5 to every value in test_1.
grades

Unnamed: 0,test_1,test_2
Mary,6.5,4
John,7.5,8
Ann,6.5,7
Pete,6.5,5
Laura,5.5,2


In [38]:
# Or updating an entire row: every column of Mary's row is increased by 2.
grades.loc['Mary'] += 2
grades

Unnamed: 0,test_1,test_2
Mary,8.5,6.0
John,7.5,9.0
Ann,6.5,7.0
Pete,6.5,5.0
Laura,5.5,3.0


In [39]:
# Or setting multiple values at once: one value per column of the selected row.
grades.loc['Pete'] = [7,8]
grades

Unnamed: 0,test_1,test_2
Mary,8.5,6.0
John,7.5,9.0
Ann,6.5,7.0
Pete,7.0,8.0
Laura,5.5,3.0


In [40]:
# If necessary, first save the boolean masks to variables. Both masks are computed
# before any assignment, so writing "Fail" does not affect which cells get "Pass".
failing = grades < 6
passing = grades >= 6
grades[failing] = "Fail"
grades[passing] = "Pass"
grades
# NOTE(review): this writes strings into numeric columns; newer pandas versions may warn
# about or reject the dtype change -- confirm against the pandas version in use.

Unnamed: 0,test_1,test_2
Mary,Pass,Pass
John,Pass,Pass
Ann,Pass,Pass
Pete,Pass,Pass
Laura,Fail,Fail


In [41]:
# Recreate the original numeric grades, since the previous cell replaced every value
# with "Pass"/"Fail" strings.
grades = pd.DataFrame([[6, 4], [7, 8], [6, 7], [6, 5], [5, 2]], 
                      index = ['Mary', 'John', 'Ann', 'Pete', 'Laura'],
                      columns = ['test_1', 'test_2'])
grades

Unnamed: 0,test_1,test_2
Mary,6,4
John,7,8
Ann,6,7
Pete,6,5
Laura,5,2


In [38]:
# Creating a new column is simple: assign to a column name that does not exist yet.
grades['passed'] = grades.mean(axis=1) > 6
# Methods such as mean and sum, when called on a DataFrame, by default compute
# one result per column (axis=0) over the eligible columns.
# Passing axis=1 performs the operation per row instead.

In [39]:
grades

Unnamed: 0,test_1,test_2,passed
Mary,6.5,4,False
John,7.5,8,True
Ann,6.5,7,True
Pete,6.5,5,False
Laura,5.5,2,False


In [57]:
# But watch out -- this creates a new column (labelled with the tuple) by mistake!
grades['Ann', 'test_2'] = 8
grades

Unnamed: 0,test_1,test_2,passed,"(Ann, test_2)"
Mary,6,4,False,8
John,7,8,True,8
Ann,6,9,True,8
Pete,6,5,False,8
Laura,5,2,False,8


In [58]:
# The better way to set a single cell: use loc with the row label first, then the column label.
grades.loc['Ann', 'test_2'] = 8
grades

Unnamed: 0,test_1,test_2,passed,"(Ann, test_2)"
Mary,6,4,False,8
John,7,8,True,8
Ann,6,8,True,8
Pete,6,5,False,8
Laura,5,2,False,8


In [46]:
# This is called 'chained indexing', and assignment through it is not guaranteed to work.
# Using loc is preferred. Important!
grades['test_2']['Ann'] = 9
# Chained-indexing assignment may end up modifying a copy -- another reason to use loc and iloc.

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


# Sorting

In [48]:
# Sort by index.
# This returns a sorted copy; the original DataFrame is not changed.
capitals.sort_index()

Unnamed: 0,Capital,Population,Percentage
Nauru,Yaren,1100,10.91
Palau,Ngerulmud,391,1.87
San Marino,City of San Marino,4493,
Tuvalu,Funafuti,4492,45.48
Vatican City,Vatican City,826,100.0


In [51]:
# To change the original data, use inplace=True.
# To sort in reverse order, use ascending=False.
capitals.sort_index(inplace=True, ascending=False)
capitals

Unnamed: 0,Capital,Population,Percentage
Vatican City,Vatican City,826,100.0
Tuvalu,Funafuti,4492,45.48
San Marino,City of San Marino,4493,
Palau,Ngerulmud,391,1.87
Nauru,Yaren,1100,10.91


In [52]:
# To sort the columns by their labels instead of sorting the rows, use axis=1.
capitals.sort_index(axis=1)  # Important: columns can be sorted too, e.g. alphabetically by name.
# This helps to get similarly named columns next to each other.

Unnamed: 0,Capital,Percentage,Population
Vatican City,Vatican City,100.0,826
Tuvalu,Funafuti,45.48,4492
San Marino,City of San Marino,,4493
Palau,Ngerulmud,1.87,391
Nauru,Yaren,10.91,1100


In [54]:
# sort_values sorts by data values.
# It needs the argument 'by': the column (or columns) to sort by.
capitals.sort_values(by='Percentage')
# Calling DataFrame.sort_values without the 'by' argument raises an error.

Unnamed: 0,Capital,Population,Percentage
Palau,Ngerulmud,391,1.87
Nauru,Yaren,1100,10.91
Tuvalu,Funafuti,4492,45.48
Vatican City,Vatican City,826,100.0
San Marino,City of San Marino,4493,


In [55]:
# Sorting by multiple columns: ties in test_1 are broken by test_2.
grades.sort_values(by=['test_1', 'test_2'])

Unnamed: 0,test_1,test_2,passed,"(Ann, test_2)"
Laura,5,2,False,8
Mary,6,4,False,8
Pete,6,5,False,8
Ann,6,9,True,8
John,7,8,True,8
