# Retrieving single columns and rows

In [None]:
import numpy as np
 
# Read the dataset and keep only the first 5 rows (head() returns 5 rows by default)
import pandas as pd
df = pd.read_csv('weather.csv').head()
df

In [None]:
# Selecting a single column by its label returns that column as a Series
df['TEMP']

In [None]:
# On the result, we can use another index to retrieve a row
# (read_csv assigned the default 0, 1, 2, ... index, so the row label here is 1)
df['TEMP'][1]

In [None]:
# Don't use this -- it's bad practice. Use df['TEMP']
# Attribute access only works for column names that are valid Python
# identifiers and that do not clash with existing DataFrame attributes
df.TEMP

In [None]:
# Transpose: swap rows/columns
# The column names become the row labels and the row labels (0..4) become the column labels
dft = df.T
dft

In [None]:
# Again, retrieve a column by its label - in this case column labels are ints
# Then retrieve the row by label
dft[2]['TIME']

In [None]:
# Rows can also be retrieved by position (not true for columns)
# NOTE: df still has the default integer index, so in this example the
# label 2 and the position 2 refer to the same row
df['TEMP'][2]

# Indexing with lists and slices

In [None]:
# Show the DataFrame again as a reference for the examples below
df

In [None]:
# Retrieving multiple columns in any order
# Note the double square brackets: the inner pair is a list of column labels
df[['PRESSURE', 'TIME', 'TEMP']]

In [None]:
# Retrieving multiple rows in any order (a row may be repeated)
# Note the double square brackets
df['TIME'][[3,3,3,1,4]]

In [None]:
# A slice of integers selects rows by position; the end of the slice is excluded
df[1:3]

In [None]:
# Using a slice always selects rows
# Then we use a list to retrieve multiple columns
# Note the double square brackets
df[2:4][['TEMP', 'PRESSURE']]

In [None]:
# Similar operation on the transposed dataset:
# all rows from position 3 onwards, then the columns labelled 2 and 3
dft[3:][[2,3]]

In [None]:
# Retrieving a column, then using a slice to get rows
df['PRESSURE'][:4]

In [None]:
# Again, we can get to rows both by position and label
# This selects the first two rows even though the index is of type string
dft[:2]

In [None]:
# We can also use slices with strings
# if the index is of type string
# Unlike a positional slice, a label slice includes its end point
dft['TIME':'PRESSURE']

# Using loc and iloc

In [None]:
# Small example table used in the loc/iloc, filtering and sorting examples:
# one row per country (the index) with its capital, a population figure and
# a percentage (presumably the capital's population and its share of the
# country's population -- confirm against the data source).
# No percentage is available for San Marino. It is written as an explicit NaN
# so every row has one value per column, instead of relying on pandas to pad
# the short row.
capitals = pd.DataFrame(
    [
        ["Ngerulmud", 391, 1.87],
        ["Vatican City", 826, 100],
        ["Yaren", 1100, 10.91],
        ["Funafuti", 4492, 45.48],
        ["City of San Marino", 4493, float("nan")],
    ],
    index=["Palau", "Vatican City", "Nauru", "Tuvalu", "San Marino"],
    columns=['Capital', 'Population', 'Percentage'],
)

In [None]:
# Show the table that the loc/iloc examples below operate on
capitals

In [None]:
# loc indexes by label: the row label comes first, then the column label
# It allows selecting both row and column in one operation
capitals.loc['Nauru', 'Population']

In [None]:
# Getting the same data without loc: first the column, then the row
capitals['Population']['Nauru']

In [None]:
# loc works with lists and slices as well
# (a label slice includes its end point, so 'Nauru' is part of the result)
capitals.loc['Palau':'Nauru', ['Population', 'Percentage']]

In [None]:
# Leaving out the column selects all columns
capitals.loc[['San Marino', 'Vatican City']]

In [None]:
# iloc with a single integer returns the row at that position (here the third row)
capitals.iloc[2]

In [None]:
# iloc works similar to loc, but with positions instead of labels
# Rows at positions 4 and 1, columns from position 1 onwards
capitals.iloc[[4,1], 1:]

In [None]:
# With iloc we can do something we couldn't do before:
# Retrieve a column by position
capitals.iloc[:,2]

# Boolean filtering

In [None]:
# Indexing with a 'regular' list (of labels) retrieves columns
capitals[['Capital', 'Population']]

In [None]:
# But indexing with a boolean list retrieves rows: the rows marked True are kept
# Condition: the list has to contain as many elements as there are rows
capitals[[True, True, False, True, False]]

In [None]:
# Comparing a Series to a value gives a Series of booleans, one per row
capitals['Percentage'] > 25

In [None]:
# So we can use these booleans as an index
# to retrieve only the rows for which the comparison is True
# (San Marino has no Percentage value; a missing value compares as False, so that row is left out)
capitals[capitals['Percentage'] > 25]

In [None]:
# Example table of test scores: one row per student (the index),
# one column per test. Built column by column from a dict.
grades = pd.DataFrame(
    {
        'test_1': [6, 7, 6, 6, 5],
        'test_2': [4, 8, 7, 5, 2],
    },
    index=['Mary', 'John', 'Ann', 'Pete', 'Laura'],
)
grades

In [None]:
# We can compare across columns: the comparison is done row by row
# and gives one boolean per student
grades['test_2'] <= grades['test_1']

In [None]:
# And again, use that as input for the index operator
# to keep only the students whose second score is not higher than their first
grades[grades['test_2'] <= grades['test_1']]

In [None]:
# And you can use lists of booleans with loc and iloc too
# grades.mean() gives the mean of each column, so this keeps all rows
# and only the columns whose mean is above 5.5
grades.loc[:, grades.mean() > 5.5]

# Assigning values

In [None]:
# Show the table before it is modified
grades

In [None]:
# We can assign new values to the data we have selected with an index
# Here: add 1 to the test_2 score of Laura and John, in place
grades.loc[['Laura', 'John'], 'test_2'] += 1

In [None]:
# Show the result of the update
grades

In [None]:
# Updating an entire column
# (adding .5 turns the integer scores of test_1 into floats)
grades['test_1'] += .5
grades

In [None]:
# Or an entire row: every value in Mary's row is increased by 2
grades.loc['Mary'] += 2
grades

In [None]:
# Or setting multiple values at once: one value per column, in column order
grades.loc['Pete'] = [7,8]
grades

In [None]:
# If necessary, we first save the boolean masks to variables.
# That is needed here: once "Fail" has been written into the table,
# the comparison `grades >= 6` can no longer be evaluated on it.
failing = grades < 6
passing = grades >= 6
# The scores are numeric at this point. Cast them to object before writing
# strings, so the assignment does not rely on silent dtype upcasting
# (deprecated in recent pandas versions).
grades = grades.astype(object)
grades[failing] = "Fail"
grades[passing] = "Pass"
grades

In [None]:
# Start again from a fresh table of numeric scores:
# one row per student (the index), one column per test
grades = pd.DataFrame(
    {
        'test_1': [6, 7, 6, 6, 5],
        'test_2': [4, 8, 7, 5, 2],
    },
    index=['Mary', 'John', 'Ann', 'Pete', 'Laura'],
)
grades

In [None]:
# Creating a new column is simple: assign to a column label that does not exist yet
# mean(axis=1) is the mean per row, so 'passed' is True for students averaging above 6
grades['passed'] = grades.mean(axis=1) > 6

In [None]:
# Show the table with the new column
grades

In [None]:
# But watch out - this creates a column by mistake!
# Without loc, the tuple ('Ann', 'test_2') is treated as a single column label
# NOTE: nothing in this notebook drops that column again, so it stays in grades for the cells below
grades['Ann', 'test_2'] = 8
grades

In [None]:
# This is a better way to do this: loc takes the row label first, then the column label
grades.loc['Ann', 'test_2'] = 8
grades

In [None]:
# This is called 'chained indexing' and assignment is not guaranteed to work:
# grades['test_2'] may return a copy, in which case the original table is left unchanged
# Using loc is preferred
grades['test_2']['Ann'] = 9

# Sorting

In [None]:
# Show the table before sorting
capitals

In [None]:
# Sort the rows by index (the row labels)
# This returns a sorted copy of the data; capitals itself is not changed
capitals.sort_index()

In [None]:
# To change the original data: use inplace=True
# To sort in reverse, use ascending=False
capitals.sort_index(inplace=True, ascending=False)
capitals

In [None]:
# To sort the columns (by column label) instead of the rows, use axis=1
capitals.sort_index(axis=1)

In [None]:
# sort_values sorts by data values
# It needs the argument 'by': which column to sort by
# Missing values are placed at the end by default
capitals.sort_values(by='Percentage')

In [None]:
# Or you can sort by multiple columns:
# rows are ordered by test_1 first, and by test_2 where test_1 is equal
grades.sort_values(by=['test_1', 'test_2'])