# Selecting Rows and Columns

- execute command
- using critical thinking, discussion with others, and any research tools, figure out what the command does and how it works
- make notes in notebook for later reference

### 1. Read Data

In [1]:
import pandas as pd

In [2]:
# Load the CSV; index_col=0 makes the file's first column the row index
# (it shows up as `name` in the displayed frame).
df = pd.read_csv('../data02/yob_1880.csv', index_col=0)

### 2. Inspect Data

In [3]:
# A bare expression on the last line of a cell is displayed; for a long frame
# the display is truncated (the `...` row in the output).
df

Unnamed: 0_level_0,gender,frequency
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Mary,F,7065
Anna,F,2604
Emma,F,2003
Elizabeth,F,1939
Minnie,F,1746
...,...,...
Woodie,M,5
Worthy,M,5
Wright,M,5
York,M,5


In [4]:
df.shape  # (number of rows, number of columns); the index is not counted as a column

(2000, 2)

In [5]:
df.head(10)  # returns the first 10 rows of the DataFrame

Unnamed: 0_level_0,gender,frequency
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Mary,F,7065
Anna,F,2604
Emma,F,2003
Elizabeth,F,1939
Minnie,F,1746
Margaret,F,1578
Ida,F,1472
Alice,F,1414
Bertha,F,1320
Sarah,F,1288


In [6]:
df.tail(3)  # returns the last 3 rows of the DataFrame

Unnamed: 0_level_0,gender,frequency
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Wright,M,5
York,M,5
Zachariah,M,5


### 3. Select Columns

In [7]:
df["gender"]  # selects the single column 'gender' by label; the result keeps the `name` index

name
Mary         F
Anna         F
Emma         F
Elizabeth    F
Minnie       F
            ..
Woodie       M
Worthy       M
Wright       M
York         M
Zachariah    M
Name: gender, Length: 2000, dtype: object

In [8]:
col_names = ['gender', 'frequency']
# A list of labels selects those COLUMNS (2 here); head(3) then keeps only the top 3 rows.
df[col_names].head(3)

Unnamed: 0_level_0,gender,frequency
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Mary,F,7065
Anna,F,2604
Emma,F,2003


### 4. Select Rows

In [9]:
# Label-based row lookup: returns every row whose index label is 'Anna'.
# Index labels may repeat, so more than one row can come back (2 rows here).
df.loc['Anna']

Unnamed: 0_level_0,gender,frequency
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Anna,F,2604
Anna,M,12


In [10]:
# A list of labels returns all rows matching each of the 3 index labels,
# grouped in the order the labels are listed.
df.loc[['Anna', 'John', 'Mary']]

Unnamed: 0_level_0,gender,frequency
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Anna,F,2604
Anna,M,12
John,F,46
John,M,9655
Mary,F,7065
Mary,M,27


In [11]:
# Make 'gender' the index. set_index() returns a NEW DataFrame and leaves `df`
# unchanged, because its `inplace` parameter defaults to False; the result is
# stored under the name `gender`.
# Note: the previous index (`name`) is not kept in the result -- only the
# `frequency` column remains (see the output).
gender = df.set_index('gender') 
gender

Unnamed: 0_level_0,frequency
gender,Unnamed: 1_level_1
F,7065
F,2604
F,2003
F,1939
F,1746
...,...
M,5
M,5
M,5
M,5


In [12]:
gender.loc['M'].head(3)  # top 3 rows with index label 'M' in the `gender` DataFrame

Unnamed: 0_level_0,frequency
gender,Unnamed: 1_level_1
M,9655
M,9532
M,5927


In [13]:
# Position-based row lookup on the original DataFrame: positions 1, 3 and 5,
# i.e. the 2nd, 4th and 6th rows (positions count from 0).
df.iloc[[1, 3, 5]]

Unnamed: 0_level_0,gender,frequency
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Anna,F,2604
Elizabeth,F,1939
Margaret,F,1578


In [14]:
df.iloc[1:5]  # rows at positions 1, 2, 3 and 4 (positions count from 0)
# IMPORTANT! The stop position (5) is NOT included.

Unnamed: 0_level_0,gender,frequency
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Anna,F,2604
Emma,F,2003
Elizabeth,F,1939
Minnie,F,1746


In [15]:
# Every 2nd row by position, starting with the first row (position 0)
# and continuing through the whole DataFrame.
df.iloc[::2]

Unnamed: 0_level_0,gender,frequency
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Mary,F,7065
Emma,F,2003
Minnie,F,1746
Ida,F,1472
Bertha,F,1320
...,...,...
Unknown,M,5
Wes,M,5
Wood,M,5
Worthy,M,5


### 5. Select Both Rows and Columns

In [16]:
# First argument: row index labels to keep; second argument: column labels to keep.
# Here: only the 'frequency' column, only for rows labelled 'Anna', 'John' or 'Mary'.
df.loc[['Anna', 'John', 'Mary'], ['frequency']]

Unnamed: 0_level_0,frequency
name,Unnamed: 1_level_1
Anna,2604
Anna,12
John,46
John,9655
Mary,7065
Mary,27


In [17]:
# Rows: positions 1 through 4, i.e. the 2nd to 5th rows (stop position 5 is not included).
# Columns: '0:2' takes positions 0 and 1 (stop position 2 is not included) -- here, both columns.
# The index is not a column, so it has no column position.
df.iloc[1:5, 0:2]


Unnamed: 0_level_0,gender,frequency
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Anna,F,2604
Emma,F,2003
Elizabeth,F,1939
Minnie,F,1746


## Solve One-Liners

Using the same dataset from the challenge above solve the following tasks with pandas one-liners:

    1. Read in data:
        df = pd.read_csv('../data/yob_1880.csv', index_col=0)

    2. display the 'frequency' column

    3. display the 'gender' and 'frequency' columns

    4. display the data for row(s) containing William

    5. display the data for all rows with William, Paul, and Anne

    6. display the 'frequency' column for William, Paul, and Anne

    7. display the first three names and both columns

    8. display both columns for every second name


In [18]:
# 1. Read in data (index_col=0: the first CSV column becomes the row index):
# NOTE(review): the task text above uses '../data/'; this cell reads from '../data02/' -- confirm which path is intended.
df = pd.read_csv('../data02/yob_1880.csv', index_col=0)

In [19]:
# 2. display the 'frequency' column
# (double brackets = a list with one label, so the result is shown as a one-column table)
df[['frequency']]

Unnamed: 0_level_0,frequency
name,Unnamed: 1_level_1
Mary,7065
Anna,2604
Emma,2003
Elizabeth,1939
Minnie,1746
...,...
Woodie,5
Worthy,5
Wright,5
York,5


In [20]:
# 3. display the 'gender' and 'frequency' columns
# (a list of column labels selects several columns at once)
df[['gender', 'frequency']]

Unnamed: 0_level_0,gender,frequency
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Mary,F,7065
Anna,F,2604
Emma,F,2003
Elizabeth,F,1939
Minnie,F,1746
...,...,...
Woodie,M,5
Worthy,M,5
Wright,M,5
York,M,5


In [21]:
# 4. display the data for row(s) containing William
# (.loc looks rows up by index label; 'William' appears twice in the index)
df.loc['William']

Unnamed: 0_level_0,gender,frequency
name,Unnamed: 1_level_1,Unnamed: 2_level_1
William,F,30
William,M,9532


In [22]:
# 5. display the data for all rows with William, Paul, and Anne
# (a list of index labels returns every matching row for each label)
df.loc[['William', 'Paul', 'Anne']]

Unnamed: 0_level_0,gender,frequency
name,Unnamed: 1_level_1,Unnamed: 2_level_1
William,F,30
William,M,9532
Paul,M,301
Anne,F,136


In [23]:
# 6. display the 'frequency' column for William, Paul, and Anne
# (.loc[row_labels, column_labels] filters rows and columns in one step)
df.loc[['William', 'Paul', 'Anne'], ['frequency']]

Unnamed: 0_level_0,frequency
name,Unnamed: 1_level_1
William,30
William,9532
Paul,301
Anne,136


In [24]:
# 7. display the first three names and both columns
# (head(3) keeps all columns and returns the first 3 rows)
df.head(3)

Unnamed: 0_level_0,gender,frequency
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Mary,F,7065
Anna,F,2604
Emma,F,2003


In [25]:
# 8. display both columns for every second name
# (slice with step 2 over row positions, starting at position 0)
df.iloc[::2]

Unnamed: 0_level_0,gender,frequency
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Mary,F,7065
Emma,F,2003
Minnie,F,1746
Ida,F,1472
Bertha,F,1320
...,...,...
Unknown,M,5
Wes,M,5
Wood,M,5
Worthy,M,5


In [26]:
df.head(15)  # scratch check (not part of the tasks): look at the first 15 rows

Unnamed: 0_level_0,gender,frequency
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Mary,F,7065
Anna,F,2604
Emma,F,2003
Elizabeth,F,1939
Minnie,F,1746
Margaret,F,1578
Ida,F,1472
Alice,F,1414
Bertha,F,1320
Sarah,F,1288
