# Creating Dataset
This notebook selects female poets from the [Poetry Foundation Dataset (Kaggle)](https://www.kaggle.com/tgdivy/poetry-foundation-poems) and creates visualizations and data analysis on the text.

In [1]:
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt

## Loading Dataset

In [2]:
df_raw = pd.read_csv('PoetryFoundationData.csv')

df_poets = pd.read_csv('all_poets_gender - final.csv')

print(f"shape of dataframe: {df_raw.shape}")
df_raw.head()

shape of dataframe: (13854, 5)


Unnamed: 0.1,Unnamed: 0,Title,Poem,Poet,Tags
0,0,\r\r\n Objects Used to Prop...,"\r\r\nDog bone, stapler,\r\r\ncribbage board, ...",Michelle Menting,
1,1,\r\r\n The New Church\r\r\n...,"\r\r\nThe old cupola glinted above the clouds,...",Lucia Cherciu,
2,2,\r\r\n Look for Me\r\r\n ...,\r\r\nLook for me under the hood\r\r\nof that ...,Ted Kooser,
3,3,\r\r\n Wild Life\r\r\n ...,"\r\r\nBehind the silo, the Mother Rabbit\r\r\n...",Grace Cavalieri,
4,4,\r\r\n Umbrella\r\r\n ...,\r\r\nWhen I push your button\r\r\nyou fly off...,Connie Wanek,


## Selecting List of Poets

In [3]:
all_poets = list(set(df_raw['Poet']))
print('The dataset has {number} poets'.format(number = len(all_poets)))
all_poets

The dataset has 3128 poets


['Rachel Richardson',
 'Essa Ranapiri',
 'elena minor',
 'Betty Adcock',
 'Mary Biddinger',
 'Laura Moriarty',
 'John Haines',
 'Nuala Ní Dhomhnaill',
 'Ruby Robinson',
 'Anne Caston',
 'Andrew Shields',
 'Sharon Bryan',
 'W. S. Merwin',
 'May Sarton',
 'Liz Howard',
 'János Pilinszky',
 'Tyler Ford',
 'Liz Countryman',
 'Peter Gizzi',
 'Alec Brock Stevenson',
 'Christopher Todd Matthews',
 'Miri Ben-Simhon',
 'Dahlia Ravikovitch',
 'Khadijah Queen',
 'John Skelton',
 'David Black',
 'Jackson Mac Low',
 'Jane Huffman',
 'David Shook',
 'Sadiqa de Meijer',
 'Ernst Jandl',
 'Freya Manfred',
 'Charles Lamb',
 'Nathaniel Mackey',
 'Rusty Morrison',
 'Henry Newbolt',
 'Derek Walcott',
 'Peter Cooley',
 'Nancy Simpson',
 'Henry Dumas',
 'Jim Harrison',
 'Colin Cheney',
 'Aaron Shurin',
 'Alfred, Lord Tennyson',
 'Maudelle Driskell',
 'Chris Forhan',
 'Adonis',
 'Sheryl Luna',
 'Henry Gould',
 'Jennifer Tonge',
 'Wilfred Owen',
 'Rabindranath Tagore',
 'Jason Labbe',
 'Charlotte Smith',
 'Eav

In [4]:
with open('all_poets.txt', 'w') as f:
    for poet in all_poets:
        f.write("%s\n" % poet)

### To be continued

In [5]:
df_female_poets = df_poets[(df_poets['gender']=='F') | (df_poets['gender']=='NB') | 
                           (df_poets['gender']=='T')]
df_nonbinary_poets = df_poets[(df_poets['gender']=='NB') | 
                           (df_poets['gender']=='T')]

print(df_female_poets.shape, df_nonbinary_poets.shape)

df_female_poets

(1115, 3) (6, 3)


Unnamed: 0,poet,gender,Unnamed: 2
2,Allison Adelle Hedge Coke,F,
5,Twyla Hansen,F,
13,Lucy Larcom,F,
14,Georgia Douglas Johnson,F,
15,Alli Warren,F,
...,...,...,...
3115,Kathleen Graber,F,
3118,Elizabeth Bentley,F,
3121,Sheena Raza Faisal,F,
3123,Kate Potts,F,


## Merge Tagged Poets Dataset to Poetry Foundation

In [6]:
df_poetry = df_raw.merge(df_female_poets, left_on='Poet', right_on='poet', how='left')
df_poetry.drop(columns=['poet', 'Unnamed: 0', 'Unnamed: 2'], inplace=True)

df_poetry = df_poetry[((df_poetry['gender']=='F') | (df_poetry['gender']=='NB') | 
                           (df_poetry['gender']=='T')) & (~ df_poetry['Tags'].isna())]

df_poetry.head(10)

Unnamed: 0,Title,Poem,Poet,Tags,gender
6,\r\r\n Invisible Fish\r\r\n...,\r\r\nInvisible fish swim this ghost ocean now...,Joy Harjo,"Living,Time & Brevity,Relationships,Family & A...",F
7,\r\r\n Don’t Bother the Ear...,\r\r\nDon’t bother the earth spirit who lives ...,Joy Harjo,"Religion,The Spiritual,Mythology & Folklore,Fa...",F
9,"\r\r\n [""Hour in which I co...","\r\r\nHour in which I consider hydrangea, a sa...",Simone White,"Living,Parenthood,The Body,The Mind,Nature,Tre...",F
18,\r\r\n West of Myself\r\r\n...,\r\r\nWhy are you still seventeen\r\r\nand dri...,Debora Greger,Coming of Age,F
19,\r\r\n Yes\r\r\n ...,"\r\r\n\r\r\n\r\r\n\r\r\nYes, your childhood no...",Debora Greger,"Coming of Age,Youth",F
21,\r\r\n History\r\r\n ...,"\r\r\nOf course wars, of course lice, of cours...",Barbara Ras,"History & Politics,War & Conflict",F
22,\r\r\n What It Was Like\r\r...,"\r\r\nIf they ask what it was like, say it was...",Barbara Ras,Money & Economics,F
23,\r\r\n All\r\r\n ...,"\r\r\nThe prisoner can’t go any longer, but he...",Barbara Ras,"Life Choices,Faith & Doubt",F
24,\r\r\n Sleeping with Butler...,\r\r\n\r\r\n\r\r\n\r\r\nAfter Octavio Paz\r\r\...,Eugene Gloria,"Language & Linguistics,Poetry & Poets",F
25,\r\r\n Hoodlum Birds\r\r\n ...,\r\r\nThe fearless blackbirds see me again\r\r...,Eugene Gloria,Animals,F


In [7]:
df_poetry.isna().sum()

Title     0
Poem      0
Poet      0
Tags      0
gender    0
dtype: int64

In [40]:
poetry_tags = df_poetry['Tags'].map(lambda x: x.split(',')).explode().value_counts()
poetry_tags = poetry_tags.where(lambda x: x >20).dropna()

poetry_tags = poetry_tags.to_frame()

print(len(poetry_tags), poetry_tags.index)

poetry_tags.head(73)

91 91 Index(['Living', 'Relationships', 'Social Commentaries', 'Nature',
       'Arts & Sciences', 'Love', 'Activities', 'Family & Ancestors',
       'The Body', 'Time & Brevity', 'Life Choices', 'History & Politics',
       'Death', 'Religion', 'Animals', 'The Mind', 'Gender & Sexuality',
       'Sorrow & Grieving', 'Men & Women', 'Desire', 'Poetry & Poets',
       'War & Conflict', 'Travels & Journeys', 'Race & Ethnicity', 'Home Life',
       'Philosophy', 'Trees & Flowers', 'Parenthood', 'Landscapes & Pastorals',
       'Realistic & Complicated', 'Mythology & Folklore', 'Seas', ' & Streams',
       ' Rivers', 'Cities & Urban Life', 'Language & Linguistics',
       'Disappointment & Failure', 'Christianity', 'Growing Old',
       'The Spiritual', 'Romantic Love', 'Health & Illness',
       'Eating & Drinking', 'Reading & Books', 'Marriage & Companionship',
       'Heartache & Loss', 'Coming of Age', 'Jobs & Working',
       'Friends & Enemies', 'God & the Divine', 'Class', 'Music',
 

Unnamed: 0,Tags
Living,1967.0
Relationships,1247.0
Social Commentaries,1241.0
Nature,1100.0
Arts & Sciences,853.0
...,...
Crime & Punishment,59.0
Infatuation & Crushes,58.0
Sports & Outdoor Activities,53.0
Separation & Divorce,51.0


## Filtering the Categories

In [52]:
variables = poetry_tags.index

tags_keep = ['Arts & Sciences', 
                    'Love',
                    'Death',
                    'The Mind', 
                    'Gender & Sexuality',
                    'Sorrow & Grieving', 
                    'Desire', 
                    'Race & Ethnicity',
                    'Parenthood', 
                    'Landscapes & Pastorals',
                    'Realistic & Complicated', 
                    'Cities & Urban Life', 
                    'Friends & Enemies', 
                    'Humor & Satire', 'Pets',
                    'Fairy-tales & Legends', 
                    'Gay']


tags_trash = set(variables) - set(tags_keep)
df_poetry_clean = poetry_tags.drop(tags_trash, axis=0)
df_poetry_clean.rename({'Gay': 'LGBTQ'}, axis=0, inplace = True)
df_poetry_clean

Unnamed: 0,Tags
Living,1967.0
Relationships,1247.0
Social Commentaries,1241.0
Nature,1100.0
Arts & Sciences,853.0
Love,703.0
Death,407.0
The Mind,321.0
Gender & Sexuality,309.0
Sorrow & Grieving,297.0
