# Wrangling raw data


In [1]:
import os
import re

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from langdetect import DetectorFactory, detect_langs
# Relative location and file name of the raw input CSV.
raw_data_dir = '../data/raw/'
raw_data_name = 'kaggle_poem_dataset.csv'


In [2]:
# Load the raw CSV (its first column becomes the index), keep the rows
# labelled 0 and above, and normalise column names to lower snake_case.
df = (
    pd.read_csv(raw_data_dir + raw_data_name, index_col=0)
    .loc[0:]
    .rename(columns=lambda name: name.strip().lower().replace(" ", "_"))
)

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,author,title,poetry_foundation_id,content
0,Wendy Videlock,!,55489,"Dear Writers, I’m compiling the first in what ..."
1,Hailey Leithauser,0,41729,"Philosophic\nin its complex, ovoid emptiness,\..."
2,Jody Gladding,1-800-FEAR,57135,We'd like to talk with you about fear t...
3,Joseph Brodsky,1 January 1965,56736,The Wise Men will unlearn your name.\nAbove yo...
4,Ted Berrigan,3 Pages,51624,For Jack Collom\n10 Things I do Every Day\n\np...


In [4]:
# Summary statistics of poems per author, restricted to authors with more than one poem
df.author.value_counts().loc[lambda counts: counts > 1].describe()

count    2081.000000
mean        6.930802
std         7.258418
min         2.000000
25%         3.000000
50%         4.000000
75%         8.000000
max        85.000000
Name: author, dtype: float64

In [5]:
# Poem counts for every author who has more than one poem in the data set
authors = df.author.value_counts().loc[lambda counts: counts > 1]

## Cleaning


In [6]:
# Rule of thumb: keep only poems that are at least MIN_WORD_LENGTH words long.
MIN_WORD_LENGTH = 100
df['length_in_words'] = df['content'].map(lambda poem: len(poem.split()))
df = df.loc[df['length_in_words'] >= MIN_WORD_LENGTH]

In [7]:
# Next, keep only the poems written in English

In [8]:
# langdetect is non-deterministic by default; pin its seed so that a
# Restart & Run All produces the same language guesses on every run.
DetectorFactory.seed = 0
# One list of candidate languages (with probabilities) per poem.
df['languages'] = df.content.apply(detect_langs)

In [9]:
# First, get rid of the poems for which more than one language was detected
df = df.loc[df['languages'].map(len).eq(1)]

In [10]:
# Now get rid of the poems whose (single) detected language is not English
df = df.loc[df['languages'].map(lambda detected: str(detected[0])[:2]).eq('en')]

In [11]:
# Language detection is not fully reliable, so manual inspection is still necessary.
# Example: run the detector on the first 100 characters of row 2324 and show that text.
detect_langs(df.loc[2324].content[0:100]), df.loc[2324].content[0:100]

([en:0.8571392779821672, no:0.1428567563284033],
 'Clannesse who so kyndly cowþe comende\n&amp; rekken vp alle þe resounz þat ho by ri\n3\nt askez,\nFayre ')

In [12]:
# Look up the content (and index label) of the anonymous poem titled 'Patience'.
df[(df.title=='Patience') & (df.author=='Anonymous')].content

10174    Pacience is a poynt, Þa\n3\nhit displese ofte....
Name: content, dtype: object

In [13]:
# Drop the two manually inspected rows (index labels 2324 and 10174).
df = df.drop(index=[2324,10174])

In [14]:
# Preview the frame after the length and language filters.
df.head()

Unnamed: 0,author,title,poetry_foundation_id,content,length_in_words,languages
2,Jody Gladding,1-800-FEAR,57135,We'd like to talk with you about fear t...,113,[en:0.9999981229687585]
3,Joseph Brodsky,1 January 1965,56736,The Wise Men will unlearn your name.\nAbove yo...,150,[en:0.9999972500509958]
5,Joe Brainard,30 One-Liners,58251,WINTER\nMore time is spent at the window.\n\nS...,356,[en:0.9999978461197498]
7,Alice Notley,30th Birthday,48037,May I never be afraid\nespecially of myself\nb...,129,[en:0.999997299851064]
8,Charles Bukowski,a 340 dollar horse and a hundred dollar whore,49569,don’t ever get the idea I am a poet; you can s...,360,[en:0.9999965596675678]


In [15]:
# The detection results are not needed once the language filter has been applied.
df = df.drop(columns='languages')

In [16]:
# Words whose "'d" is a genuine contraction (would/had) rather than an elided "-ed".
_D_CONTRACTION_WORDS = frozenset({"i", "you", "we", "he", "she", "they"})

# A whole word immediately followed by an apostrophe (curly or straight) and a final "d".
_ELIDED_D = re.compile(r"\b(\w+)[’']d\b")

# An apostrophe + final "n" that directly follows a word character (e.g. "heav'n").
_ELIDED_N = re.compile(r"(?<=\w)[’']n\b")


def _expand_elided_d(match):
    """Return ``word'd`` for pronoun contractions and ``worded`` otherwise."""
    word = match.group(1)
    if word.lower() in _D_CONTRACTION_WORDS:
        return word + "'d"
    return word + "ed"


def fixing_apostrophes(x):
    """Normalise apostrophes and restore elided vowels in a poem's text.

    Steps, in order:

    1. Replace non-breaking spaces with plain spaces, delete the literal
       text ``&amp`` and turn ``...`` into ``.``.
    2. Re-introduce the elided "e" in forms such as ``lov'd`` -> ``loved``
       and ``heav'n`` -> ``heaven`` (curly or straight apostrophe).
       Pronoun contractions (I/you/we/he/she/they + ``'d``) are kept as
       contractions and written with a straight apostrophe.
    3. Replace the curly apostrophe in ``’ve``, ``’s`` and ``’ll`` with a
       straight one.

    Only an apostrophe that sits between a word and a word-final ``d``/``n``
    is expanded, so ordinary words such as "heed" or "seaweed" and
    word-initial apostrophes such as "'neath" are left untouched.

    Parameters
    ----------
    x : str
        Raw poem text.

    Returns
    -------
    str
        The cleaned text.
    """
    # NOTE(review): removing "&amp" leaves the trailing ";" of "&amp;" in the
    # text; kept as-is because the intended replacement is not clear from here.
    text = x.replace('\xa0', ' ').replace("&amp", "").replace('...', '.')

    # Expand elisions; the callback decides between "'d" and "ed" per word.
    text = _ELIDED_D.sub(_expand_elided_d, text)
    text = _ELIDED_N.sub("en", text)

    # Make the remaining common contractions use a straight apostrophe.
    text = text.replace("’ve", "'ve").replace("’s", "'s").replace("’ll", "'ll")

    return text

In [17]:
# Clean apostrophes / elisions in every poem body
df['content'] = df['content'].map(fixing_apostrophes)

In [18]:

def wspace_schars(review, chars_to_keep="", no_white_space=True, no_newlines=True):
    """Replace special characters with spaces and collapse repeated separators.

    Every character that is not a word character, a newline, or one of
    ``chars_to_keep`` is replaced by a single space. Underscores are also
    replaced unless ``"_"`` is listed in ``chars_to_keep``.

    Parameters
    ----------
    review : str
        Text to clean.
    chars_to_keep : str, optional
        Characters that must survive the substitution. Each one is escaped,
        so regex metacharacters such as ``]``, ``^`` or ``-`` are safe.
    no_white_space : bool, optional
        If true, collapse runs of consecutive spaces into one space.
    no_newlines : bool, optional
        If true, collapse runs of consecutive newlines into one newline.

    Returns
    -------
    str
        The cleaned text.
    """
    # Build the negated character class; escaping keeps user-supplied
    # characters literal inside the brackets.
    kept = "".join(re.escape(char) for char in chars_to_keep)
    pattern = r"[^\w\n" + kept + "]"
    if "_" not in chars_to_keep:
        # "_" counts as a word character, so it needs its own alternative.
        pattern += "|_"

    text = re.sub(pattern, ' ', review)
    if no_newlines:
        text = re.sub(r'\n+', '\n', text)  # collapse consecutive line breaks
    if no_white_space:
        text = re.sub(r' +', ' ', text)  # collapse consecutive spaces
    return text

def clean_titles(title):
    """Turn a title or author name into a lower-case, underscore-separated key.

    Characters other than ASCII letters, digits, ``.`` and whitespace are
    removed; the result is lower-cased, spaces become underscores and
    newlines are dropped.

    Parameters
    ----------
    title : str
        Raw title or author string.

    Returns
    -------
    str
        The normalised key. Leading/trailing spaces in the input become
        leading/trailing underscores.
    """
    # Raw string: "\d" and "\s" are regex classes, not Python string escapes.
    title = re.sub(r'[^A-Za-z0-9.\d\s]+', '', title)
    title = title.lower().replace(' ', '_').replace('\n', '')
    return title

In [19]:
# Normalise titles and author names into lower-case, underscore-separated keys
df = df.assign(
    title=df['title'].map(wspace_schars).map(clean_titles),
    author=df['author'].map(wspace_schars).map(clean_titles),
)

In [20]:
# Show the row for 'Sir Gawain and the Green Knight' (note the non-modern spelling in its content).
df[df.title=='sir_gawain_and_the_green_knight']

Unnamed: 0,author,title,poetry_foundation_id,content,length_in_words
12067,anonymous,sir_gawain_and_the_green_knight,43562,siþen þe sege and þe assaut watz sesed at troy...,10481


In [21]:
# The following poems are dropped because their text is not properly decoded.
is_gawain = df['title'].eq('sir_gawain_and_the_green_knight')
is_chaucer = df['author'].eq('geoffrey_chaucer')
df = df.loc[~is_gawain & ~is_chaucer]

In [22]:
# John Milton has many contributions, but most of them are 1600s editions.
df = df.loc[df['author'].ne('john_milton')]

In [23]:
# Of the anonymous poems, only Beowulf is long enough to be considered,
# and we keep only its modern English translation.
beowulf = df[df.title == 'beowulf_modern_english_translation_']
df = df[df.author != 'anonymous']
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
df = pd.concat([df, beowulf])

In [24]:
# Destination of the cleaned data set.
wrangled_data_dir = '../data/wrangled/'
wrangled_data_name = 'wrangled_data.csv'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(wrangled_data_dir, exist_ok=True)
df.to_csv(wrangled_data_dir + wrangled_data_name)

## TODO (future improvements):

- Use a more robust language-detection package, so that the manual inspection of misclassified poems is no longer needed.