In [16]:
import pandas as pd
import numpy as np
import re
import datetime

## Overview
In this notebook we examine the intersection between words that debuted in the NYT crossword in the last five years and:
1. words added to the OED in the last four years
2. the Urban Dictionary word of the day archive

In [6]:
# Read the crossword clue/answer table, then discard its 'Unnamed: 0' column.
cw = pd.read_csv('cw_full.csv')
cw = cw.drop(['Unnamed: 0'], axis=1)

# Report the table size and preview the first rows.
print(cw.shape)
cw.head()

(127956, 8)


Unnamed: 0,title,clue,date,year,answer,unique,day,answer_count
0,,"Join, as two pieces of metal by application of...",December 31,2016,SPOTWELD,False,Saturday,1
1,,Scrap,December 31,2016,SPAT,False,Saturday,11
2,,Preservers of plant specimens,December 31,2016,HERBARIA,False,Saturday,1
3,,Glaring,December 31,2016,HARSH,False,Saturday,6
4,,People on the wrong end of a landslide,December 31,2016,ALSORANS,False,Saturday,3


In [10]:
# Normalise both vocabularies to lower case so they can be compared directly.
# `.str.lower()` is vectorised and leaves missing values as NaN, whereas
# `.map(lambda x: x.lower())` raises AttributeError on the first NaN it meets.
# NOTE(review): `oed` is read from 'oed_3.csv' in a cell that appears further
# down this notebook; on a fresh kernel that cell has to run before this one.
oed['word'] = oed['word'].str.lower()
cw['answer'] = cw['answer'].str.lower()

# Keep only the rows whose 'unique' flag is True.
# NOTE(review): presumably 'unique' marks an answer's crossword debut - confirm.
cw_unique = cw[cw['unique'] == True]

# Preview the filtered rows instead of dumping the whole frame.
cw_unique.head(10)

Unnamed: 0,title,clue,date,year,answer,unique,day,answer_count
9,,Exposed part of a deal,December 31,2016,upcard,True,Saturday,1
13,,Cosplay and fanfic are parts of it,December 31,2016,nerdculture,True,Saturday,1
17,,Hypothetical miracle material,December 31,2016,unobtainium,True,Saturday,1
19,,Strips to pieces?,December 31,2016,baconbits,True,Saturday,1
40,,Publishing order,December 31,2016,printrun,True,Saturday,1
41,,An emoticon is a simple form of it,December 31,2016,asciiart,True,Saturday,1
42,,Chain of seven countries,December 31,2016,theandes,True,Saturday,1
66,,One may hold a ship in place,December 23,2016,tractorbeam,True,Friday,1
68,,Visa option,December 23,2016,cashadvance,True,Friday,1
119,,Upset,December 23,2016,knockover,True,Friday,1


In [12]:
# Spot check: does 'brexit' occur among the answers kept in cw_unique?
cw_unique['answer'].eq('brexit').any()

True

In [3]:
# Load the OED table from 'oed_3.csv'.
oed = pd.read_csv('oed_3.csv')

# print() is called as a function so the cell parses on Python 2 and Python 3
# alike (with a single argument the output is the same on both).
print(oed.shape)
oed.head()

(3470, 5)


Unnamed: 0,entry_type,year,part_of_speech,word,month
0,new word entry,2014,n.4,case,September
1,new word entry,2014,adv. and adj.,case-by-case,September
2,new word entry,2014,adj.2,caseless,September
3,new word entry,2014,n.,caseness,September
4,new word entry,2014,n.1,caseworker,September


In [4]:
# Coerce each of these OED columns to plain strings, one column at a time.
for col in ['entry_type', 'word', 'month', 'year', 'part_of_speech']:
    oed[col] = oed[col].map(str)

In [14]:
# Words present both in the OED list and among ALL crossword answers
# (`cw` is used here, so no debut filter is applied).
common_words = set(oed['word']) & set(cw['answer'])
print(len(common_words))
common_words

79


{'arr',
 'autotune',
 'beauty',
 'belgium',
 'best',
 'buff',
 'bugs',
 'burgess',
 'carnap',
 'case',
 'catty',
 'chi',
 'chica',
 'cis',
 'commit',
 'coot',
 'dap',
 'dayton',
 'disc',
 'disco',
 'earworm',
 'elegant',
 'emoji',
 'eris',
 'fava',
 'flexagon',
 'gday',
 'gram',
 'gramps',
 'grams',
 'guac',
 'h2o',
 'hap',
 'hash',
 'hatter',
 'he',
 'heal',
 'hex',
 'his',
 'hoverboard',
 'ile',
 'isa',
 'javascript',
 'lac',
 'lasting',
 'lede',
 'locavore',
 'meh',
 'netbook',
 'osha',
 'penna',
 'photobomb',
 'queso',
 'resounded',
 'rested',
 'retweet',
 'sanger',
 'scene',
 'selfie',
 'sext',
 'ship',
 'skort',
 'skype',
 'sledge',
 'stank',
 'staycation',
 'storyboard',
 'structural',
 'stylo',
 'telly',
 'termite',
 'totes',
 'truly',
 'twerk',
 'unobtainium',
 'vape',
 'weekdays',
 'whit',
 'wuss'}

In [15]:
# OED intersection with debut crossword answers only
common_words = set(oed['word']).intersection(cw_unique['answer'])
print(len(common_words))

# Inner-join the debut answers to the OED entries on the word itself and keep
# the word plus both sides' date parts: 'date'/'year_x' come from the crossword
# table, 'month'/'year_y' from the OED table.
# Duplicate rows are dropped at assignment, so `common_df` itself is
# duplicate-free rather than only the displayed copy.
common_df = pd.merge(left=cw_unique, right=oed, how='inner', left_on='answer',
                     right_on='word')[['word', 'date', 'year_x', 'month', 'year_y']].drop_duplicates()
common_df

14


Unnamed: 0,word,date,year_x,month,year_y
0,unobtainium,December 31,2016,December,2014
3,flexagon,February 5,2017,June,2014
6,vape,November 2,2016,June,2015
9,hoverboard,May 23,2016,September,2015
12,earworm,October 1,2015,March,2015
15,photobomb,February 28,2015,June,2015
21,selfie,January 23,2015,June,2014
24,locavore,November 21,2014,December,2015
27,emoji,August 17,2014,December,2013
28,javascript,June 6,2014,June,2015


In [17]:
def monthToNum(mon):
    """Return the calendar number (1-12) of a full English month name.

    The lookup is exact: `mon` must be capitalised and spelled out in full
    (e.g. 'June'). Any other key raises KeyError.
    """
    month_names = (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    )
    number_by_name = {name: number for number, name in enumerate(month_names, start=1)}
    return number_by_name[mon]

def oed_date(x):
    """Return the OED date of a row as a `datetime.date` on the 1st of the month.

    `x` is a row-like mapping: 'year_y' must be convertible with int() and
    'month' must be a month name accepted by monthToNum.
    """
    year = int(x['year_y'])
    month = monthToNum(x['month'])
    return datetime.date(year, month, 1)

def cw_date(x):
    """Return the crossword date of a row as a `datetime.date` on the 1st of the month.

    `x` is a row-like mapping: 'date' holds '<Month> <day>' text and 'year_x'
    must be convertible with int(). The day is deliberately discarded and the
    result pinned to the 1st, the same granularity oed_date produces.

    Raises ValueError if 'date' does not contain exactly two
    whitespace-separated tokens, and KeyError (from monthToNum) for an
    unrecognised month name.
    """
    # str.split() with no separator ignores leading, trailing and repeated
    # whitespace, so ' December 31' and 'December 31' both parse. Splitting on
    # a single ' ' only worked when the value began with exactly one space.
    mon, _day = x['date'].split()
    return datetime.date(int(x['year_x']), monthToNum(mon), 1)

def faster(x):
    """Label which source has the earlier date for a row.

    Returns 'OED first' when x['oed_date'] is strictly earlier than
    x['cw_date']; otherwise (including a tie) returns 'CW first'.
    """
    oed_is_earlier = x['oed_date'] < x['cw_date']
    return 'OED first' if oed_is_earlier else 'CW first'

def find_start(x):
    """Return the earlier of x['oed_date'] and x['cw_date'].

    When the OED date is not strictly earlier, the crossword date is returned.
    """
    oed_when, cw_when = x['oed_date'], x['cw_date']
    return oed_when if oed_when < cw_when else cw_when
    
def find_finish(x):
    """Return the later of x['oed_date'] and x['cw_date'].

    When the OED date is the same or later, the OED date is returned.
    """
    oed_when, cw_when = x['oed_date'], x['cw_date']
    return oed_when if oed_when >= cw_when else cw_when

In [18]:
# Add the derived columns row by row. Order matters: the last three helpers
# read the 'oed_date' and 'cw_date' columns created by the first two.
for column, row_func in [
    ('oed_date', oed_date),
    ('cw_date', cw_date),
    ('first_to_use', faster),
    ('Start', find_start),
    ('Finish', find_finish),
]:
    common_df[column] = common_df.apply(row_func, axis=1)

# Display without repeated rows (common_df itself is left as it is).
common_df.drop_duplicates()


Unnamed: 0,word,date,year_x,month,year_y,oed_date,cw_date,first_to_use,Start,Finish
0,unobtainium,December 31,2016,December,2014,2014-12-01,2016-12-01,OED first,2014-12-01,2016-12-01
3,flexagon,February 5,2017,June,2014,2014-06-01,2017-02-01,OED first,2014-06-01,2017-02-01
6,vape,November 2,2016,June,2015,2015-06-01,2016-11-01,OED first,2015-06-01,2016-11-01
9,hoverboard,May 23,2016,September,2015,2015-09-01,2016-05-01,OED first,2015-09-01,2016-05-01
12,earworm,October 1,2015,March,2015,2015-03-01,2015-10-01,OED first,2015-03-01,2015-10-01
15,photobomb,February 28,2015,June,2015,2015-06-01,2015-02-01,CW first,2015-02-01,2015-06-01
21,selfie,January 23,2015,June,2014,2014-06-01,2015-01-01,OED first,2014-06-01,2015-01-01
24,locavore,November 21,2014,December,2015,2015-12-01,2014-11-01,CW first,2014-11-01,2015-12-01
27,emoji,August 17,2014,December,2013,2013-12-01,2014-08-01,OED first,2013-12-01,2014-08-01
28,javascript,June 6,2014,June,2015,2015-06-01,2014-06-01,CW first,2014-06-01,2015-06-01


In [19]:
# Build the frame fed to the Gantt chart: one row per distinct record, with
# the four columns named Task / Start / Finish / Resource.
df = (
    common_df
    .drop_duplicates()
    .loc[:, ['word', 'Start', 'Finish', 'first_to_use']]
    .rename(columns={'word': 'Task', 'first_to_use': 'Resource'})
)
df.head()

Unnamed: 0,Task,Start,Finish,Resource
0,unobtainium,2014-12-01,2016-12-01,OED first
3,flexagon,2014-06-01,2017-02-01,OED first
6,vape,2015-06-01,2016-11-01,OED first
9,hoverboard,2015-09-01,2016-05-01,OED first
12,earworm,2015-03-01,2015-10-01,OED first


In [20]:
# NOTE(review): in plotly >= 4 the `plotly.plotly` module lives in the separate
# `chart_studio` package - confirm which plotly version this notebook targets.
import plotly.plotly as py
import plotly.figure_factory as ff

# One dict per row, keyed by column name (Task / Start / Finish / Resource).
# `to_dict('records')` yields the same dicts as indexing each iterrows() row by
# position, without relying on integer lookups into a label-indexed Series.
df_gantt = df.to_dict('records')

# Bars are grouped and coloured by the 'Resource' value.
fig = ff.create_gantt(df_gantt, index_col='Resource', title='Time Between NYT CW and OED Debuts',
                      show_colorbar=True, showgrid_x=True, showgrid_y=True)
py.iplot(fig, filename='gantt-simple-gantt-chart', world_readable=True)

In [21]:
# Read the Urban Dictionary table and discard every row with a missing value.
ud = pd.read_csv('ud_3.csv').dropna()

# Force phrases to strings and lower-case them in a single pass.
ud['phrase'] = ud['phrase'].map(lambda phrase: str(phrase).lower())

# Lower-case the OED words as well.
oed['word'] = oed['word'].map(lambda word: word.lower())

ud.head()

Unnamed: 0,submit_date,down_votes,up_votes,meaning,date,phrase
0,"May 01, 2003",1612,1717,another word for tight,May 2,fye
1,"December 12, 2003",792,3728,forciby volunteered. A task that was once vol...,Apr 11,voluntold
2,"February 05, 2010",592,1866,When you inadvertently reach for a beer that d...,Apr 18,ghost beer
3,"March 26, 2017",2665,1993,Unapologetic un-PC alpha male conservative fan...,Mar 28,breitbastard
4,"January 24, 2017",862,1676,To masturbate at work while on the clock.,May 1,beat out a paycheck


In [24]:
# Debut crossword answers that also appear as Urban Dictionary phrases.
common_words = set(cw_unique['answer']).intersection(ud['phrase'])

# print() is called as a function so the cell parses on Python 2 and Python 3
# alike (with a single argument the Python 2 output is unchanged).
print(len(common_words))
print(common_words)

35
set(['besties', 'instagram', 'mcdreamy', 'lifehack', 'selfie', 'locavore', 'cyber', 'humblebrag', 'overshare', 'shaka', 'dubstep', 'adhd', 'foodie', 'kidult', 'frenemy', 'photobomb', 'mansplain', 'twerk', 'vuvuzela', 'biodome', 'sexile', 'showmance', 'facepalm', 'snarky', 'srsly', 'deets', 'earworm', 'swole', 'janky', 'unobtainium', 'obamania', 'bae', 'haterade', 'nsfw', 'potus'])


In [25]:
# Inner-join debut answers to Urban Dictionary phrases. Both tables have a
# 'date' column, so the crossword one comes out of the merge as 'date_x'.
common_df = pd.merge(left=cw_unique, right=ud, how='inner', left_on='answer',
                     right_on='phrase')[['answer', 'date_x', 'year', 'submit_date']]

# drop_duplicates() returns a new frame; the result has to be assigned back,
# otherwise the call has no effect on common_df.
common_df = common_df.drop_duplicates()
common_df.columns = ['word', 'xword_date', 'xword_year', 'ud_submit_date']
common_df

Unnamed: 0,word,xword_date,xword_year,ud_submit_date
0,unobtainium,December 31,2016,"August 16, 2003"
1,showmance,December 19,2016,"July 17, 2006"
2,swole,May 7,2017,"April 14, 2004"
3,janky,February 19,2017,"March 25, 2003"
4,besties,February 4,2017,"October 19, 2004"
5,sexile,February 5,2017,"February 25, 2003"
6,bae,February 1,2017,"August 11, 2006"
7,lifehack,October 28,2016,"December 06, 2005"
8,dubstep,April 15,2016,"August 23, 2010"
9,deets,June 26,2016,"September 30, 2004"
