In [70]:
import json
import numpy as np
from pprint import pprint
import pandas as pd
import os
from vocabulary.vocabulary import Vocabulary

In [5]:
DATA_DIR = '/Users/bobdavis/data'

In [6]:
os.listdir(DATA_DIR)

['.DS_Store',
 'kaggle',
 'BasicEnglish.csv',
 'scikit_learn_data',
 'abalone.data',
 'countries']

In [44]:
vocab = pd.read_csv(os.path.join(DATA_DIR, 'BasicEnglish.csv'))
vocab.head()

Unnamed: 0,category,word
0,things,account
1,things,act
2,things,addition
3,things,adjustment
4,things,advertisement


In [125]:
vocab.sample(10)

Unnamed: 0,category,word
561,pictorial,spade
117,things,fiction
59,things,copy
29,things,blow
96,things,driving
16,things,attempt
154,things,increase
308,things,sign
460,pictorial,drain
410,pictorial,basin


In [15]:
methods = [ attr for attr in dir(Vocabulary) if not attr.startswith("_") and attr != "translate"]
methods

['antonym',
 'hyphenation',
 'meaning',
 'part_of_speech',
 'pronunciation',
 'synonym',
 'usage_example']

In [36]:
def pull_word_definition(word, verbose=False):
    word_detail = {}
    word_detail['spelling'] = word
    for method in methods:
        if verbose:
            print("\n------")
            print(method)
            print("-------")
        res = getattr(Vocabulary, method)(word_detail['spelling'])
        try:
            data = json.loads(res)
        except:
            data = res
        word_detail[method] = data
        if verbose: pprint(data)
    return word_detail
word_info = pull_word_definition("addition", True)


------
antonym
-------
[{'seq': 0, 'text': 'subtraction'}]

------
hyphenation
-------
[{'seq': 0, 'text': 'ad'},
 {'seq': 1, 'text': 'di', 'type': 'stress'},
 {'seq': 2, 'text': 'tion'}]

------
meaning
-------
[{'seq': 0,
  'text': 'Something added to a coat of arms, as a mark of honour; opposed to '
          '<i>abatement</i>.'},
 {'seq': 1,
  'text': 'something added to what you already have; &quot;the librarian '
          'shelved the new accessions&quot;; &quot;he was a new addition to '
          'the staff&quot;'},
 {'seq': 2,
  'text': '(arithmetic); the mathematical operation of increasing one amount '
          'by another. The result of adding two quantities is their sum.'},
 {'seq': 3,
  'text': 'a quantity that is added; &quot;there was an addition to property '
          'taxes this year&quot;; &quot;they recorded the cattle&#39;s gain in '
          'weight over a period of weeks&quot;'},
 {'seq': 4,
  'text': 'A dot at the right side of a note as an indication that 

In [None]:
#word_detail = pull_word_definition("log")

In [38]:
word_detail = word_info

In [59]:
def try_ex(d, key, seq=0):
    if type(d) == list:
        if len(d) > seq:
            d = d[seq]
        else:
            return
    if type(d) != dict:
        return d
    try:
        return d[key]
    except KeyError:
        return

In [88]:
keys = ['spelling', 'pronunciation', 'meaning', 'part_of_speech', 'synonym', 'usage_example']

In [67]:
def dictionary_to_row(word_detail, seq=0):
    row = []
    for key in keys:
        if key == 'pronunciation':
            subkey = 'raw'
        else:
            subkey = 'text'
        row.append(try_ex(word_detail[key], subkey, seq))
    return row
dictionary_to_row(word_detail, 10)

['advertisement', None, None, None, None, None]

In [75]:
def count_none(mylist):
    return sum([1 if _ is None else 0 for _ in mylist ])
count_none(dictionary_to_row(word_detail, 10))

5

In [126]:
from datetime import datetime
start = datetime.now()
start

datetime.datetime(2018, 6, 17, 16, 12, 22, 205705)

In [127]:
#%%timeit -n1
word_definitions = []
for _, word in vocab.iterrows():
    if _ > 1000:
        break
    print(word.word)
    word_detail = pull_word_definition(word.word)
    for seq in range(0, 10):
        row_info = dictionary_to_row(word_detail, seq)
        word_definitions.append(row_info + [seq])
        if count_none(row_info) > 4:
            break
df = pd.DataFrame(word_definitions, columns=keys + ['seq'])

account
act
addition
adjustment
advertisement
agreement
air
amount
amusement
animal
answer
apparatus
approval
argument
art
attack
attempt
attention
attraction
authority
back
balance
base
behavior
belief
birth
bit
bite
blood
blow
body
brass
bread
breath
brother
building
burn
burst
business
butter
canvas
care
cause
chalk
chance
change
cloth
coal
color
comfort
committee
company
comparison
competition
condition
connection
control
cook
copper
copy
cork
cotton
cough
country
cover
crack
credit
crime
crush
cry
current
curve
damage
danger
daughter
day
death
debt
decision
degree
design
desire
destruction
detail
development
digestion
direction
discovery
Connection Lost
discussion
disease
disgust
distance
distribution
division
doubt
drink
driving
dust
earth
edge
education
effect
end
error
event
example
exchange
existence
expansion
experience
expert
fact
fall
family
father
fear
feeling
fiction
field
fight
fire
flame
flight
flower
fold
food
force
form
friend
front
fruit
glass
gold
government
grain
g

In [128]:
df.shape

(8498, 7)

In [129]:
df.head()

Unnamed: 0,spelling,pronunciation,meaning,part_of_speech,synonym,usage_example,seq
0,account,(ə-kountˈ),"(transitive, obsolete) To reckon; to compute; ...",noun,think,A. To Account for your actions!,0
1,account,AH0 K AW1 N T,to provide explanation,noun,hold in opinion,,1
2,account,,"to estimate, to deem",noun,esteem,,2
3,account,,(transitive) To get revenge on.,noun,news story,,3
4,account,,To give an account or representation in words.,noun,accountant,,4


In [130]:
df.to_csv(os.path.join(DATA_DIR, 'vocabulary.csv'))

In [131]:
finish = datetime.now()
delta = finish - start
delta.total_seconds()

5738.557974

In [132]:
5738/60

95.63333333333334

In [134]:
df['spelling'].unique().shape

(850,)

In [139]:
df[df['seq'] == 0].sample(10)[['pronunciation', 'meaning']]

Unnamed: 0,pronunciation,meaning
2478,"(pôrˈtər, pōrˈ-)",a person who carries luggage and related objects
4778,(foul),False
8268,(ĭf),False
3218,(snō),False
4668,(ī),False
2998,(sĕns),False
1070,(ĭg-zĭsˈtəns),"The state of being, existing, or occurring."
5688,(stĕm),False
7678,(ə-boutˈ),False
6378,(hăpˈē),False


In [153]:
loud = pull_word_definition('loud')
loud

{'antonym': [{'seq': 0, 'text': 'piano'},
  {'seq': 1, 'text': 'soft'},
  {'seq': 2, 'text': 'softly'}],
 'hyphenation': [{'seq': 0, 'text': 'loud'}],
 'meaning': False,
 'part_of_speech': [{'example': 'Characterized by high volume and intensity. Used of sound.',
   'seq': 0,
   'text': 'adjective'},
  {'example': 'Producing sound of high volume and intensity.',
   'seq': 1,
   'text': 'adjective'},
  {'example': 'Clamorous and insistent:  loud denials. ',
   'seq': 2,
   'text': 'adjective'},
  {'example': 'Having offensively bright colors:  a loud necktie. ',
   'seq': 3,
   'text': 'adjective'},
  {'example': 'Having an offensively strong odor.',
   'seq': 4,
   'text': 'adjective'},
  {'example': 'Offensive in manner.', 'seq': 5, 'text': 'adjective'},
  {'example': 'In a loud manner.', 'seq': 6, 'text': 'adverb'}],
 'pronunciation': [{'raw': '(loud)', 'rawType': 'ahd-legacy', 'seq': 0},
  {'raw': '(loud)', 'rawType': 'gcide-diacritical', 'seq': 1},
  {'raw': 'L AW1 D', 'rawType': '

In [152]:
df[df['spelling']=='loud']

Unnamed: 0,spelling,pronunciation,meaning,part_of_speech,synonym,usage_example,seq
7268,loud,(loud),False,adjective,False,"""Man we blew a whole pack of loud last night d...",0
7269,loud,(loud),False,adjective,False,I'm filling my blunt with some loud.,1
7270,loud,L AW1 D,False,adjective,False,Her shirts were always [loud] & cheerful~looking.,2
7271,loud,,False,adjective,False,"Miguel : You got any ?Michelle : Yeah , i got ...",3
7272,loud,,False,adjective,False,"I got some loud, dawg. I ain't messin with tha...",4
7273,loud,,False,adjective,False,I got that Loud on deck .,5
7274,loud,,False,adverb,False,Example 1. He was smoking that [loud] yesterda...,6
7275,loud,,False,,False,"“Josh is Lou'd out of his mind this morning, a...",7
7276,loud,,False,,False,,8
7277,loud,,False,,False,,9


In [150]:
df[df['meaning'] == False].sample(10)

Unnamed: 0,spelling,pronunciation,meaning,part_of_speech,synonym,usage_example,seq
8183,still,,False,adjective,False,,5
7271,loud,,False,adjective,False,"Miguel : You got any ?Michelle : Yeah , i got ...",3
3712,unit,,False,noun,False,"Ere Bri, you seen that big old Bella Emberg in...",4
4980,kettle,K EH1 T AH0 L,False,noun,False,Airline passenger traveling with too much lugg...,2
6568,open,(ōˈpən),False,adjective,False,"1. I'm definitely gonna smash, that bitch is ...",0
3096,silk,,False,,False,,8
7126,delicate,,False,adjective,False,,8
3949,word,W ER1 D,False,noun,False,"""Yo, I fucked twelve bitches last night.""""Word...",1
3251,son,,False,noun,False,"The Son of God, who is referred as the [Alpha ...",3
6311,frequent,,False,,False,,3
