우리들만의 단어 사전 만들기
# https://omicro03.medium.com/%EC%9E%90%EC%97%B0%EC%96%B4%EC%B2%98%EB%A6%AC-nlp-29%EC%9D%BC%EC%B0%A8-spacy-%EC%86%8C%EA%B0%9C-1b76d1746c6c


In [8]:
!pip install spacy





In [9]:
!python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.3.1
  Using cached en_core_web_sm-2.3.1-py3-none-any.whl
✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')




In [10]:
import spacy

In [14]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [58]:
nlp = spacy.load("en_core_web_sm")

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [15]:
doc = 'I always uh do the main um processing, I mean, the uh um data-processing.'

In [16]:
stats = nlp(doc)

In [17]:
stats, type(stats)

(I always uh do the main um processing, I mean, the uh um data-processing.,
 spacy.tokens.doc.Doc)

In [18]:
for token in stats:
    print(token.text, type(token.text))

I <class 'str'>
always <class 'str'>
uh <class 'str'>
do <class 'str'>
the <class 'str'>
main <class 'str'>
um <class 'str'>
processing <class 'str'>
, <class 'str'>
I <class 'str'>
mean <class 'str'>
, <class 'str'>
the <class 'str'>
uh <class 'str'>
um <class 'str'>
data <class 'str'>
- <class 'str'>
processing <class 'str'>
. <class 'str'>


In [19]:
doc2 = "Korea has a reasonable population."
stats = nlp(doc2)
for token in stats:
    print(token.text)

Korea
has
a
reasonable
population
.


In [20]:
doc

'I always uh do the main um processing, I mean, the uh um data-processing.'

In [21]:
for token in stats:
    print(token.text)

Korea
has
a
reasonable
population
.


In [22]:
import re
for token in re.split("\W+", doc):
    print(token)

I
always
uh
do
the
main
um
processing
I
mean
the
uh
um
data
processing



In [23]:
import re

doc2 = "U.S.A. has a reasonable population."
for token in re.split("\W+", doc2):
    print(token)

U
S
A
has
a
reasonable
population



In [None]:
stats = nlp(doc2)
for token in stats:
    print(token.text)

U.S.A.
has
a
reasonable
population
.


In [24]:
listVocabulary = list(nlp.vocab.strings)

In [25]:
len(listVocabulary), type(listVocabulary)

(1180, list)

In [26]:
listVocabulary[:10]

['""', '#', '$', "''", ',', '-LRB-', '-RRB-', '.', ':', 'ADD']

In [51]:
listVocabulary[1000]

'^__^'

In [52]:
listVocabulary[900]

'ph.d.'

In [29]:
import numpy as np
import pandas as pd
from collections import Counter
import re

In [30]:
df = pd.read_csv("reviews.csv")

In [31]:
df.head()

Unnamed: 0,rating,review
0,negative,terrible place to work for i just heard a stor...
1,negative,"hours , minutes total time for an extremely s..."
2,negative,my less than stellar review is for service . w...
3,negative,i m granting one star because there s no way t...
4,negative,the food here is mediocre at best . i went aft...


In [32]:
df.tail()

Unnamed: 0,rating,review
55995,positive,"great food . wonderful , friendly service . i ..."
55996,positive,charlotte should be the new standard for moder...
55997,positive,get the encore sandwich ! ! make sure to get i...
55998,positive,i m a pretty big ice cream gelato fan . pretty...
55999,positive,where else can you find all the parts and piec...


In [33]:
df.shape

(56000, 2)

In [34]:
vocab = {}

# 단어 사전 만들기 위한 함수

In [35]:
def initializeVocabulary():
    unkToken = "<UNK>"
    vocab['t_2_i'] = {}
    vocab['i_2_t'] = {}
    vocab["unkToken"] = unkToken
    idx = addToken(unkToken)
    vocab["unkTokenIdx"] = idx

In [36]:
def addToken(token):
    if token in vocab["t_2_i"]:
        idx = vocab["t_2_i"][token]
    else:
        idx = len(vocab["t_2_i"])
        vocab["t_2_i"][token] = idx
        vocab["i_2_t"][idx] = token
    return idx

In [37]:
def addManyTokens(tokens):
    idxes = [addToken(token) for token in tokens]
    return idxes

In [38]:
def lookUpToken(token):
    if vocab["unkTokenIdx"] >= 0:
        return vocab["t_2_i"].get(token, vocab["unkTokenIdx"])
    else:
        return vocab["t_2_i"][token]

In [39]:
def lookUpIndex(idx):
    if idx not in vocab["i_2_t"]:
        raise KeyError("the index (%d) is not there" % idx)
    return vocab["i_2_t"][idx]

In [40]:
def vocabularyFromDataFrame(df, cutoff = 25):
    initializeVocabulary()
    wordCounts = Counter()
    for r in df.review:
        for word in re.split("\W+", r):
            wordCounts[word] += 1
    for word, count in wordCounts.items():
        if count > cutoff:
            addToken(word)

In [41]:
df = pd.read_csv("reviews.csv")

In [42]:
vocabularyFromDataFrame(df)

In [43]:
type(vocab), len(vocab)

(dict, 4)

In [44]:
vocab.keys()

dict_keys(['t_2_i', 'i_2_t', 'unkToken', 'unkTokenIdx'])

In [45]:
type(vocab['t_2_i']), type(vocab['i_2_t']), type(vocab['unkToken']), type(vocab['unkTokenIdx']) 

(dict, dict, str, int)

In [46]:
vocab['unkToken'], vocab['unkTokenIdx']

('<UNK>', 0)

In [47]:
vocab['t_2_i']

{'<UNK>': 0,
 'terrible': 1,
 'place': 2,
 'to': 3,
 'work': 4,
 'for': 5,
 'i': 6,
 'just': 7,
 'heard': 8,
 'a': 9,
 'story': 10,
 'of': 11,
 'them': 12,
 'find': 13,
 'girl': 14,
 'over': 15,
 'her': 16,
 'father': 17,
 'coming': 18,
 'in': 19,
 'there': 20,
 'who': 21,
 'she': 22,
 'hadn': 23,
 't': 24,
 'seen': 25,
 'years': 26,
 'said': 27,
 'hi': 28,
 'him': 29,
 'which': 30,
 'upset': 31,
 'his': 32,
 'wife': 33,
 'and': 34,
 'they': 35,
 'left': 36,
 'finished': 37,
 'the': 38,
 'rest': 39,
 'day': 40,
 'working': 41,
 'fine': 42,
 'next': 43,
 'when': 44,
 'went': 45,
 'into': 46,
 'fired': 47,
 'that': 48,
 'situation': 49,
 'one': 50,
 'texas': 51,
 'roadhouse': 52,
 'because': 53,
 'any': 54,
 'could': 55,
 'be': 56,
 'their': 57,
 'staff': 58,
 'does': 59,
 'not': 60,
 'deserve': 61,
 'my': 62,
 'business': 63,
 'yelp': 64,
 'wants': 65,
 'me': 66,
 'give': 67,
 'star': 68,
 'but': 69,
 'don': 70,
 'believe': 71,
 'it': 72,
 '': 73,
 'hours': 74,
 'minutes': 75,
 'total':

In [48]:
lookUpToken('terrible')

1

In [49]:
lookUpIndex(1)

'terrible'