## Data Preprocessing Step
#### 1. Tokenization : 문서, 문장, 단어로 분리
#### 2. Stop word elimination
#### 3. Normalization : 문장 부호 제거, 소문자/대문자 변환, 약어 전개
#### 4. Stemming : 단어를 기본형으로 바꾸어준다. 복수형은 단수형으로, 과거형은 현재형으로 바꾸는 과정
#### 4. Representation

### 1-1 Sentence Tokenization

In [2]:
text = """ one paragraph, of 100-250 words, which summarizes the purpose, methods, results and conclusions of the paper.
    It is not easy to include all this information in just a few words. Start by writing a summary that includes whatever you think is important,
    and then gradually prune it down to size by removing unnecessary words, while still retaini ng the necessary concepts.
    Don't use abbreviations or citations in the abstract. It should be able to stand alone without any footnotes. Fig 1.1.1 shows below."""

" one paragraph, of 100-250 words, which summarizes the purpose, methods, results and conclusions of the paper.\n    It is not easy to include all this information in just a few words. Start by writing a summary that includes whatever you think is important,\n    and then gradually prune it down to size by removing unnecessary words, while still retaini ng the necessary concepts.\n    Don't use abbreviations or citations in the abstract. It should be able to stand alone without any footnotes. Fig 1.1.1 shows below."

In [3]:
import nltk
from nltk import sent_tokenize

sent_tokenize(text)

[' one paragraph, of 100-250 words, which summarizes the purpose, methods, results and conclusions of the paper.',
 'It is not easy to include all this information in just a few words.',
 'Start by writing a summary that includes whatever you think is important,\n    and then gradually prune it down to size by removing unnecessary words, while still retaini ng the necessary concepts.',
 "Don't use abbreviations or citations in the abstract.",
 'It should be able to stand alone without any footnotes.',
 'Fig 1.1.1 shows below.']

### 1-2 Word Tokenization

In [4]:
string = "I can't do anything!(Oh, no)"
print(nltk.word_tokenize(string))  # 가장 기본적인 tokenization 함수. space 단위와 구두점(punctuation)을 기준으로 토큰화
print(nltk.regexp_tokenize(string ,"[\w']+")) # 정규표현은 텍스트를 어떻게 토큰화(Tokenize) 할건지에 대해 설정 가능

['I', 'ca', "n't", 'do', 'anything', '!', '(', 'Oh', ',', 'no', ')']
['I', "can't", 'do', 'anything', 'Oh', 'no']


In [5]:
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
print(tokenizer.tokenize(string))

['I', 'ca', "n't", 'do', 'anything', '!', '(', 'Oh', ',', 'no', ')']


In [7]:
from nltk.tokenize import WordPunctTokenizer

tokenizer=WordPunctTokenizer()
print(tokenizer.tokenize(string))

['I', 'can', "'", 't', 'do', 'anything', '!(', 'Oh', ',', 'no', ')']


In [8]:
import re

re.sub('[^a-zA-Z\s]', '', string)

'I cant do anythingOh no'

###  2. Stop words elimination

In [10]:
from nltk.corpus import stopwords

stopWords =list(stopwords.words('english'))
print(stopWords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [11]:
add_stopwords = ['im','youre','also','even','dont','namsan','incheon']
stopWords = set(stopWords + add_stopwords)

In [13]:
words = ['I', "can't", 'do', 'anything', 'Oh', 'no']
print([word for word in words if word not in stopWords])

['I', "can't", 'anything', 'Oh']


### 3. Normalization

In [14]:
text='HARdWork IS KEy to SUCCESS'
print(text.lower())
print(text.upper())

hardwork is key to success
HARDWORK IS KEY TO SUCCESS


### 특수문자, 부호 삭제 방법 (1): replace() 메소드 사용법

In [15]:
text = "She go? She go."

text_r = text.replace("?", "")
print(text_r)

text_r1 = text_r.replace(".", "")
print(text_r1)

She go She go.
She go She go


### 특수문자, 부호 삭제 방법 (2): 정규식(re) 모듈 사용법

In [18]:
import re    # 정규식 사용 모듈 re
import string
from nltk.tokenize import word_tokenize

text=[" It is a pleasant evening.","Guests, who came from US arrived at the venue","Food was tasty."]

tokenized_docs=[word_tokenize(doc) for doc in text]

x=re.compile('[%s]' % re.escape(string.punctuation))

tokenized_docs_no_punctuation = []

for review in tokenized_docs:
    new_review = []

    for token in review: 
        new_token = x.sub(u'', token)
        
        if not new_token == u'':
            new_review.append(new_token)
    tokenized_docs_no_punctuation.append(new_review)
    
print(tokenized_docs_no_punctuation)

[['It', 'is', 'a', 'pleasant', 'evening'], ['Guests', 'who', 'came', 'from', 'US', 'arrived', 'at', 'the', 'venue'], ['Food', 'was', 'tasty']]


### 약어 전개 방법: (1) replace() 메소드 사용

In [19]:
text = "She must've gone to the market but she didn't go"

text_r = text.replace("t've", "t have")
text_r = text.replace("n't", "did not") # 왜 여러개의 패턴을 모듈로 생성해 사용하는 것이 필요한지 예시

print(text_r)

She must've gone to the market but she diddid not go


### 약어 전개 방법: (2) 기존에 특정 패턴을 지정해 모듈로 생성하여 재사용 

In [26]:
from replacers import RegexpReplacer

replacer= RegexpReplacer()
print(replacer.replace("She must've gone to the market but she didn't go"))

ModuleNotFoundError: No module named 'replacers'

In [107]:
replacement_patterns = [
	(r'won\'t', 'will not'),
	(r'can\'t', 'cannot'),
	(r'i\'m', 'i am'),
	(r'ain\'t', 'is not'),
	(r'(\w+)\'ll', '\g<1> will'),
	(r'(\w+)n\'t', '\g<1> not'),
	(r'(\w+)\'ve', '\g<1> have'),
	(r'(\w+)\'s', '\g<1> is'),
	(r'(\w+)\'re', '\g<1> are'),
	(r'(\w+)\'d', '\g<1> would'),
]

In [None]:
######### Pos name #############

# CC coordinating conjunction / CD cardinal digit
# DT determiner
# EX existential there (like: “there is” … think of it like “there exists”)
# FW foreign word
# IN preposition/subordinating conjunction
### JJ adjective ‘big’/ JJR adjective, comparative ‘bigger’/ JJS adjective, superlative ‘biggest’
# LS list marker 1)
# MD modal could, will
### NN noun, singular ‘desk’NNS noun plural ‘desks’/ NNP proper noun, singular ‘Harrison’/ NNPS proper noun, plural ‘Americans’
# PDT predeterminer ‘all the kids’
# POS possessive ending parent’s
# PRP personal pronoun I, he, she / PRP$ possessive pronoun my, his, hers
### RB adverb very, silently / RBR adverb, comparative better / RBS adverb, superlative best
# RP particle give up
# TO, to go ‘to’ the store.
# UH interjection, errrrrrrrm
### VB verb, base form take / VBD verb, past tense took / VBG verb, gerund present participle taking / VBN verb, past participle taken / VBP verb, sing. present, non-3d take / VBZ verb, 3rd person sing. present takes
# WDT wh-determiner which / WP wh-pronoun who, what / WP$ possessive wh-pronoun whose / WRB wh-abverb where, when

### 명사, 형용사, 동사, 부사 lemmatization

In [28]:
lemm = WordNetLemmatizer()

def njvr_lemmantizer(sent):
    global lemm
    
    lemm_sent = []
    for word_pos in sent:
        word, pos = word_pos
        if pos[0] == 'N':
            lemm_sent.append(lemm.lemmatize(word,pos='n').lower() +'_N')
        elif pos[0] == 'J':
            lemm_sent.append(lemm.lemmatize(word,pos='a').lower() +'_J')
        elif pos[0] == 'V':
            lemm_sent.append(lemm.lemmatize(word,pos='v').lower() +'_V')
        elif pos[0] == 'R':
            lemm_sent.append(lemm.lemmatize(word,pos='r').lower() +'_R')
        else:pass
    return lemm_sent

In [46]:
lemm = WordNetLemmatizer()

def njvr_lemmantizer2(sent):
    global lemm
    
    lemm_sent = []
    for word_pos in sent:
        word, pos = word_pos
        if pos[0] == 'N':
            lemm_sent.append(lemm.lemmatize(word,pos='n').lower())
        elif pos[0] == 'J':
            lemm_sent.append(lemm.lemmatize(word,pos='a').lower())
        elif pos[0] == 'V':
            lemm_sent.append(lemm.lemmatize(word,pos='v').lower())
        elif pos[0] == 'R':
            lemm_sent.append(lemm.lemmatize(word,pos='r').lower())
        else:pass
    return lemm_sent

In [30]:
review_en["10039364"][0][1][0]

[('friends', 'NNS'),
 ('wonderful', 'JJ'),
 ('stay', 'VBP'),
 ('sun', 'JJ'),
 ("kyung's", 'NN'),
 ('place', 'NN')]

In [41]:
njvr_lemmantizer(review_en["10039364"][0][1][0])

['friend_N', 'wonderful_J', 'stay_V', 'sun_J', "kyung's_N", 'place_N']

In [43]:
review_en2 = review_en

In [42]:
for listing_id, rev_lst in tqdm.tqdm(review_en.items()):
    for rev in rev_lst:
        rev_tmp = []
        for sent in rev[1]:
            rev_tmp.append(njvr_lemmantizer(sent))
        rev[1] = rev_tmp

100%|███████████████████████████████████████████████████████████████████████████| 30569/30569 [00:58<00:00, 520.30it/s]


In [47]:
for listing_id, rev_lst in tqdm.tqdm(review_en2.items()):
    for rev in rev_lst:
        rev_tmp = []
        for sent in rev[1]:
            rev_tmp.append(njvr_lemmantizer2(sent))
        rev[1] = rev_tmp

  0%|                                                                                        | 0/30569 [00:00<?, ?it/s]


ValueError: too many values to unpack (expected 2)

In [45]:
review_en["10039364"][0][1][0]

['friend_N', 'wonderful_J', 'stay_V', 'sun_J', "kyung's_N", 'place_N']

In [35]:
review_en2["10039364"][0][1][0]

[('friends', 'NNS'),
 ('wonderful', 'JJ'),
 ('stay', 'VBP'),
 ('sun', 'JJ'),
 ("kyung's", 'NN'),
 ('place', 'NN')]