# **1주차 피드백**
***  
**세부적인 전처리를 제외한다면 그냥 토큰화만 해도 무방하다고 생각함**  

1. ### **_정규표현식 활용한 punctuations 정리_**  
하이픈, 언더바, 슬래시와 같은 연결어를 찾아보았으며, 확인 결과 제거 혹은 띄어쓰기로 대체 가능

2. ### **_토크나이저 정리_**  
    1. TreebankWordTokenizer()  
    2. WordPunctTokenizer()  
    3. keras.text_to_word_sequence() <- 이게 제일 좋다

3. ### **_정제_**  
    1. 대소문자  
:대문자를 유지해야 하는 단어(US, XI 등)가 있지만, 따로 사전처럼 정리된 것이 없어 일일이 제거하기 어려움  

    2. 단어의 길이는 2자부터
    3. 불용어 사용
    4. 표제어추출  
:일반 lemmatizer()의 문제는 직접 품사를 지정해줘야만 성능이 좋았다.  
따라서 구글링을 통해 pos-tagging 후 명사, 동사, 형용사, 부사 4개로 변환해주는 함수 생성


# Library

In [None]:
# base
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# regex
import re

import warnings
warnings.filterwarnings('ignore')

# stopwords
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

# tokenizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import TreebankWordTokenizer
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence

# lemmatization
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


# Load Data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
data = pd.read_csv('/content/drive/MyDrive/nlp_data/practice.csv')
data.head()

Unnamed: 0,index,text,score
0,0,"He was almost choking. There was so much, so m...",3
1,1,"“Your sister asked for it, I suppose?”",2
2,2,"She was engaged one day as she walked, in per...",1
3,3,"The captain was in the porch, keeping himself ...",4
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3


In [None]:
print(data.shape)
data.text

0        He was almost choking. There was so much, so m...
1                   “Your sister asked for it, I suppose?”
2         She was engaged one day as she walked, in per...
3        The captain was in the porch, keeping himself ...
4        “Have mercy, gentlemen!” odin flung up his han...
                               ...                        
54874    “Is that you, Mr. Smith?” odin whispered. “I h...
54875    I told my plan to the captain, and between us ...
54876     "Your sincere well-wisher, friend, and sister...
54877              “Then you wanted me to lend you money?”
54878    It certainly had not occurred to me before, bu...
Name: text, Length: 54879, dtype: object

# Preprocess
### A-1. punctuations and tokenization
### tokenize 함수 쓰고 안쓰고 차이 비교
### a. 안쓰고

In [None]:
data.text[1]

'“Your sister asked for it, I suppose?”'

In [None]:
data.text.apply(lambda x :re.findall('\“',x))

0            []
1           [“]
2            []
3           [“]
4        [“, “]
          ...  
54874    [“, “]
54875        []
54876        []
54877       [“]
54878        []
Name: text, Length: 54879, dtype: object

In [None]:
# Check which other characters occur: everything outside the set listed after
# ^ in the brackets. A raw string keeps "\-" from being read as an (invalid)
# Python string escape, and the pattern is compiled once rather than per row.
_NOT_ALPHANUM_RE = re.compile(r"[^0-9a-zA-Z‘’',._\- ]")


def show_not_alphanum(text):
    """Return every character of ``text`` that is outside the allowed set.

    The allowed set is ASCII digits and letters, the curly and straight
    apostrophes, comma, period, underscore, hyphen-minus and space.

    Args:
        text: The string to scan.

    Returns:
        A list of single-character strings in order of appearance;
        duplicates are kept.
    """
    return _NOT_ALPHANUM_RE.findall(text)

In [None]:
# Distinct disallowed characters over all rows. The set comprehension walks
# each per-row list once, whereas sum(lists, []) re-copies the accumulated
# list for every row (quadratic in the number of rows).
puncts = {ch for found in data.text.apply(show_not_alphanum) for ch in found}
print(puncts)

{'Ê', '‐', 'â', 'æ', 'ï', '/', ']', 'ö', 'ü', '#', 'º', '}', '[', '*', '£', ')', 'Æ', 'ê', '&', '”', 'ç', 'Œ', '"', 'à', 'œ', '?', '{', '“', 'ñ', '(', '!', ';', 'ô', 'è', 'ë', ':', 'î', 'ì', 'ù', 'ä', '—', 'é'}


In [None]:
# 라틴어에 대한 처리는 어떻게 할까?

# '—', '‐', '/', '*' 중 문제가 되는 건 연결어니까 그 부분만 찾아보자

In [None]:
list(set(sum(data.text.apply(lambda x : re.findall(".{,10}—.{,10}",x)),[])))[1:10] # '—' > ' '

['t them at ——, and the',
 'essed, sir—quite the ',
 'the street—”',
 're my son,—more to me',
 ' returned,—“nonsense.',
 ' own place—such an af',
 'broad day,—But this m',
 'to go upon—no evidenc',
 'l astonish—and perhap']

In [None]:
print(list(set(sum(data.text.apply(lambda x : re.findall(".{,10}‐.{,10}",x)),[])))[1:10]) # '‐' > ' '

# 예외처리 : To-morrow, to-morrow, to-day

['mn! Oh, to‐morrow I’l', ' to‐day, to‐mo', 'long, oval‐shaped fac', 'cil mark—n‐nothing! W', 'ner a cast‐iron tombs', 'ind him to‐day,” said', ' odins’ to‐morrow,” a', ' were well‐to‐do peop', 'r was good‐humored an']


In [None]:
list(set(sum(data.text.apply(lambda x : re.findall(".{,10}‐.{,10}",x)),[])))

[' as a blue‐tit.”',
 'mn! Oh, to‐morrow I’l',
 ' to‐day, to‐mo',
 'long, oval‐shaped fac',
 'cil mark—n‐nothing! W',
 'ner a cast‐iron tombs',
 'ind him to‐day,” said',
 ' odins’ to‐morrow,” a',
 ' were well‐to‐do peop',
 'r was good‐humored an',
 'you twenty‐five roubl',
 ' better to‐day. But I',
 'with a pen‐knife not ',
 'rldly well‐being too.',
 'es, of old‐fashioned ',
 'o the lock‐up and thr',
 'ad of self‐reaodintio',
 'eat motive‐power. Wha',
 'w of a non‐commission',
 'from ‘self‐laceration',
 'od‐by. Don’t ',
 'is so open‐hearted be',
 'he drawing‐room. odin',
 'rtain well‐bred nonch',
 '“Good‐by, odin! ',
 'ay not ill‐treat and ',
 'e to horse‐breeding, ',
 'h his yard‐long strid',
 'at me, ill‐treat me, ',
 'nor wonder‐working ik',
 'e was half‐way home h',
 't until to‐day he hes',
 'st, twenty‐five thous',
 'he drawing‐room the c',
 'the summer‐house in t',
 'n court to‐day. ‘All ',
 'y the back‐way, for h',
 'reclude so‐and‐ so (o',
 's. How ill‐humored yo',
 'ple’s bac

In [None]:
print(set(sum(data.text.apply(lambda x : re.findall(".{,10}/.{,10}",x)),[]))) # '/' 제거
list(set(sum(data.text.apply(lambda x : re.findall(".{,10}\*.{,10}",x)),[])))[1:20] # '*' 제거

{'23l 4s 9 1/2d is over'}


['om a swan.* Why is th',
 'd be swier* to believ',
 ' very gleg* at the ju',
 ' * * * * *',
 '* * * * *',
 't twine,” * I said. “',
 'the dirdum* of this d',
 'er rowpit!* Dod, odin',
 ' *February 1',
 'ounds boss* upon the ',
 'and raxing* drams to ',
 ': “A clour* on the he',
 ' *The author',
 ' [*] The eman',
 ' *A bouman i',
 ' * Dark as t',
 ' * The rally',
 ' * From Leby',
 ' the sough** gaed abr']

In [None]:
# 기존 연결어들도 확인 \',._\-
list(set(sum(data.text.apply(lambda x : re.findall("\W{,3}\'\W{,3}",x)),[])))[1:10] # 문자가 아닐때는 제거

[",'", ".,' ", "--' ", ",' '", ' "\'', "'! ", "-- '", ";'” ", ".--'"]

In [None]:
list(set(sum(data.text.apply(lambda x : re.findall(".{,10}_.{,10}",x)),[])))[1:10] # 제거

['What a _gigantic_ ',
 '“It is _The Mornin',
 'staken if _odin_ did ',
 "'_Non istwen",
 'big green _drap de da',
 ' say this _as I did y',
 '"Write _\'Vive la r',
 '“_I_ say a hor',
 ' hundred, _panie_. Wi']

In [None]:
list(set(sum(data.text.apply(lambda x : re.findall(".{,10}-.{,10}",x)),[])))[1:10] # 띄어쓰기

['ck to them--be with t',
 'onspicuous--gave him ',
 ' not think--’',
 'fortnight.--However, ',
 'rning home--reasons t',
 ' man fodin--money, in',
 ' the water-edge was a',
 ' is a dove-cote, some',
 'less apple-trees upon']

In [None]:
list(set(sum(data.text.apply(lambda x : re.findall(".{,5}-{2,}.{,5}",x)),[])))[1:10] # 띄어쓰기

['of me--and l',
 'Appin--no, n',
 'knots--and y',
 'u are--that ',
 'chool--so mu',
 'efly:--',
 'means--”',
 'o say--excep',
 ' wife--a sen']

In [None]:
print(data.text[119])

# Ordered clean-up rules; each rule runs on the output of the previous one.
# Raw strings keep regex escapes such as \W from being read as (invalid)
# Python string escapes.
cleanup_rules = [
    # Exception: join "to-morrow" / "to-day" (either hyphen character) into
    # one word. The rule is anchored to these words so that other compounds
    # containing "o-m" or "o-d" (e.g. "well-to-do") are not glued together.
    (re.compile(r"\b(to)[‐\-](morrow|day)\b", re.IGNORECASE), r"\1\2"),
    # Em dash and Unicode hyphen (U+2010) become a space.
    (re.compile(r"[—‐]"), " "),
    # Apostrophe with one or two non-word characters on both sides.
    (re.compile(r"\W{1,2}'\W{1,2}"), " "),
    # Runs of two or more hyphens are used as dashes.
    (re.compile(r"-{2,}"), " "),
    # Apostrophe followed by non-word characters (closing quote).
    (re.compile(r"'\W{1,2}"), " "),
    # Apostrophe preceded by non-word characters (opening quote).
    (re.compile(r"\W{1,2}'"), " "),
    # Apostrophe at the very start of the text.
    (re.compile(r"^'"), ""),
    # Finally drop everything except digits, ASCII letters, ', - and space.
    (re.compile(r"[^0-9a-zA-Z'\- ]"), ""),
]


def _apply_cleanup_rules(text):
    """Apply every rule in ``cleanup_rules`` to ``text``, in order."""
    for pattern, replacement in cleanup_rules:
        text = pattern.sub(replacement, text)
    return text


data.text = data.text.apply(_apply_cleanup_rules)
print(data.text[119])

'I, too, am glad of that,' said odin.
I too am glad of that said odin


In [None]:
# 기존 연결어들도 확인 \',._\-
list(set(sum(data.text.apply(lambda x : re.findall(".{,5}\'.{,5}",x)),[])))[1:10] # 굿

["woman's gra",
 "eed I've fo",
 "din I'll wr",
 "Hatch's cal",
 " hasn't pai",
 "And I'm not",
 "Don't com",
 "Ten o'clock",
 "aster's son"]

In [None]:
list(set(sum(data.text.apply(lambda x : re.findall(".{,5}-.{,5}",x)),[])))[1:10] # 굿

[' well-infor',
 'r gun-fire ',
 'o top-boots',
 ' lady-like ',
 'Heigh-ho-hu',
 'e-box',
 ' half-cover',
 ' well-remem',
 'a pug-dog H']

In [None]:
for i in range(230,290):
    print(i,data.text[i],'\n')

230 The prisoner had got into a coach and his daughter had followed him when Mr odin's feet were arrested on the step by his asking miserably for his shoemaking tools and the unfinished shoes Madame odin immediately called to her husband that she would get them and went knitting out of the lamplight through the courtyard She quickly brought them down and handed them in and immediately afterwards leaned against the door-post knitting and saw nothing 

231 Yes when I left her I told her so 

232 I carried you in my arms as a baby he observed 

233 Next moment he was gone 

234 Me cried odin the blood starting to his face I give ye the lie 

235 Oh no particular reason I meant to ask you before many people are unbelievers nowadays especially Russians I have been told You ought to know youve lived abroad 

236 Well was the reply ye can put it down upon the doorstep and be off with ye 

237 Dont think of it Its shameful to ask the question How is it possible to pray for the peace of a livin

In [None]:
hand_token = data.text.apply(lambda x : x.split())

### b. tokenize 사용 및 비교

In [None]:
data = pd.read_csv('nlp_data/practice.csv')

In [None]:
data.text = data.text.apply(lambda x : re.sub(r"[^A-Za-z0-9\-\' ]", '', x))

In [None]:
#토큰화 간 비교
tokenizer1 = TreebankWordTokenizer()
tokenizer2 = WordPunctTokenizer()
# 3 : text_to_word_sequence()

for i in [235, 272, 284]:
    print(i,tokenizer1.tokenize(data.text[i]),'\n')
    print(i,tokenizer2.tokenize(data.text[i]),'\n')
    print(i,text_to_word_sequence(data.text[i]),'\n')

235 ['Oh', 'no', 'particular', 'reason', 'I', 'meant', 'to', 'ask', 'you', 'before', '--', 'many', 'people', 'are', 'unbelievers', 'nowadays', 'especially', 'Russians', 'I', 'have', 'been', 'told', 'You', 'ought', 'to', 'know', '--', 'youve', 'lived', 'abroad'] 

235 ['Oh', 'no', 'particular', 'reason', 'I', 'meant', 'to', 'ask', 'you', 'before', '--', 'many', 'people', 'are', 'unbelievers', 'nowadays', 'especially', 'Russians', 'I', 'have', 'been', 'told', 'You', 'ought', 'to', 'know', '--', 'youve', 'lived', 'abroad'] 

235 ['oh', 'no', 'particular', 'reason', 'i', 'meant', 'to', 'ask', 'you', 'before', 'many', 'people', 'are', 'unbelievers', 'nowadays', 'especially', 'russians', 'i', 'have', 'been', 'told', 'you', 'ought', 'to', 'know', 'youve', 'lived', 'abroad'] 

272 ['Leave', 'him', 'to', 'me', 'said', 'odin', 'He', "'s", 'my', 'man', 'and', 'I', "'ll", 'get', 'him', 'sure', 'if', 'I', 'have', 'to', 'wait', 'a', 'year', 'for', 'him'] 

272 ['Leave', 'him', 'to', 'me', 'said', 'o

In [None]:
for i in [235, 272, 284]:
    print(i,text_to_word_sequence(data.text[i]),'\n')
    print(i,hand_token[i],'\n')
    
#keras tokenization이 좋은것 같다.

235 ['oh', 'no', 'particular', 'reason', 'i', 'meant', 'to', 'ask', 'you', 'before', 'many', 'people', 'are', 'unbelievers', 'nowadays', 'especially', 'russians', 'i', 'have', 'been', 'told', 'you', 'ought', 'to', 'know', 'youve', 'lived', 'abroad'] 

235 ['Oh', 'no', 'particular', 'reason', 'I', 'meant', 'to', 'ask', 'you', 'before', 'many', 'people', 'are', 'unbelievers', 'nowadays', 'especially', 'Russians', 'I', 'have', 'been', 'told', 'You', 'ought', 'to', 'know', 'youve', 'lived', 'abroad'] 

272 ['leave', 'him', 'to', 'me', 'said', 'odin', "he's", 'my', 'man', 'and', "i'll", 'get', 'him', 'sure', 'if', 'i', 'have', 'to', 'wait', 'a', 'year', 'for', 'him'] 

272 ['Leave', 'him', 'to', 'me', 'said', 'odin', "He's", 'my', 'man', 'and', "I'll", 'get', 'him', 'sure', 'if', 'I', 'have', 'to', 'wait', 'a', 'year', 'for', 'him'] 

284 ['but', 'how', 'came', 'this', 'asked', 'odin', 'open', 'eyed', 'with', 'astonishment'] 

284 ['But', 'how', 'came', 'this', 'asked', 'odin', 'open-eyed',

In [None]:
# Final preprocessing code
data = pd.read_csv('nlp_data/practice.csv')


def _to_word_sequence(raw_text):
    """Replace dash runs with a space, drop disallowed characters, then tokenize."""
    spaced = re.sub("-{2,}", ' ', raw_text)
    kept = re.sub(r"[^A-Za-z0-9\-\' ]", '', spaced)
    return text_to_word_sequence(kept)


data.text = data.text.apply(_to_word_sequence)
data.text

0        [he, was, almost, choking, there, was, so, muc...
1               [your, sister, asked, for, it, i, suppose]
2        [she, was, engaged, one, day, as, she, walked,...
3        [the, captain, was, in, the, porch, keeping, h...
4        [have, mercy, gentlemen, odin, flung, up, his,...
                               ...                        
54874    [is, that, you, mr, smith, odin, whispered, i,...
54875    [i, told, my, plan, to, the, captain, and, bet...
54876    [your, sincere, well, wisher, friend, and, sis...
54877        [then, you, wanted, me, to, lend, you, money]
54878    [it, certainly, had, not, occurred, to, me, be...
Name: text, Length: 54879, dtype: object

### A-2. normalize ( capitals, shortwords, stopwords, lemma..)
### capitals

In [None]:
# Collect the upper-case tokens
def check_caps(all_list):
    """Return the items of ``all_list`` for which ``str.isupper()`` is true.

    Args:
        all_list: Iterable of strings (tokens).

    Returns:
        A new list with the matching items in their original order;
        duplicates are kept.
    """
    return [token for token in all_list if token.isupper()]

In [None]:
set(sum(hand_token.apply(lambda x : check_caps(x)),[]))
# US VI VII VIII X XI 등 소문자로 바꾸면 안되는것들이 있음

{'126B',
 '146M',
 '16A',
 '221B',
 '7000L',
 '77B',
 'A',
 'ABOUT',
 'ACCIDENT',
 'ACCOUNT',
 'ACQUAINTANCE',
 'ACQUAINTANCES',
 'ACQUAINTED',
 'ACQUITTED',
 'ACROSS',
 'ACTION',
 'ADLER',
 'ADMINISTERING',
 'ADMIRED',
 'ADVENTURE',
 'ADVENTURES',
 'AFFORDING',
 'AFTER',
 'AINT',
 'AIR',
 'ALL',
 'ALMOST',
 'ALONE',
 'ALREADY',
 'ALTAMONT',
 'ALWAYS',
 'AM',
 'AMEND-ALL',
 'AMERICA',
 'AN',
 'AND',
 'ANOTHER',
 'ANY',
 'APPEARS',
 'APPERTAINING',
 'ARDOLIONOVITCH',
 'ARE',
 'ARREST',
 'ARRIVES',
 'ARTFUL',
 'ARTS',
 'AS',
 'ASIA',
 'ASSASSIN',
 'ASSOCIATES',
 'ASTONISHES',
 'AT',
 'ATTENDING',
 "AUTHOR'S",
 'AWAKING',
 'B',
 'BACHELOR',
 'BAND',
 'BANDBOX',
 'BARSAD',
 'BE',
 'BEADLE',
 'BEAULIEU',
 'BECAME',
 'BECOMES',
 'BEECHES',
 'BEEN',
 'BEFELL',
 'BEFORE',
 'BEG',
 'BEGAN',
 'BEING',
 'BELOW',
 'BELT',
 'BERTHELINI',
 'BERYL',
 'BETTER',
 'BETWEEN',
 'BIRCHES',
 'BIRTH',
 'BLACKSMITH',
 'BLINDS',
 'BLUE',
 'BOARD',
 'BOHEMIA',
 'BON',
 'BOOK',
 'BORN',
 'BOTH',
 'BRIG',
 'BRING

In [None]:
# Flatten the per-row token lists into one list. The nested comprehension is
# linear, whereas sum(lists, []) copies the growing result for every row.
hand_tokens = [token for row_tokens in hand_token for token in row_tokens]

In [None]:
# Rewrite the abbreviation "US" before lowercasing so it does not end up as
# "us". Only tokens that are exactly "US" are mapped; a substring replacement
# would also corrupt words such as "HOUSE" or "BUSINESS".
hand_tokens = ["usa" if x == "US" else x for x in hand_tokens]

# Everything else is lowercased.
hand_tokens = [x.lower() for x in hand_tokens]

### shortwords

In [None]:
# 2자 이상 단어만
hand_tokens = [i for i in hand_tokens if len(i)>1]

### stopwords

In [None]:
# Remove stop words
stop_words_list = stopwords.words('english')
# Set membership is O(1); the list would be scanned once per token.
stop_words_set = set(stop_words_list)
hand_tokens = [i for i in hand_tokens if i not in stop_words_set]

### lemmatization

In [None]:
# 표제어 추출
lemmatizer = WordNetLemmatizer()
lemma_toks = [lemmatizer.lemmatize(x) for x in hand_tokens]
print(len(set(hand_tokens)),len(set(lemma_toks))) #단어가 줄긴했음

39444 35020


#### 기존의 표제어 추출보다 더 빡세게(?) 추출하는 함수
기존 lemmatizer는 제대로 된 표제어 추출을 위해 품사를 직접 지정해 주어야 했다. 품사를 지정하지 않으면 모든 단어를 명사로 간주하므로, -ing형과 원형이 서로 다른 단어로 남는다.  
하지만 이 함수는 pos_tagging으로 구한 품사를 4개로 통일한다.  

In [None]:
# Lemmatize with POS Tag
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Map a single word to a WordNet part-of-speech constant.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased first
    letter of the resulting tag (e.g. ``V`` from ``VBD``) selects the constant.

    Args:
        word: The token to tag.

    Returns:
        ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or ``wordnet.ADV``;
        ``wordnet.NOUN`` when the first letter is none of J, N, V, R.
    """
    # Only one token is passed in, so the tag is read from the first result.
    tagged_pairs = nltk.pos_tag([word])
    first_letter = tagged_pairs[0][1][0].upper()

    # First tag letter -> WordNet part of speech.
    wordnet_by_letter = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }

    if first_letter in wordnet_by_letter:
        return wordnet_by_letter[first_letter]
    return wordnet.NOUN

# 단어 표제어 추출
lemmatizer = WordNetLemmatizer()
word = 'was'
print(lemmatizer.lemmatize(word, get_wordnet_pos(word)))

# 문장 표제어 추출
sentence = "i was a car"
print([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(sentence)])

be
['i', 'be', 'a', 'car']


### stemming

### 최종 정리 함수

In [None]:
# Tokenize and normalize the text column of a DataFrame
def all_process(datas, keep_capitals=()):
    """Clean, tokenize and normalize ``datas.text`` into one flat token list.

    Steps, in order: clean each row with ``only_alphanum``, tokenize it with
    ``tokenizer.tokenize``, flatten all rows, lowercase, drop tokens shorter
    than two characters, drop stop words, and lemmatize with
    ``WordNetLemmatizer`` (no part-of-speech argument is passed).

    NOTE(review): ``only_alphanum`` and ``tokenizer`` are not defined in the
    visible part of this notebook. They must exist in the global namespace
    (``tokenizer`` needs a ``tokenize`` method) before this function is
    called -- confirm.

    Args:
        datas: Object whose ``text`` attribute is a pandas Series of strings.
            It is not modified.
        keep_capitals: Optional collection of exact tokens (e.g. ``{"US"}``)
            that keep their original casing. Defaults to empty, i.e. every
            token is lowercased.

    Returns:
        list[str]: Lemmatized tokens of all rows, concatenated in row order.
    """
    # Work on a derived Series so the caller's DataFrame is left untouched.
    cleaned = datas.text.apply(only_alphanum)
    tokens = cleaned.apply(lambda x: tokenizer.tokenize(x))

    # Flatten the per-row token lists.
    tok = [t for row_tokens in tokens for t in row_tokens]

    # Lowercase everything except the explicitly protected tokens.
    protected = set(keep_capitals)
    tok = [t if t in protected else t.lower() for t in tok]

    # Keep only tokens of at least two characters.
    tok = [t for t in tok if len(t) > 1]

    # Set lookup instead of scanning the stop-word list for every token.
    stop_words = set(stop_words_list)
    tok = [t for t in tok if t not in stop_words]

    lemmatizer = WordNetLemmatizer()
    lemma_toks = [lemmatizer.lemmatize(x) for x in tok]

    return lemma_toks

### 정수 인코딩

In [None]:
# NOTE(review): this rebinds the name `Tokenizer` from the imported class to an
# instance, so the class can no longer be instantiated under this name in the
# same session. Later cells read the fitted instance through this name.
Tokenizer = Tokenizer()
# lemma_toks is a flat list of tokens; each token is passed as one text.
Tokenizer.fit_on_texts(lemma_toks)

In [None]:
Tokenizer.word_counts

OrderedDict([('almost', 1140),
             ('choking', 15),
             ('much', 3359),
             ('wanted', 525),
             ('say', 4065),
             ('strange', 734),
             ('exclamation', 100),
             ('came', 2625),
             ('lip', 446),
             ('pole', 77),
             ('gazed', 145),
             ('fixedly', 24),
             ('bundle', 115),
             ('note', 531),
             ('hand', 3906),
             ('looked', 2148),
             ('odin', 38561),
             ('evident', 149),
             ('perplexity', 60),
             ('sister', 978),
             ('asked', 2130),
             ('suppose', 749),
             ('engaged', 255),
             ('one', 7912),
             ('day', 2979),
             ('walked', 656),
             ('perusing', 5),
             ('janes', 18),
             ('last', 2586),
             ('letter', 1108),
             ('dwelling', 34),
             ('passage', 302),
             ('proved', 142),
             (

In [None]:
# Encode with the fitted instance, which this notebook binds to the name
# `Tokenizer`; no lowercase `tokenizer` holding a fitted Keras tokenizer is
# defined in the visible cells, so the previous call raised NameError on a
# fresh run.
encoded = Tokenizer.texts_to_sequences(lemma_toks)

In [None]:
encoded[1:10]

[[6674], [20], [20], [324], [13], [232], [1725], [37], [401]]