# Preprocessing


1. 원본 데이터 정리 (중복 행 제거, 모든 값이 비어 있는 행 삭제 등) 및 카테고라이징

In [None]:
import pandas as pd
import emoji

def remove_emojis(text: str) -> str:
    """Return *text* with emoji removed.

    The ``emoji.UNICODE_EMOJI`` table is not a stable API: in emoji 1.x it
    is keyed by language code, so a per-character membership test against
    it matches nothing, and in emoji 2.x it no longer exists.  When the
    installed package offers ``emoji.replace_emoji`` that function is used
    instead; it also removes multi-code-point emoji as a unit.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* without emoji.
    """
    replace_emoji = getattr(emoji, 'replace_emoji', None)
    if replace_emoji is not None:
        return replace_emoji(text, replace='')

    # Older emoji releases: fall back to the character lookup table,
    # unwrapping the per-language layer when it is present.
    table = emoji.UNICODE_EMOJI
    table = table.get('en', table)
    return ''.join(c for c in text if c not in table)

# Path of the Excel workbook to load (the literal is a placeholder meaning
# "file name to load" and must be replaced with a real path).
file_name = '불러올 파일명'

# Load the sheet, drop exact duplicate rows and rows whose cells are all
# empty, then renumber the index 0..n-1.  The renumbering has to come
# after *every* row drop: the later cells address rows with
# df.loc[i, ...] for i in range(len(df)), which needs a gap-free index.
df = (
    pd.read_excel(file_name)
    .drop_duplicates()
    .dropna(how='all', axis=0)
    .reset_index(drop=True)
)

2. Okt를 통한 토큰화

In [None]:
from konlpy.tag import Okt

okt = Okt()


def _pos_tag(text):
    """Return the str() of Okt's (token, tag) list for one 'content' cell.

    The result is stored as a string and parsed back with
    ast.literal_eval by the later cells.
    """
    # dropna(how='all') only removes rows that are empty in every column,
    # so an individual 'content' cell can still be missing; treat it as
    # having no tokens instead of crashing.
    if pd.isna(text):
        return str([])
    # str() guards against cells that Excel delivered as numbers.
    return str(okt.pos(remove_emojis(str(text)), norm=True, stem=True))


# Map over the column rather than looping with df.loc[i, ...]: this does
# not require the index labels to be exactly 0..n-1 (they can have gaps
# after rows were dropped).
df['tokenized'] = df['content'].map(_pos_tag)


3. 불용어 제거 및 명사, 동사, 부사, 형용사만 추출

In [None]:
import ast

# Part-of-speech tags to keep: nouns, verbs, adverbs and adjectives.
tags = ['Noun', 'Verb', 'Adverb', 'Adjective']
# Stemmed forms that are dropped even when their tag is kept.
stopwords = ['하다', '없다', '있다', '되다', '아니다', '같다', '이다', '않다', '그렇다', 
             '이렇다', '싶다', '다', '것', '그', '이', '거']

# Set copies for constant-time membership tests inside the filter.
_kept_tags = frozenset(tags)
_stopword_set = frozenset(stopwords)


def _content_words(serialized):
    """Return the str() of the kept words from one serialized POS list.

    *serialized* is the string form of a list of (word, tag) pairs; only
    words whose tag is in ``tags`` and that are not in ``stopwords`` are
    kept, in their original order.
    """
    return str([
        word
        for word, tag in ast.literal_eval(serialized)
        if tag in _kept_tags and word not in _stopword_set
    ])


# NOTE: this overwrites 'tokenized' (POS pairs -> plain words), so the cell
# must not be re-run without re-running the tokenization first.
# Mapping over the column avoids df.loc[i, ...] with a positional counter,
# which only works when the index labels are exactly 0..n-1.
df['tokenized'] = df['tokenized'].map(_content_words)


# Sentiment Analysis

1. 한국어 감성 사전 로드

In [None]:
# Read the sentiment lexicon; the context manager closes the file even if
# reading fails (the handle was previously left open).
with open('data/SentiWord_Dict.txt', 'r', -1, 'utf-8') as f:
    lines = f.readlines()

# score_dict holds [word, score] pairs in file order.
score_dict = []

for line in lines:
    line_splited = line.split()
    if not line_splited:
        # Blank line (e.g. a trailing newline at end of file): nothing to parse.
        continue
    # The last whitespace-separated field is the integer score; everything
    # before it is the (possibly multi-word) entry, re-joined with single
    # spaces.
    score = int(line_splited[-1])
    word = ' '.join(line_splited[:-1])
    score_dict.append([word, score])

2. 감성 지수 산출

In [None]:
import ast
# Pre-aggregate the lexicon once:
#   word -> [sum of positive scores, sum of negative scores, neutral count]
# A word that appears several times in score_dict contributes every one of
# its entries, exactly as a full scan of the list per token would.
_lexicon = {}
for _word, _score in score_dict:
    _entry = _lexicon.setdefault(_word, [0, 0, 0])
    if _score > 0:
        _entry[0] += _score
    elif _score < 0:
        _entry[1] += _score
    else:
        _entry[2] += 1  # neutral words are counted, not summed

_NO_MATCH = (0, 0, 0)


def _score_tokens(serialized):
    """Return (positive, negative, neutral) totals for one token list.

    *serialized* is the string form of a list of tokens.  ``positive`` and
    ``negative`` are sums of the matching lexicon scores; ``neutral`` is
    the number of matching zero-score entries.
    """
    positive = negative = neutral = 0
    for token in ast.literal_eval(serialized):
        pos, neg, neu = _lexicon.get(token, _NO_MATCH)
        positive += pos
        negative += neg
        neutral += neu
    return positive, negative, neutral


# One dictionary lookup per token instead of scanning the whole lexicon,
# and whole-column assignment instead of df.loc[i, ...] with a positional
# counter (which only works when the index labels are exactly 0..n-1).
_row_totals = [_score_tokens(serialized) for serialized in df['tokenized']]
df['positive'] = [totals[0] for totals in _row_totals]
df['negative'] = [totals[1] for totals in _row_totals]
df['neutral'] = [totals[2] for totals in _row_totals]

3. 지수에 따른 라벨링

In [None]:
# Net sentiment: 'negative' holds a sum of negative scores, so adding the
# two columns subtracts its magnitude from 'positive'.
df['score'] = df['positive'] + df['negative']

# Label by the sign of the net score.  Start from 'Neutral' (score of 0, or
# a score that compares as neither > 0 nor < 0) and overwrite with boolean
# masks; unlike df.loc[i, ...] with a positional counter this does not
# require the index labels to be exactly 0..n-1.
df['label'] = 'Neutral'
df.loc[df['score'] > 0, 'label'] = 'Positive'
df.loc[df['score'] < 0, 'label'] = 'Negative'