<a href="https://colab.research.google.com/github/pms512/sentimental_analysis/blob/main/%EA%B0%90%EC%84%B1%EB%B6%84%EB%A5%98_%EC%8B%A4%EC%8A%B5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Install a blanket "ignore" filter: every warning raised after this line is suppressed.
warnings.filterwarnings("ignore")

In [5]:
# Download the review CSV from the remote URL and load it into the DataFrame `df`.
df = pd.read_csv("https://raw.githubusercontent.com/yoonkt200/FastCampusDataset/master/tripadviser_review.csv")

In [6]:
# Preview the first rows of the loaded reviews.
df.head()

Unnamed: 0,rating,text
0,4,여행에 집중할수 있게 편안한 휴식을 제공하는 호텔이었습니다. 위치선정 또한 적당한 ...
1,4,"2일 이상 연박시 침대, 이불, 베게등 침구류 교체 및 어메니티 보강이 필요해 보입..."
2,4,지인에소개로온 호텔 깨끗하고 좋은거같아요 처음에는 없는게 많아 많이 당황했는데 ...
3,5,방에 딱 들어서자마자 눈이 휘둥그레질정도로 이렇게 넓은 호텔 처음 와본 것 같아요!...
4,5,저녁에 맥주한잔 하는게 좋아서 렌트 안하고 뚜벅이 하기로 했는데 호텔 바로 앞에 버...


In [None]:
# (number of rows, number of columns) of the review table.
df.shape

In [None]:
# Count of missing values in each column.
df.isna().sum()

In [None]:
# Print the per-column dtype / non-null summary of the frame.
df.info()

In [None]:
# Text of the review stored under index label 0.
df.loc[0, 'text']

In [None]:
# Text of the review stored under index label 100.
df.loc[100, 'text']

In [None]:
# Total number of characters across all reviews (reviews concatenated back to back).
len(''.join(df['text']))

In [None]:
#한글 형태소 분석기 라이브러리
!pip install konlpy==0.5.1 jpype1 jpype1-py3

In [None]:
#정규 표현식을 활용한 데이터 정제
import re

def apply_regular_expression(text):
    """Return `text` with everything except spaces and Hangul removed.

    Characters kept: the space character, the jamo range ㄱ-ㅣ and the
    syllable range 가-힣. Every other character (Latin letters, digits,
    punctuation, newlines, ...) is deleted, not replaced.
    """
    return re.sub('[^ ㄱ-ㅣ가-힣]', '', text)

In [None]:
# Show the first review after stripping everything but spaces and Hangul.
apply_regular_expression(df.loc[0, 'text'])

'여행에 집중할수 있게 편안한 휴식을 제공하는 호텔이었습니다 위치선정 또한 적당한 편이었고 청소나 청결상태도 좋았습니다'

In [None]:
#명사 형태소 추출
from konlpy.tag import Okt
from collections import Counter

# One shared Okt tagger instance, reused by the cells below.
nouns_tagger = Okt()
# Nouns found in the regex-cleaned first review.
nouns = nouns_tagger.nouns(apply_regular_expression(df.loc[0, 'text']))
nouns

['여행', '집중', '휴식', '제공', '호텔', '위치', '선정', '또한', '청소', '청결', '상태']

In [None]:
# Extract nouns from the whole corpus (all reviews merged into one string).
# Reviews are joined with a space: with an empty separator the last word of one
# review fuses with the first word of the next and yields a bogus token.
nouns = nouns_tagger.nouns(apply_regular_expression(" ".join(df['text'].tolist())))

In [None]:
# Tally how many times each extracted noun occurs.
counter = Counter(nouns)
# Display the 20 most frequent nouns together with their counts.
counter.most_common(20)

In [None]:
# Drop single-character nouns, keeping the original counts of the rest.
available_counter = Counter(
    {noun: freq for noun, freq in counter.items() if len(noun) >= 2}
)
available_counter.most_common(10)

In [None]:
# Stopword removal: load the Korean stopword list.
# DataFrame.values.tolist() returns one single-element list per row
# (e.g. [['word'], ...]); a str never equals a list, so a later
# `noun not in stopwords` test could not match any loaded stopword.
# Taking the first column gives a flat list of strings instead.
# NOTE(review): read_csv treats the first line of the file as a header, so that
# entry is not part of the list — confirm whether the file has a header row.
stopwords = pd.read_csv('https://raw.githubusercontent.com/yoonkt200/FastCampusDataset/master/korean_stopwords.txt').iloc[:, 0].tolist()
stopwords[:10]

In [None]:
# Domain-specific words to treat as stopwords as well; they are appended to
# the existing `stopwords` list in place.
jeju_list = ['제주', '제주도', '호텔', '리뷰', '숙소', '여행', '트립']
stopwords.extend(jeju_list)

In [None]:
#BoW 벡터 생성
from sklearn.feature_extraction.text import CountVectorizer

def text_cleaning(text):
    """Tokenize one review into the nouns used as bag-of-words features.

    Steps:
      1. Delete every character that is not a space or Hangul.
      2. Extract nouns with the shared module-level `nouns_tagger`.
      3. Keep nouns longer than one character that are not in the
         module-level `stopwords` collection.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Filtered nouns, in the order the tagger produced them.
    """
    hangul_only = re.sub('[^ ㄱ-ㅣ가-힣]', '', text)
    # Reuse the shared tagger. The previous version also constructed a fresh
    # Okt() on every call and never used it, which only added per-document cost.
    return [
        noun
        for noun in nouns_tagger.nouns(hangul_only)
        if len(noun) > 1 and noun not in stopwords
    ]

# Build the bag-of-words matrix, using text_cleaning as the tokenizer.
vect = CountVectorizer(tokenizer=text_cleaning)
bow_vect = vect.fit_transform(df['text'].tolist())

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on old versions that lack it. Keep word_list a plain list.
if hasattr(vect, 'get_feature_names_out'):
    word_list = vect.get_feature_names_out().tolist()
else:
    word_list = vect.get_feature_names()

# Column sums straight from the sparse matrix (no dense copy of the whole
# document-term matrix); flattened to a 1-D array of per-word totals.
count_list = np.asarray(bow_vect.sum(axis=0)).ravel()

In [None]:
# Print the document-term count matrix produced by CountVectorizer.
print(bow_vect)

In [None]:
# Pair every vocabulary word with its total count over all reviews.
word_count_dict = {word: count for word, count in zip(word_list, count_list)}
# Only the first 100 characters of the dict's text form are shown.
print(repr(word_count_dict)[:100])

In [None]:
#TF-IDF 변환
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the TF-IDF weighting on the count matrix, then apply it to the same matrix.
tfidf_vectorizer = TfidfTransformer().fit(bow_vect)
tf_idf_vect = tfidf_vectorizer.transform(bow_vect)

In [None]:
# Print the TF-IDF row of the first review.
print(tf_idf_vect[0])

In [None]:
# Vector-to-word mapping: invert the vocabulary (word -> column index)
# so a column index can be translated back into its word.
invert_index_vectorizer = {
    column: word for word, column in vect.vocabulary_.items()
}
print(repr(invert_index_vectorizer)[:100])

In [None]:
# Ratings of 4-5 are positive (1); everything else is negative (0).
def rating_to_label(rating):
    """Return 1 when `rating` is greater than 3, otherwise 0."""
    return 1 if rating > 3 else 0

# Binary sentiment label derived from the rating of each review.
df['y'] = df['rating'].map(rating_to_label)

In [None]:
# Preview the first rows again, now including the new 'y' label column.
df.head()

In [None]:
#데이터셋 분리
from sklearn.model_selection import train_test_split

y = df['y']
# 70/30 train/test split of the TF-IDF features.
# random_state pins the shuffle so the metrics below are reproducible on re-run;
# stratify keeps the positive/negative ratio identical in both partitions.
x_train, x_test, y_train, y_test = train_test_split(
    tf_idf_vect, y, test_size=0.3, random_state=33, stratify=y
)

In [None]:
#로지스틱 회귀분석
from sklearn.linear_model import LogisticRegression

# Fit a default logistic-regression classifier on the training split
# (fit() returns the estimator itself), then predict the held-out split.
lr = LogisticRegression().fit(x_train, y_train)
y_pred = lr.predict(x_test)


In [None]:
#분류 결과 평가
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score

# Printed in this order: accuracy, precision, F1, recall.
for metric in (accuracy_score, precision_score, f1_score, recall_score):
    print(metric(y_test, y_pred))

In [None]:
#confusion matrix 
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels against predictions on the test split.
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)
# The classes are imbalanced, so the data is re-sampled next.

In [None]:
# Re-sampling: draw the same number of positive and negative reviews.
positive_rows = df[df['y'] == 1]
negative_rows = df[df['y'] == 0]

# Target is 275 rows per class, but never more than the smaller class holds:
# sample() without replacement raises when asked for more rows than exist.
n_per_class = min(275, len(positive_rows), len(negative_rows))

positive_random_idx = positive_rows.sample(n_per_class, random_state=33).index.tolist()
negative_random_idx = negative_rows.sample(n_per_class, random_state=33).index.tolist()

random_idx = positive_random_idx + negative_random_idx
# tf_idf_vect is indexed by row position while random_idx holds index labels;
# the two agree only while df keeps its default 0..n-1 index.
second_x = tf_idf_vect[random_idx]
# Single label-based lookup instead of chained indexing.
second_y = df.loc[random_idx, 'y']

In [None]:
#데이터셋 분리
from sklearn.model_selection import train_test_split

# 70/30 split of the balanced sample. random_state makes the split (and the
# metrics computed from it) reproducible; stratify keeps both classes evenly
# represented in train and test.
x_train, x_test, y_train, y_test = train_test_split(
    second_x, second_y, test_size=0.3, random_state=33, stratify=second_y
)

In [None]:
#로지스틱 회귀분석
from sklearn.linear_model import LogisticRegression

# Refit a default logistic-regression classifier on the current training
# split (fit() returns the estimator itself) and predict the test split.
lr = LogisticRegression().fit(x_train, y_train)
y_pred = lr.predict(x_test)

In [None]:
#분류 결과 평가
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score

# Printed in this order: accuracy, precision, F1, recall.
for metric in (accuracy_score, precision_score, f1_score, recall_score):
    print(metric(y_test, y_pred))

In [None]:
#confusion matrix 
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels against predictions on the test split.
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)

In [None]:
# Positive/negative keyword analysis: one bar per model coefficient,
# positioned by feature index.
plt.rcParams['figure.figsize'] = (10, 8)
plt.bar(np.arange(lr.coef_.shape[1]), lr.coef_[0])

In [None]:
# (coefficient, feature index) pairs sorted from largest to smallest:
# first the five largest, then the five smallest.
print(sorted(zip(lr.coef_[0], range(lr.coef_.shape[1])), reverse=True)[:5])
print(sorted(zip(lr.coef_[0], range(lr.coef_.shape[1])), reverse=True)[-5:])

In [None]:
# (coefficient, feature index) pairs: descending order for the most positive
# terms, ascending order for the most negative ones.
coef_pos_index = sorted(zip(lr.coef_[0], range(lr.coef_.shape[1])), reverse=True)
coef_neg_index = sorted(zip(lr.coef_[0], range(lr.coef_.shape[1])))

In [None]:
# The 15 words with the largest coefficients, each with its weight.
for weight, feature_idx in coef_pos_index[:15]:
    print(invert_index_vectorizer[feature_idx], weight)

In [None]:
# The 15 words with the smallest coefficients, each with its weight.
for weight, feature_idx in coef_neg_index[:15]:
    print(invert_index_vectorizer[feature_idx], weight)