In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the raw train/test review tables (tab-separated text files).
ratings_train, ratings_test = (
    pd.read_csv(path, sep="\t")
    for path in ("data-files/ratings_train.txt", "data-files/ratings_test.txt")
)

In [3]:
# Data check 1: preview rows, shapes and label balance.
# A bare `.head()` that is not the last expression of a cell is evaluated and
# silently discarded, so the previews are printed explicitly.
print( ratings_train.head() )
print( ratings_test.head() )
print( ratings_train.shape, ratings_test.shape )
print( ratings_train['label'].mean(), ratings_test['label'].mean() )  # mean label per split
print( np.unique(ratings_train['label'], return_counts=True) )  # distinct labels and their row counts

(150000, 3) (50000, 3)
0.49884666666666666 0.50346
(array([0, 1]), array([75173, 74827]))


In [4]:
# Data check 2: dtypes and non-null counts per column (this is where the
# missing `document` values show up).
for split_frame in (ratings_train, ratings_test):
    split_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        150000 non-null  int64 
 1   document  149995 non-null  object
 2   label     150000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.4+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        50000 non-null  int64 
 1   document  49997 non-null  object
 2   label     50000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.1+ MB


In [5]:
# Missing-value handling: keep only the rows whose `document` is present.
# `.copy()` detaches each result so later column assignments work on an
# independent frame rather than on a slice of the raw data.

train_has_document = ratings_train["document"].notna()
test_has_document = ratings_test["document"].notna()

ratings_na_removed_train = ratings_train[train_has_document].copy()
ratings_na_removed_test = ratings_test[test_has_document].copy()

In [6]:
# Preprocessing 1: keep only Hangul and ASCII letters; every other character
# is replaced by a space.
#
# The character class used to be `ㄱ-힣`, which is one contiguous code-point
# range (U+3131..U+D7A3). That range also contains CJK ideographs and
# enclosed/compatibility symbols, so those characters survived the cleanup.
# The class below lists the Hangul ranges explicitly instead: consonant jamo
# (ㄱ-ㅎ), vowel jamo (ㅏ-ㅣ) and precomposed syllables (가-힣).

import re

def keep_korean_and_english(text):
    """Return `text` with every character other than Hangul or A-Z/a-z replaced by a space."""
    return re.sub("[^A-Za-zㄱ-ㅎㅏ-ㅣ가-힣]", ' ', text)

ratings_na_removed_train['document'] = \
ratings_na_removed_train['document'].map(keep_korean_and_english)
ratings_na_removed_test['document'] = \
ratings_na_removed_test['document'].map(keep_korean_and_english)

In [7]:
# Install the Korean language processing packages (uncomment and run once per environment)
# !pip install Jpype1
# !pip install konlpy

In [9]:
# Morphological analysis: sentence -> list of morphemes

from konlpy.tag import Okt # Korean analyzer used to split text into morphemes

okt = Okt()
# Preview the first remaining review. `.iloc[0]` selects by position; the
# label-based `[0]` raises KeyError whenever the row labelled 0 was removed by
# the missing-value filter.
print( okt.morphs(ratings_na_removed_train['document'].iloc[0]) )

# Replace each review string with its list of morphemes.
ratings_na_removed_train['document'] = ratings_na_removed_train['document'].map(okt.morphs)
ratings_na_removed_test['document'] = ratings_na_removed_test['document'].map(okt.morphs)

['아', '더빙', '진짜', '짜증나네요', '목소리']


In [10]:
# Persist the tokenized frames as CSV files (row index omitted).
for tokenized_frame, csv_path in (
    (ratings_na_removed_train, "data-files/ratings_train2.csv"),
    (ratings_na_removed_test, "data-files/ratings_test2.csv"),
):
    tokenized_frame.to_csv(csv_path, index=False)

In [8]:
# Read the tokenized data back from the intermediate CSV files.
ratings_train2, ratings_test2 = map(
    pd.read_csv,
    ("data-files/ratings_train2.csv", "data-files/ratings_test2.csv"),
)

In [9]:
# Problem check: after the CSV round trip the `document` column holds strings,
# so it has to be converted back into lists.
ratings_train2.info()
ratings_train2.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149995 entries, 0 to 149994
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        149995 non-null  int64 
 1   document  149995 non-null  object
 2   label     149995 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.4+ MB


Unnamed: 0,id,document,label
0,9976970,"['아', '더빙', '진짜', '짜증나네요', '목소리']",0
1,3819312,"['흠', '포스터', '보고', '초딩', '영화', '줄', '오버', '연기'...",1
2,10265843,"['너', '무재', '밓었', '다그', '래서', '보는것을', '추천', '한...",0
3,9045019,"['교도소', '이야기', '구먼', '솔직히', '재미', '는', '없다', '...",0
4,6483659,"['사이', '몬페', '그', '의', '익살스런', '연기', '가', '돋보였...",1


In [10]:
# Convert the `document` column: string -> list

import ast

print( "['123', '456', '789']"[1] )  # indexing the raw string yields a single character
print( ['123', '456', '789'][1] )    # indexing a real list yields an element
# ast.literal_eval : string -> Python literal. Unlike eval() it only accepts
# literals and never executes code, which matters for text read from a file.
print( ast.literal_eval("['123', '456', '789']")[1] )

ratings_train2['document'] = ratings_train2['document'].map(ast.literal_eval)
ratings_test2['document'] = ratings_test2['document'].map(ast.literal_eval)


'
456
456


In [11]:
# Verify the conversion: type of the `document` value labelled 0 in each split.
tuple(type(frame['document'][0]) for frame in (ratings_train2, ratings_test2))

(list, list)

In [12]:
# Token list -> single space-separated string.
# The column is overwritten in place, so run this cell once per load.
for token_frame in (ratings_train2, ratings_test2):
    token_frame['document'] = token_frame['document'].str.join(' ')

In [14]:
# Save the preprocessing result to files (row index omitted).
for processed_frame, csv_path in (
    (ratings_train2, "data-files/processed_ratings_train.csv"),
    (ratings_test2, "data-files/processed_ratings_test.csv"),
):
    processed_frame.to_csv(csv_path, index=False)

In [2]:
# Reload the preprocessed data and drop every row that contains a missing value.
# The surviving rows keep their original index labels (the index is not reset).
ratings_train3 = pd.read_csv("data-files/processed_ratings_train.csv").dropna()
ratings_test3 = pd.read_csv("data-files/processed_ratings_test.csv").dropna()

In [3]:
# Text -> numbers (encoding), using unigram + bigram features throughout.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

train_docs, test_docs = ratings_train3['document'], ratings_test3['document']

# Raw n-gram counts; the vocabulary is fitted on the training documents only.
cv = CountVectorizer(ngram_range=(1, 2))
cv_train_counts = cv.fit_transform(train_docs)
cv_test_counts = cv.transform(test_docs)

# TF-IDF features computed straight from the text.
tv = TfidfVectorizer(ngram_range=(1, 2))
tv_train_counts = tv.fit_transform(train_docs)
tv_test_counts = tv.transform(test_docs)

# TF-IDF re-weighting applied to the count matrices produced by `cv`.
tt = TfidfTransformer()
tt_train_counts = tt.fit_transform(cv_train_counts)
tt_test_counts = tt.transform(cv_test_counts)

In [40]:
# Vocabulary size and container type, then the last ten keys of the
# token -> column-index mapping.
vocabulary = cv.vocabulary_
print( len(vocabulary), type(vocabulary) )
print( list(vocabulary)[-10:] )

818928 <class 'dict'>
['거들 먹거리', '먹거리 필리핀', '필리핀 혼혈', '혼혈 착하다', '최고봉 방황', '방황 우울했던', '우울했던 자화상', '수간', '최초 수간', '수간 하는']


In [None]:
# Inverse vocabulary: column index -> token.
idx_to_token = { idx: token for token, idx in cv.vocabulary_.items() }

# Position of the test document to inspect. `ratings_test3` went through
# dropna() without an index reset, so `['document'][n]` would look a row up by
# label while the rows of `cv_test_counts` are positional; `.iloc` keeps the
# text and the matrix row aligned.
sample_row = 2
print( ratings_test3['document'].iloc[sample_row] )

# Decode the first three stored column ids of that row instead of hard-coding
# vocabulary indices, which change whenever the vocabulary is refitted.
print( *(idx_to_token[col] for col in cv_test_counts[sample_row].indices[:3]) )
print( cv_test_counts )

뭐 야 이 평점 들 은 나쁘진 않지만 점 짜 리 는 더 더욱 아니잖아
나쁘진 나쁘진 않지만 더욱
<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 561244 stored elements and shape (49874, 818928)>
  Coords	Values
  (2, 111482)	1
  (2, 111500)	1
  (2, 171807)	1
  (2, 425319)	1
  (2, 446076)	1
  (2, 750576)	1
  (3, 223683)	1
  (3, 301880)	1
  (3, 302241)	1
  (3, 445599)	1
  (3, 480792)	1
  (3, 525390)	1
  (3, 525658)	1
  (3, 669106)	1
  (3, 669178)	1
  (3, 685301)	1
  (4, 115910)	1
  (4, 154647)	1
  (4, 337067)	1
  (4, 410384)	1
  (4, 425039)	1
  (4, 680155)	1
  (4, 776323)	1
  (5, 505550)	1
  (5, 553761)	2
  :	:
  (49871, 303442)	1
  (49871, 336671)	1
  (49871, 336672)	1
  (49871, 525180)	1
  (49871, 525219)	1
  (49871, 662523)	1
  (49872, 91203)	1
  (49872, 91544)	1
  (49872, 367496)	1
  (49872, 367548)	1
  (49872, 376549)	1
  (49872, 376578)	1
  (49872, 470323)	1
  (49872, 470429)	1
  (49872, 482763)	1
  (49872, 505550)	1
  (49872, 511656)	1
  (49872, 622112)	1
  (49872, 622664)	1
  (49872, 641504)	1
  (49872, 641662

In [41]:
# Model training and evaluation: logistic regression on the TF-IDF features.
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression().fit(tv_train_counts, ratings_train3['label'])

# Score on the training split first, then on the test split.
for features, labelled_frame in (
    (tv_train_counts, ratings_train3),
    (tv_test_counts, ratings_test3),
):
    print( logreg.score(features, labelled_frame['label']) )

0.9174202172096909
0.8390544171311706
