<a href="https://colab.research.google.com/github/seojeongyun/Word2Vec/blob/main/Word2Vec_from_Gensim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import urllib

if __name__ == '__main__':
    # 네이버 영화리뷰 데이터 다운로드
    urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt",
                               filename="/content/drive/My Drive/ratings_train.txt")
    urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt",
                               filename="/content/drive/My Drive/ratings_test.txt")

In [4]:
!pip install konlpy # 한국어 형태소 분석기

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m71.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (488 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m488.6/488.6 kB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.0 konlpy-0.6.0


In [119]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import re
import numpy as np
from tqdm import tqdm
import urllib
from sklearn.model_selection import train_test_split
from konlpy.tag import Okt

class data_loader:
    def __init__(self, type: str):
        self.train_data = pd.read_table('/content/drive/My Drive/ratings_train.txt')    # By using pandas, data save
        self.test_data = pd.read_table('/content/drive/My Drive/ratings_test.txt')      # By using pandas, data save
        self.stopwords = ['의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과', '도', '를', '으로', '자', '에', '와', '한', '하다'] # Set stopwords
        self.Okt = Okt() # Tokenizer
        self.mode = self.set_mode(type)

    def set_mode(self, type: str):  # Set the dataset type (train or test)
        if type == 'train':
            self.data = self.train_data
        elif type == 'test':
            self.data = self.test_data
        else:
            raise NotImplementedError

        return type

    def print_data_len(self):
        print('The number of reviews from {} dataset : '.format(self.mode), len(self.data))

    def check_duplication(self):  # Check duplications column of document and label
        print('There are {} unique value in document column of {} dataset'.format(self.data['document'].nunique(), self.mode))

    def process(self):
        # Remove duplications column of document
        self.data.drop_duplicates(subset=['document'], inplace=True)
        print('The number of {} dataset after removal for duplications : '.format(self.mode),len(self.data))

        # Remove the samples with null value
        self.data = self.data.dropna(how='any')
        print('\n\nThe number of {} dataset after removal for null values : '.format(self.mode),len(self.data))

        # Remove special characters with regular expression
        print('\n\nBefore removal of special charaters')
        print(self.data['document'])

        self.data['document'] = self.data['document'].str.replace(pat=r'[^\w]', repl=r'', regex=True)
        print('\n\nAfter removal of special charaters')
        print(self.data['document'])

        # Change the white space value to Null value and then remove
        # When the comment is written only special characters, some comments can be changed to white space.
        self.data['document'] = self.data['document'].str.replace('^ +', "")  # change the white space to empty value
        self.data['document'].replace('', np.nan, inplace=True)

        if self.data.isnull().sum().iloc[1] != 0: # in document column
            print('\n\nThe number of null values in {} dataset : '.format(self.mode), self.data.isnull().sum().iloc[1])
            print('Remove the samples with null value')
            self.data = self.data.dropna(how='any')

        else:
            print('\n\n\n')

        print('\n\nTotal length of the data : ', len(self.data))
        print('\n\n\n')

    def get_item(self):
        removed_stopword = []
        for sentence in tqdm(self.data['document']):
            tokenized_sentence = (self.Okt.morphs(sentence, stem=True))  # tokenize
            stopwords_removed_sentence = [word for word in tokenized_sentence if not word in self.stopwords]  # remove stopwords
            removed_stopword.append(stopwords_removed_sentence)

        y_train = np.array(self.data['label']) # get labels (0 or 1)

        return removed_stopword, y_train

In [120]:
data_loader = data_loader(type='train')

In [121]:
data_loader.print_data_len()

The number of reviews from train dataset :  150000


In [122]:
data_loader.check_duplication()

There are 146182 unique value in document column of train dataset


In [123]:
data_loader.process()

The number of train dataset after removal for duplications :  146183


The number of train dataset after removal for null values :  146182


Before removal of special charaters
0                                       아 더빙.. 진짜 짜증나네요 목소리
1                         흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나
2                                         너무재밓었다그래서보는것을추천한다
3                             교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정
4         사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...
                                ...                        
149995                                  인간이 문제지.. 소는 뭔죄인가..
149996                                        평점이 너무 낮아서...
149997                      이게 뭐요? 한국인은 거들먹거리고 필리핀 혼혈은 착하다?
149998                          청춘 영화의 최고봉.방황과 우울했던 날들의 자화상
149999                             한국 영화 최초로 수간하는 내용이 담긴 영화
Name: document, Length: 146182, dtype: object


After removal of special charaters
0                                             아더빙진짜짜증나네요목소리
1                   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data['document'] = self.data['document'].str.replace(pat=r'[^\w]', repl=r'', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data['document'] = self.data['document'].str.replace('^ +', "")  # change the white space to empty value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data['document'].replace('', np.nan, inplace=True)


In [125]:
# Check the ratio for class
data_loader.data['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,73286
1,72800


In [126]:
# Check the null value from train set
print(data_loader.data.isnull().values.any())
print(data_loader.data.isnull().sum())
data_loader.data.loc[data_loader.data.document.isnull()]

False
id          0
document    0
label       0
dtype: int64


Unnamed: 0,id,document,label


In [127]:
x_input,y_label = data_loader.get_item()

100%|██████████| 146086/146086 [1:06:38<00:00, 36.54it/s]


# gensim 라이브러리의 Word2Vec 모델 파라미터

**sentences** = 입력 데이터 문장.

**workers** : 실행할 병렬 프로세스의 수.

**size** : 각 단어에 대한 임베딩 된 벡터차원 정의. 만약 size=2 라면 한 문장의 벡터는 [-0.1248574, 0.255778]와 같은 형태를 가지게 된다.

**min_count** : 단어에 대한 최소 빈도수. min_count=5라면 빈도수 5 이하 무시

**window** : 문맥 윈도우 수, 양쪽으로 몇 개의 단어까지 고려해서 의미를 파악할 것인지 지정하는 것

**sample** : 빠른 학습을 위해 정답 단어 라벨에 대한 다운샘플링 비율을 지정하는 것, 보통 0.001이 좋은 성능을 낸다고 한다.

**sg** : 1이면 skip-gram 방법을 사용하고, 0이면 CBOW 방법을 사용한다.

***iter*** : epoch와 같은 뜻으로 학습 반복 횟수를 지정한다.

In [131]:
from gensim.models import Word2Vec

model = Word2Vec(sentences = x_input, vector_size = 300, window = 5, min_count = 3, workers = 4, sg = 1, sample=0.001)


In [135]:
print(model.wv.most_similar("송강호"))

[('한석규', 0.8584898710250854), ('유해진', 0.8473111987113953), ('액션연기', 0.842033326625824), ('문채원', 0.839417576789856), ('안성기', 0.8384109139442444), ('신들리다', 0.8357061147689819), ('패닝', 0.8349151611328125), ('윤계상', 0.8343048095703125), ('류덕환', 0.8294517993927002), ('김승우', 0.8286385536193848)]
