<a href="https://colab.research.google.com/github/silverstar0727/NLP_study/blob/main/ch4_LogisticRegression_word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
cd /content/drive/My Drive/kaggle_nlp

/content/drive/My Drive/kaggle_nlp


In [None]:
import os
import re

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Location of the cleaned training data
DATA_IN_PATH = 'after_preprocessing/'
TRAIN_CLEAN_DATA = 'train_clean.csv'

train_data = pd.read_csv(DATA_IN_PATH + TRAIN_CLEAN_DATA)

reviews = train_data['review'].tolist()
sentiments = train_data['sentiment'].tolist()

# Split every review on whitespace: one list of words per review
sentences = [review.split() for review in reviews]

In [None]:
# Hyperparameters for Word2Vec training
num_features = 300   # dimensionality of each word vector (embedding size)
min_word_count = 40  # words seen fewer times than this are dropped from the vocabulary
num_workers = 4      # number of parallel workers used for training
context = 10         # context window size
# Down-sampling threshold for frequent words. The previous value, 10e-3 (= 0.01),
# was so large that no word was down-sampled at all, making the setting a no-op.
downsampling = 1e-3

In [None]:
# Turn on INFO-level logging so the progress of the Word2Vec training is visible
import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(message)s',
)

In [None]:
from gensim.models import word2vec

# Train the word embeddings on the tokenized reviews
model = word2vec.Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

2020-09-30 04:03:39,347 : collecting all words and their counts
2020-09-30 04:03:39,350 : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-09-30 04:03:39,676 : PROGRESS: at sentence #10000, processed 1205223 words, keeping 51374 word types
2020-09-30 04:03:39,983 : PROGRESS: at sentence #20000, processed 2396605 words, keeping 67660 word types
2020-09-30 04:03:40,143 : collected 74065 word types from a corpus of 2988089 raw words and 25000 sentences
2020-09-30 04:03:40,144 : Loading a fresh vocabulary
2020-09-30 04:03:40,209 : effective_min_count=40 retains 8160 unique words (11% of original 74065, drops 65905)
2020-09-30 04:03:40,210 : effective_min_count=40 leaves 2627273 word corpus (87% of original 2988089, drops 360816)
2020-09-30 04:03:40,242 : deleting the raw counts dictionary of 74065 items
2020-09-30 04:03:40,245 : sample=0.01 downsamples 0 most-common words
2020-09-30 04:03:40,253 : downsampling leaves estimated 2627273 word corpus (100.0% of prior 2627

In [None]:
def get_features(words, model, num_features):
    """Average the word vectors of the in-vocabulary words of one review.

    Args:
        words: iterable of tokens (strings) making up the review.
        model: trained embedding model whose ``wv`` attribute supports
            membership tests (``word in model.wv``) and lookup by word
            (``model.wv[word]``).
        num_features: length of each word vector.

    Returns:
        A 1-D array of length ``num_features`` holding the mean of the
        vectors of all words found in the vocabulary. If none of the words
        is in the vocabulary, the all-zero vector is returned.
    """
    # Look words up through the keyed vectors directly: this avoids the
    # deprecated `model[w]` access and avoids rebuilding a set of the whole
    # vocabulary on every call.
    word_vectors = model.wv

    feature_vector = np.zeros((num_features,), dtype=np.float32)
    num_words = 0

    for w in words:
        if w in word_vectors:
            num_words += 1
            feature_vector = np.add(feature_vector, word_vectors[w])

    # Guard against 0/0: without it a review with no known words becomes NaN.
    if num_words > 0:
        feature_vector = np.divide(feature_vector, num_words)
    return feature_vector

In [None]:
def get_dataset(reviews, model, num_features):
    """Build the feature matrix for a collection of tokenized reviews.

    Each review is converted to a vector with ``get_features`` and the
    resulting vectors are stacked into one array, one row per review.
    """
    return np.stack(
        [get_features(tokens, model, num_features) for tokens in reviews]
    )

In [None]:
# NOTE: despite its name, this holds the features of the *training* reviews
# (`sentences` is built from the training CSV). The same name is assigned
# again further down for the actual test set.
test_data_vecs = get_dataset(sentences, model, num_features)

  # Remove the CWD from sys.path while we load stuff.


In [None]:
# Fraction of the labelled data held out by train_test_split (its test_size)
TEST_SPLIT = 0.2
# Seed passed as random_state to train_test_split
RANDOM_SEED = 142

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Features are the averaged word vectors, labels are the sentiment values
X, y = test_data_vecs, np.array(sentiments)

# Hold out part of the labelled data for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SPLIT,
    random_state=RANDOM_SEED,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# max_iter is raised above the library default: with the default the fit
# stopped with "TOTAL NO. of ITERATIONS REACHED LIMIT", i.e. the solver had
# not converged yet.
lgs = LogisticRegression(class_weight='balanced', max_iter=1000)
lgs.fit(X_train, y_train)

# Class predictions for the held-out split
predicted = lgs.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
from sklearn import metrics

# ROC curve and AUC are computed from the predicted probability of class 1
positive_proba = lgs.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, positive_proba)
auc = metrics.auc(fpr, tpr)

# Remaining scores are computed on the held-out split
accuracy = lgs.score(X_test, y_test)
precision = metrics.precision_score(y_test, predicted)
recall = metrics.recall_score(y_test, predicted)
f1 = metrics.f1_score(y_test, predicted)

print("------------")
print("Accuracy: %f" % accuracy)
print("Precision: %f" % precision)
print("Recall: %f" % recall)
print("F1-Score: %f" % f1)
print("AUC: %f" % auc)

# Load the cleaned test reviews from the same input directory
TEST_CLEAN_DATA = 'test_clean.csv'
test_data = pd.read_csv(DATA_IN_PATH + TEST_CLEAN_DATA)
test_review = test_data['review'].tolist()

------------
Accuracy: 0.859000
Precision: 0.862922
Recall: 0.860556
F1-Score: 0.861738
AUC: 0.935323


In [None]:
# Split the test reviews on whitespace and turn them into averaged word vectors
test_sentences = [review.split() for review in test_review]
test_data_vecs = get_dataset(test_sentences, model, num_features)

# Predict a label for every test review and pair it with the review id
test_predicted = lgs.predict(test_data_vecs)
ids = test_data['id'].tolist()
answer_dataset = pd.DataFrame({'id': ids, 'sentiment': test_predicted})

  # Remove the CWD from sys.path while we load stuff.


In [None]:
DATA_OUT_PATH = 'output/'
# exist_ok=True creates the directory only when it is missing, which keeps the
# cell safe to re-run and removes the separate check-then-create step.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# quoting=3 is csv.QUOTE_NONE: fields are written without surrounding quotes
answer_dataset.to_csv(DATA_OUT_PATH + 'lgs_w2v_answer.csv', index=False, quoting=3)