In [1]:
import pandas as pd
import numpy as np

데이터 불러오기

In [2]:
# Each line of the review file is split on a double tab; only rows that yield
# exactly three fields are kept. The second field is used as the review text
# and the third is parsed as an integer score.
with open('2016_filtered_review_part.txt', encoding='utf-8') as f:
    rows = (line.strip().split('\t\t') for line in f)
    docs = [(fields[1], int(fields[2])) for fields in rows if len(fields) == 3]

# Transpose the (text, score) pairs into two parallel sequences.
texts, scores = zip(*docs)

평점 정보를 이용해서 종속변수 레이블링 하기

In [3]:
# Build a binary sentiment target from the review score:
#   score <= 4  -> negative, label 0
#   score >= 8  -> positive, label 1
# Reviews scored 5-7 are left out of the dataset entirely.
filtered_texts, filtered_labels = [], []

for text, score in zip(texts, scores):
    is_negative = score <= 4
    is_positive = score >= 8
    if is_negative or is_positive:
        filtered_texts.append(text)
        filtered_labels.append(1 if is_positive else 0)

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labeled reviews for testing; the fixed seed keeps the
# split repeatable across runs.
# NOTE(review): the split is not stratified, so the class ratio of the test
# set may drift from the training set — consider stratify=filtered_labels.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    filtered_texts,
    filtered_labels,
    test_size=0.1,
    random_state=42,
)

In [5]:
# Count the training samples per class to make the class imbalance visible.
# np.unique returns the labels in sorted order, so index 0 is the count for
# the smallest label and index 1 the count for the next one.
values, n_samples = np.unique(train_labels, return_counts=True)
first_count = n_samples[0]
second_count = n_samples[1]
print(f'0: {first_count} \n1: {second_count} \ntotal: {len(train_labels)}')

0: 1485 
1: 16076 
total: 17561


## 불균형 문제를 해결하지 않은 경우

In [6]:
from sklearn.linear_model import LogisticRegression

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Term-count features. The vocabulary is fitted on the training texts only and
# then reused as-is to encode the test texts.
tf_vectorizer = CountVectorizer()
tf_train_features = tf_vectorizer.fit_transform(train_texts)
tf_test_features = tf_vectorizer.transform(test_texts)

In [8]:
# Baseline classifier: L2-penalized logistic regression trained on the
# original (imbalanced) data, using the saga solver with a generous iteration cap.
lr_tf_l2 = LogisticRegression(
    C=0.1,
    penalty='l2',
    solver='saga',
    max_iter=10000,
)

In [9]:
# Train the baseline model on the term-count features.
lr_tf_l2.fit(X=tf_train_features, y=train_labels)

LogisticRegression(C=0.1, max_iter=10000, solver='saga')

In [10]:
# Predict hard class labels for the held-out test set.
pred_labels_tf_l2 = lr_tf_l2.predict(X=tf_test_features)

In [11]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the baseline model on the test set.
report_tf_l2 = classification_report(test_labels, pred_labels_tf_l2)
print(report_tf_l2)

              precision    recall  f1-score   support

           0       0.73      0.23      0.35       138
           1       0.94      0.99      0.97      1814

    accuracy                           0.94      1952
   macro avg       0.84      0.61      0.66      1952
weighted avg       0.93      0.94      0.92      1952



In [12]:
# Class-membership probabilities for the test set (one column per class).
pred_probs = lr_tf_l2.predict_proba(X=tf_test_features)

In [13]:
from sklearn.metrics import roc_auc_score

# ROC AUC is scored on the second probability column, i.e. the probability of
# label 1 given that the labels are 0/1.
positive_probs = pred_probs[:, 1]
auc_score = roc_auc_score(test_labels, positive_probs)
auc_score

0.8678235303516929

In [14]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels, columns the predicted labels.
confusion_matrix(y_true=test_labels, y_pred=pred_labels_tf_l2)

array([[  32,  106],
       [  12, 1802]], dtype=int64)

## Oversampling과 Undersampling 적용해 보기

In [15]:
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

오버샘플링

In [16]:
# Over-sample class 0 (the minority class) up to 4000 samples with
# Borderline-SMOTE while leaving class 1 at its current size.
# The class-1 target is read from the training labels instead of being
# hard-coded, so the cell keeps working if the upstream split changes.
class_counts = Counter(train_labels)
over_strategy = {0: 4000, 1: class_counts[1]}

# A fixed seed makes the synthetic samples, and every metric computed from the
# resampled data, repeatable across runs.
smt1 = BorderlineSMOTE(sampling_strategy=over_strategy, random_state=42)
X_train1, y_train1 = smt1.fit_resample(tf_train_features, train_labels)

# Show the class distribution after over-sampling.
counter = Counter(y_train1)
print(counter)

Counter({1: 16076, 0: 4000})


언더샘플링

In [17]:
# Randomly under-sample the over-sampled training data so that class 0 has
# 4000 samples and class 1 has 8000 samples.
under_strategy = {0: 4000, 1: 8000}

# A fixed seed keeps the retained subset identical on every run, so the
# downstream metrics are reproducible.
undersample = RandomUnderSampler(sampling_strategy=under_strategy, random_state=42)
X_train2, y_train2 = undersample.fit_resample(X_train1, y_train1)

# Show the class distribution after under-sampling.
counter = Counter(y_train2)
print(counter)

Counter({1: 8000, 0: 4000})


In [18]:
# Same model configuration as the baseline, to be trained on the resampled data.
lr_resample = LogisticRegression(
    C=0.1,
    penalty='l2',
    solver='saga',
    max_iter=10000,
)

In [19]:
# Train on the over- then under-sampled training set.
lr_resample.fit(X=X_train2, y=y_train2)

LogisticRegression(C=0.1, max_iter=10000, solver='saga')

In [20]:
# Predict on the original test features; the test set itself is never resampled.
pred_labels_resample = lr_resample.predict(X=tf_test_features)

In [21]:
# Per-class precision / recall / F1 of the model trained on resampled data.
report_resample = classification_report(test_labels, pred_labels_resample)
print(report_resample)

              precision    recall  f1-score   support

           0       0.39      0.57      0.46       138
           1       0.97      0.93      0.95      1814

    accuracy                           0.91      1952
   macro avg       0.68      0.75      0.71      1952
weighted avg       0.93      0.91      0.91      1952



In [22]:
# ROC AUC of the resampled model, scored on the second probability column
# (the probability of label 1 given that the labels are 0/1).
pred_probs_resample = lr_resample.predict_proba(tf_test_features)
positive_probs_resample = pred_probs_resample[:, 1]
auc_score2 = roc_auc_score(test_labels, positive_probs_resample)
auc_score2

0.8504266334308039

In [23]:
# Rows are the true labels, columns the predicted labels.
confusion_matrix(y_true=test_labels, y_pred=pred_labels_resample)

array([[  78,   60],
       [ 122, 1692]], dtype=int64)