# 감성지수 만들기
- 긍정단어, 부정단어 점수화 

# 데이터 가져오기

In [1]:
import pandas as pd 
# Load the labeled reviews; the file is tab-separated.
df = pd.read_csv("data/labeledTrainData.tsv", delimiter="\t")
# Preview the first five rows.
df.head(5)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


# 데이터 전처리

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import re
import numpy as np

# Text preprocessing helper
def clean_text(text):
    """Normalize a raw review string for bag-of-words vectorization.

    HTML tags are replaced with a single space rather than deleted, so words
    separated only by markup (e.g. ``"good<br />movie"``) are not fused into
    one token. Every character that is not an ASCII letter or whitespace is
    then dropped, and the result is lower-cased.

    Args:
        text: Raw review text, possibly containing HTML tags.

    Returns:
        The cleaned, lower-cased text.
    """
    # Tag -> space keeps the words on either side of the markup apart.
    text = re.sub(r'<.*?>', ' ', text)
    # Drop digits, punctuation and any other non-letter symbols.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.lower()

# Clean every review with the preprocessing function
df['clean_review'] = df['review'].map(clean_text)

# Bag-of-words vectorization: vocabulary capped at 5000 terms, English stop words removed
vectorizer = CountVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(df['clean_review'])
X = X.toarray()  # convert the sparse count matrix to a dense array
y = df['sentiment'].to_numpy()

array([1, 1, 0, ..., 0, 0, 1], shape=(25000,))

# 데이터셋 분리 

In [5]:
# Hold out 20% of the samples for validation; the fixed seed keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 데이터셋 클래스 정의
- 이 부분이 가장 어려운 부분이 될 것
- 코드 리뷰 시 Dataset 클래스를 어떤 형태로 구현했는지 꼭 공부할 것

In [6]:
class ReviewDataset(Dataset):
    """Map-style dataset pairing feature rows with their labels.

    Args:
        X: Array-like of per-sample feature rows; stored as a float32 tensor.
        y: Array-like of per-sample labels; stored as a float32 tensor with a
            trailing dimension of size 1 added (shape ``(n, 1)`` for 1-D input).

    Raises:
        ValueError: If ``X`` and ``y`` do not hold the same number of samples.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        # unsqueeze(1) turns the label vector into a column so each label has
        # the same shape as a single-output model prediction.
        self.y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)
        # Fail fast: a mismatch would otherwise show up later as an IndexError
        # mid-iteration, or silently ignore the extra rows of X.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(self.X)} and {len(self.y)}"
            )

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        return self.X[idx], self.y[idx]

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

In [9]:
# Convert the vectorized text splits to tensors so they can be fed to PyTorch
train_dataset, val_dataset = ReviewDataset(X_train, y_train), ReviewDataset(X_val, y_val)

# Batch iterators (this step is worth comparing with the transformers library);
# only the training batches are shuffled
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)

# 모델 만들기

In [11]:
import torch.nn as nn

# Minimal binary classifier (logistic regression)
class SentimentClassifier(nn.Module):
    """Single linear layer followed by a sigmoid.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        """Return the sigmoid of the linear layer's output for ``x``."""
        logits = self.linear(x)
        return logits.sigmoid()

# Instantiate the model with one input per feature column of X_train
input_dim = X_train.shape[-1]
model = SentimentClassifier(input_dim=input_dim)

# Loss function and optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# 모델 학습

In [12]:
# Run on the GPU when one is available, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = model.to(device)

# Training loop
epochs = 5
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()  # clear gradients left over from the previous step
        batch_loss = criterion(model(batch_x), batch_y)
        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()

    # total_loss is the sum of the per-batch losses for this epoch, not an average
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

Epoch 1/5, Loss: 161.9813
Epoch 2/5, Loss: 117.9460
Epoch 3/5, Loss: 101.4444
Epoch 4/5, Loss: 91.9213
Epoch 5/5, Loss: 85.3685


# 모델 평가

In [14]:
# Measure accuracy on the validation batches
model.eval()
correct, total = 0, 0

with torch.no_grad():  # gradients are not needed for evaluation
    for batch_x, batch_y in val_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        probs = model(batch_x)
        # Threshold the model outputs at 0.5 to get hard 0/1 predictions
        hard_preds = (probs >= 0.5).float()
        correct += hard_preds.eq(batch_y).sum().item()
        total += batch_y.size(0)

print(correct / total)

0.8816


# 각 단어별 가중치 확인

In [19]:
# Detach the linear layer's weights, move them to the CPU, convert to NumPy, take row 0
weights = model.linear.weight.detach().cpu().numpy()[0]
vocab = vectorizer.get_feature_names_out()
# Pair each vocabulary term with its weight, largest weight first
word_weights = sorted(zip(vocab, weights), key=lambda pair: pair[1], reverse=True)

In [22]:
# Positive words: the three entries with the largest weights
for pair in word_weights[:3]:
    print(*pair)

excellent 0.5114949
favorite 0.48127627
superb 0.4601699


In [23]:
# Negative words: the last three entries of word_weights
for pair in word_weights[-3:]:
    print(*pair)

awful -0.60033625
worst -0.67975783
waste -0.68052584
