# 대중 리뷰 데이터 감정분석

In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import nltk

In [2]:
# Load the pre-cleaned IMDB review CSV from the working directory.
df = pd.read_csv(filepath_or_buffer="cleaned_imdb_reviews.csv")

# Report the frame's dimensions, then preview its first five rows as the cell output.
print("데이터 크기:", df.shape)
df.head(n=5)

데이터 크기: (50000, 3)


Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,one review mention watch oz episod hook right ...
1,A wonderful little production. <br /><br />The...,positive,wonder littl product film techniqu unassum old...
2,I thought this was a wonderful way to spend ti...,positive,thought wonder way spend time hot summer weeke...
3,Basically there's a family where a little boy ...,negative,basic famili littl boy jake think zombi closet...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visual stun film...


In [None]:
# Encode the sentiment strings as integers (positive -> 1, negative -> 0);
# any value outside the mapping becomes NaN. Then print the class balance.
df["label"] = df["sentiment"].map(dict(positive=1, negative=0))
print("\n라벨 분포:", df["label"].value_counts(), sep="\n")


라벨 분포:
label
1    25000
0    25000
Name: count, dtype: int64


In [None]:
# Features and labels: drop rows where the cleaned text or the label is missing,
# so that `sentences` and `labels` stay aligned row-for-row.
# (The earlier direct assignments from `df` were dead stores -- both names were
# overwritten below before being read -- and have been removed.)
df_clean = (
    df[['cleaned_review', 'label']]
    .dropna(subset=['cleaned_review', 'label'])
    .reset_index(drop=True)
)
sentences = df_clean['cleaned_review'].astype(str).tolist()
labels = df_clean['label'].tolist()

# Tokenize each review into a list of lower-cased tokens.
# NOTE(review): word_tokenize presumably needs the NLTK tokenizer data (e.g. "punkt")
# to be installed already; nothing in this notebook downloads it -- TODO confirm.
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]

# Train a Word2Vec model on the tokenized reviews: 100-dimensional vectors,
# a context window of 5, and min_count=1 so that no token is discarded.
model_word2vec = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)

In [5]:
# 문장 임베딩
def sentence_embedding(sentence, model):
    """Return the mean word vector of a sentence.

    Args:
        sentence: Either a raw string, which is lower-cased and split with
            ``word_tokenize``, or an already-tokenized iterable of tokens,
            which is used as-is (no lower-casing, no re-tokenization). The
            second form lets callers reuse token lists they already built.
        model: Object exposing ``vector_size`` and a ``wv`` lookup that
            supports ``token in model.wv`` and ``model.wv[token]``.

    Returns:
        A float64 array of length ``model.vector_size`` holding the average of
        the vectors of all tokens found in ``model.wv``. If no token is found
        (including an empty sentence), an all-zero vector is returned.
    """
    if isinstance(sentence, str):
        tokens = word_tokenize(sentence.lower())
    else:
        tokens = sentence

    # Out-of-vocabulary tokens are skipped so they do not dilute the average.
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if not vectors:
        return np.zeros(model.vector_size)

    # Accumulate in float64, matching the dtype of the zero-vector fallback.
    return np.mean(vectors, axis=0, dtype=np.float64)

# Embed every review with the Word2Vec model trained above (one row per review).
embedding_word2vec = np.array(
    [sentence_embedding(review_text, model_word2vec) for review_text in sentences]
)

# Stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    embedding_word2vec,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Fit a 100-tree random forest on the training embeddings.
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100)
rf_classifier.fit(X_train, y_train)

# Score the classifier on the held-out split.
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Word2Vec 기반 대중리뷰 분류 정확도:", accuracy)

Word2Vec 기반 대중리뷰 분류 정확도: 0.8433


# 평론가 리뷰 데이터 감정분석

In [6]:
# Load the pre-cleaned Rotten Tomatoes critic review CSV from the working directory.
new_df = pd.read_csv(filepath_or_buffer="cleaned_rotten_tomatoes_critic_reviews.csv")

# Report the frame's dimensions, then preview its first five rows as the cell output.
print("데이터 크기:", new_df.shape)
new_df.head(n=5)

데이터 크기: (1064211, 3)


Unnamed: 0,review_type,review_content,cleaned_review
0,Fresh,A fantasy adventure that fuses Greek mythology...,fantasi adventur fuse greek mytholog contempor...
1,Fresh,"Uma Thurman as Medusa, the gorgon with a coiff...",uma thurman medusa gorgon coiffur writh snake ...
2,Fresh,With a top-notch cast and dazzling special eff...,top notch cast dazzl special effect tide teen ...
3,Fresh,Whether audiences will get behind The Lightnin...,whether audienc get behind lightn thief hard p...
4,Rotten,What's really lacking in The Lightning Thief i...,realli lack lightn thief genuin sens wonder th...


In [17]:
# Encode the review type as an integer (Fresh -> 1, Rotten -> 0);
# any value outside the mapping becomes NaN. Then print the class balance.
new_df["label"] = new_df["review_type"].map(dict(Fresh=1, Rotten=0))
print("\n라벨 분포:", new_df["label"].value_counts(), sep="\n")


라벨 분포:
label
1    681035
0    383176
Name: count, dtype: int64


In [16]:
# Keep only the three columns used downstream and discard any row that has a
# missing value in one of them; renumber the index afterwards.
new_df_clean = (
    new_df.loc[:, ['label', 'cleaned_review', 'review_content']]
    .dropna(how='any')
    .reset_index(drop=True)
)

# Features (cleaned text) and labels as plain Python lists, aligned row-for-row.
new_sentences = new_df_clean['cleaned_review'].astype(str).to_list()
new_labels = new_df_clean['label'].to_list()

# Tokenize each critic review into a list of lower-cased tokens.
new_tokenized_sentences = [word_tokenize(text.lower()) for text in new_sentences]

# Train a separate Word2Vec model on the critic reviews: 100-dimensional vectors,
# a context window of 5, and min_count=1 so that no token is discarded.
new_model_word2vec = Word2Vec(
    sentences=new_tokenized_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [12]:
# Embed every critic review with the Word2Vec model trained on critic reviews.
new_embedding_word2vec = np.array(
    [sentence_embedding(review_text, new_model_word2vec) for review_text in new_sentences]
)

# Stratified 80/20 train/test split with a fixed seed.
# Note: this rebinds X_train / X_test / y_train / y_test from the audience-review cell.
X_train, X_test, y_train, y_test = train_test_split(
    new_embedding_word2vec,
    new_labels,
    test_size=0.2,
    random_state=42,
    stratify=new_labels,
)

# Fit a random forest of 50 trees, each limited to depth 10.
new_rf_classifier = RandomForestClassifier(random_state=42, max_depth=10, n_estimators=50)
new_rf_classifier.fit(X_train, y_train)

# Score the classifier on the held-out split.
y_pred = new_rf_classifier.predict(X_test)
new_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Word2Vec 기반 평론가리뷰 분류 정확도:", new_accuracy)

Word2Vec 기반 평론가리뷰 분류 정확도: 0.741499859035805


# 대중 리뷰 학습 모델을 통한 평론가 리뷰 감정분석

In [18]:
# Embed the critic reviews with the Word2Vec model trained on the audience reviews,
# so the features come from the same model the audience classifier was fitted on.
new_embeddings = np.array([sentence_embedding(s, model_word2vec) for s in new_sentences])

# Predict sentiment with the classifier trained on the audience reviews (rf_classifier).
predictions = rf_classifier.predict(new_embeddings)

# new_sentences / new_labels were taken row-for-row from new_df_clean; fail loudly
# if the frame has since changed length and the rows no longer line up.
assert len(predictions) == len(new_labels) == len(new_df_clean), \
    "predictions, new_labels and new_df_clean must have the same number of rows"

# Store the results.
new_df_clean["predicted_sentiment"] = predictions
# Build the display label from the numeric `new_labels` list, not from the column itself.
# Re-mapping the column in place made this cell non-idempotent: on a second run the
# already-converted "Fresh"/"Rotten" strings are absent from {0, 1} and become NaN.
new_df_clean["label"] = pd.Series(new_labels, index=new_df_clean.index).map({0: "Rotten", 1: "Fresh"})
new_df_clean["predicted_sentiment_label"] = new_df_clean["predicted_sentiment"].map({0: "Negative", 1: "Positive"})

# Keep only the columns needed for the comparison and show a preview.
new_df_clean = new_df_clean[['label', 'predicted_sentiment_label', 'review_content']]
print(new_df_clean.head())

    label predicted_sentiment_label  \
0   Fresh                  Positive   
1   Fresh                  Positive   
2   Fresh                  Positive   
3   Fresh                  Positive   
4  Rotten                  Positive   

                                                                                                                                                                                                          review_content  
0                                       A fantasy adventure that fuses Greek mythology to contemporary American places and values. Anyone around 15 (give or take a couple of years) will thrill to the visual spectacle  
1                                                              Uma Thurman as Medusa, the gorgon with a coiffure of writhing snakes and stone-inducing hypnotic gaze is one of the highlights of this bewitching fantasy  
2                                                                                              With a top-no

In [21]:
# Trim surrounding whitespace from the column names.
new_df_clean.columns = new_df_clean.columns.str.strip()

# A row "matches" when Fresh pairs with Positive or Rotten pairs with Negative.
# Translating the critic label into the sentiment it should correspond to and
# comparing that with the prediction gives the same boolean column; any other
# (or missing) value on either side compares as False.
new_df_clean['match'] = (
    new_df_clean['label'].map({'Fresh': 'Positive', 'Rotten': 'Negative'})
    == new_df_clean['predicted_sentiment_label']
)
match_count = new_df_clean['match'].sum()
mismatch_count = (~new_df_clean['match']).sum()

print(f"일치 개수: {match_count}개 ({match_count / len(new_df_clean) * 100:.2f}%)")
print(f"불일치 개수: {mismatch_count}개 ({mismatch_count / len(new_df_clean) * 100:.2f}%)\n")

# Rows where the critic label and the predicted sentiment disagree.
mismatched_df = new_df_clean.loc[
    ~new_df_clean['match'],
    ['match', 'label', 'predicted_sentiment_label', 'review_content'],
]

# Cap displayed column width at 100 characters (global pandas option), then preview.
pd.set_option('display.max_colwidth', 100)
print(mismatched_df.head(5))

# Optional: export the mismatches to CSV.
# mismatched_df.to_csv("rotten_tomatoes_mismatched_reviews.csv", index=False)

일치 개수: 732336개 (68.82%)
불일치 개수: 331762개 (31.18%)

    match   label predicted_sentiment_label  \
4   False  Rotten                  Positive   
5   False  Rotten                  Positive   
12  False  Rotten                  Positive   
14  False  Rotten                  Positive   
19  False  Rotten                  Positive   

                                                                                         review_content  
4   What's really lacking in The Lightning Thief is a genuine sense of wonder, the same thing that b...  
5              It's more a list of ingredients than a movie-magic potion to enjoy from start to finish.  
12  Chris Columbus returns to his comfort zone for this mirthless, episodic fantasy saga based on th...  
14                              This cast is simply too generic. None of the young thespians stick out.  
19  When the movie slows down to catch its breath, there's very little of the heart and soul -- and ...  
