In [57]:
# 1. 데이터 로드
import nltk
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# NLTK 리소스 다운로드
nltk.download('twitter_samples')
nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import twitter_samples
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [58]:
# 긍정 및 부정 트윗 로드
pos_tweets = twitter_samples.strings('positive_tweets.json')[:1000]
neg_tweets = twitter_samples.strings('negative_tweets.json')[:1000]

print("긍정 트윗 개수:", len(pos_tweets))
print("부정 트윗 개수:", len(neg_tweets))
print("\n긍정 트윗 예시:\n", pos_tweets[0])
print("\n부정 트윗 예시:\n", neg_tweets[0])

긍정 트윗 개수: 1000
부정 트윗 개수: 1000

긍정 트윗 예시:
 #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

부정 트윗 예시:
 hopeless for tmr :(


In [59]:
# 2. 텍스트 전처리
def preprocess_text(text):
    # 소문자화
    text = text.lower()

    # URL 제거
    text = re.sub(r'http\S+', '', text)

    # 특수문자, 숫자 제거
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # 연속된 공백 제거
    text = re.sub(r'\s+', ' ', text).strip()

    # 축약어 확장
    text = text.replace("n't", " not")
    text = text.replace("'m", " am")
    text = text.replace("'s", " is")
    text = text.replace("'re", " are")
    text = text.replace("'ll", " will")
    text = text.replace("wanna", "want to")
    text = text.replace("gonna", "going to")

    return text

# 전처리 적용
pos_cleaned = [preprocess_text(tweet) for tweet in pos_tweets]
neg_cleaned = [preprocess_text(tweet) for tweet in neg_tweets]

print("원본 트윗:\n", pos_tweets[0])
print("\n전처리된 트윗:\n", pos_cleaned[0])

원본 트윗:
 #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

전처리된 트윗:
 followfriday franceinte pkuchly milipolparis for being top engaged members in my community this week


In [60]:
# 3. 토큰화
# 샘플 5개 문장 토큰화
sample_sentences = pos_cleaned[:5]

# 문장 토큰화
sent_tokens = [sent_tokenize(sent) for sent in sample_sentences]

# 단어 토큰화
word_tokens = [word_tokenize(sent[0]) for sent in sent_tokens]

print("문장 토큰화 결과:")
for i, tokens in enumerate(sent_tokens, 1):
    print(f"{i}. {tokens}")

print("\n단어 토큰화 결과 (첫 10개 토큰):")
for i, tokens in enumerate(word_tokens, 1):
    print(f"{i}. {tokens[:10]}")

문장 토큰화 결과:
1. ['followfriday franceinte pkuchly milipolparis for being top engaged members in my community this week']
2. ['lambja hey james how odd please call our contact centre on and we will be able to assist you many thanks']
3. ['despiteofficial we had a listen last night as you bleed is an amazing track when are you in scotland']
4. ['sides congrats']
5. ['yeaaaah yippppy my accnt verified rqst has succeed got a blue tick mark on my fb profile in days']

단어 토큰화 결과 (첫 10개 토큰):
1. ['followfriday', 'franceinte', 'pkuchly', 'milipolparis', 'for', 'being', 'top', 'engaged', 'members', 'in']
2. ['lambja', 'hey', 'james', 'how', 'odd', 'please', 'call', 'our', 'contact', 'centre']
3. ['despiteofficial', 'we', 'had', 'a', 'listen', 'last', 'night', 'as', 'you', 'bleed']
4. ['sides', 'congrats']
5. ['yeaaaah', 'yippppy', 'my', 'accnt', 'verified', 'rqst', 'has', 'succeed', 'got', 'a']


In [61]:
# 4. 불용어 제거
# NLTK 영어 불용어 로드
stop_words = set(stopwords.words('english'))

# 추가 불용어 정의
additional_stopwords = {
    'rt', 'via', 'omg', 'lol', 'wow',
    'wanna', 'gonna', 'gotta', 'ain\'t', 'y\'all', 'im', 'u', 'jnlazts', 'amp'}

stop_words.update(additional_stopwords)

# 불용어 제거 함수
def remove_stopwords(tokens):
    return [token for token in tokens if token not in stop_words]

# 불용어 제거 적용
pos_tokens = [remove_stopwords(word_tokenize(tweet)) for tweet in pos_cleaned]
neg_tokens = [remove_stopwords(word_tokenize(tweet)) for tweet in neg_cleaned]

print("불용어 제거 전 (첫 번째 트윗):", word_tokenize(pos_cleaned[0]))
print("불용어 제거 후 (첫 번째 트윗):", pos_tokens[0])

불용어 제거 전 (첫 번째 트윗): ['followfriday', 'franceinte', 'pkuchly', 'milipolparis', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week']
불용어 제거 후 (첫 번째 트윗): ['followfriday', 'franceinte', 'pkuchly', 'milipolparis', 'top', 'engaged', 'members', 'community', 'week']


In [62]:
# 5. 단어 사전 구축 (Vocabulary)
from collections import Counter

# 전체 토큰 병합
all_tokens = [token for tokens in pos_tokens + neg_tokens for token in tokens]

# 단어 빈도수 계산
word_freq = Counter(all_tokens)

# 상위 5,000개 단어 선택
vocab = {word: idx for idx, (word, _) in enumerate(word_freq.most_common(5000), 1)}

# 결과 출력
print("단어 사전 크기:", len(vocab))
print("\n상위 20개 단어와 빈도:")
for word, freq in word_freq.most_common(20):
    print(f"{word}: {freq}")

단어 사전 크기: 5000

상위 20개 단어와 빈도:
follow: 127
like: 82
love: 76
want: 74
thanks: 72
dont: 72
cant: 72
back: 70
good: 67
get: 62
time: 60
know: 60
day: 58
hi: 53
one: 50
see: 50
going: 48
miss: 48
thank: 47
lt: 46


In [63]:
# 6. 정수 인코딩과 패딩
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 정수 인코딩 함수
def encode_tokens(tokens):
    return [vocab.get(token, 0) for token in tokens]

# 정수 인코딩 적용
pos_encoded = [encode_tokens(tokens) for tokens in pos_tokens]
neg_encoded = [encode_tokens(tokens) for tokens in neg_tokens]

# 패딩
max_len = 50
pos_padded = pad_sequences(pos_encoded, maxlen=max_len, padding='post')
neg_padded = pad_sequences(neg_encoded, maxlen=max_len, padding='post')

print("패딩 전 첫 번째 트윗:", pos_encoded[0])
print("패딩 후 첫 번째 트윗:", pos_padded[0])

패딩 전 첫 번째 트윗: [63, 1267, 1268, 1269, 44, 294, 295, 51, 33]
패딩 후 첫 번째 트윗: [  63 1267 1268 1269   44  294  295   51   33    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]


In [64]:
# 7. 벡터화
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# 원본 텍스트 재결합
pos_texts = [' '.join(tokens) for tokens in pos_tokens]
neg_texts = [' '.join(tokens) for tokens in neg_tokens]

# Bag-of-Words
bow_vectorizer = CountVectorizer(max_features=1000)
bow_matrix = bow_vectorizer.fit_transform(pos_texts + neg_texts)

# TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(pos_texts + neg_texts)

print("Bag-of-Words 희소 행렬 형태:", bow_matrix.shape)
print("TF-IDF 희소 행렬 형태:", tfidf_matrix.shape)

Bag-of-Words 희소 행렬 형태: (2000, 1000)
TF-IDF 희소 행렬 형태: (2000, 1000)


In [65]:
# 8, 9. Bigram 모델 실습
from collections import Counter

# 긍정/부정 토큰 병합
pos_all_tokens = [token for tokens in pos_tokens for token in tokens]
neg_all_tokens = [token for tokens in neg_tokens for token in tokens]

# Unigram, Bigram 카운트
unigram_pos = Counter(pos_all_tokens)
bigram_pos = Counter(zip(pos_all_tokens[:-1], pos_all_tokens[1:]))
unigram_neg = Counter(neg_all_tokens)
bigram_neg = Counter(zip(neg_all_tokens[:-1], neg_all_tokens[1:]))

# Laplace 스무딩 적용
alpha = 1
V_pos = len(unigram_pos)
V_neg = len(unigram_neg)

# 예시 조건부 확률 계산
def calculate_smoothed_prob(bigram, unigram, prev_word, next_word, V):
    return (bigram[(prev_word, next_word)] + alpha) / (unigram[prev_word] + alpha * V)

# 상위 10개 Bigram 비교
print("Top 10 Positive Bigrams:", bigram_pos.most_common(10))
print("Top 10 Negative Bigrams:", bigram_neg.most_common(10))

Top 10 Positive Bigrams: [(('follow', 'back'), 42), (('follow', 'follow'), 37), (('community', 'week'), 22), (('hi', 'bam'), 13), (('bam', 'barsandmelody'), 13), (('barsandmelody', 'follow'), 13), (('follow', 'bestfriend'), 13), (('bestfriend', 'horan'), 13), (('horan', 'loves'), 13), (('loves', 'lot'), 13)]
Top 10 Negative Bigrams: [(('want', 'go'), 11), (('dont', 'want'), 11), (('dont', 'know'), 11), (('uniteblue', 'tcot'), 8), (('feel', 'bad'), 6), (('goodbye', 'stage'), 6), (('cant', 'sleep'), 6), (('ice', 'cream'), 6), (('go', 'home'), 5), (('climatechange', 'cc'), 5)]
