<a href="https://colab.research.google.com/github/sleepyMS/Ai-Development-Collection/blob/main/Sentiment_analysis_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 필요한 라이브러리 및 설정

In [None]:
import numpy as np
import pandas as pd
import tweepy
from transformers import BertTokenizer, BertForSequenceClassification
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
import torch
import datetime
import requests
import json

# Twitter API setup (OAuth 1.0a, user context).
# Credentials are read from environment variables so that real secrets never
# have to be written into the notebook; the original placeholder strings are
# kept as fallbacks, so behaviour is unchanged when the variables are unset.
import os

auth = tweepy.OAuthHandler(
    os.environ.get("TWITTER_API_KEY", "API_KEY"),
    os.environ.get("TWITTER_API_SECRET_KEY", "API_SECRET_KEY"),
)
auth.set_access_token(
    os.environ.get("TWITTER_ACCESS_TOKEN", "ACCESS_TOKEN"),
    os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "ACCESS_TOKEN_SECRET"),
)
api = tweepy.API(auth)


## 실시간 데이터 수집

In [None]:
# Binance API를 통한 실시간 암호화폐 데이터 수집
def get_crypto_data(symbol, interval='1m', limit=100):
    """Fetch recent candlesticks (klines) for ``symbol`` from the Binance REST API.

    Args:
        symbol: Trading pair in Binance notation, e.g. ``'BTCUSDT'``.
        interval: Value sent as the ``interval`` query parameter.
        limit: Value sent as the ``limit`` query parameter.

    Returns:
        A float ``DataFrame`` with the columns ``open``, ``high``, ``low``,
        ``close`` and ``volume``, indexed by ``timestamp`` (parsed from epoch
        milliseconds).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    kline_columns = [
        'timestamp', 'open', 'high', 'low', 'close', 'volume', 'close_time',
        'quote_asset_volume', 'trades', 'taker_buy_base_asset_volume',
        'taker_buy_quote_asset_volume', 'ignore',
    ]
    # Let requests build (and URL-encode) the query string, and never wait
    # forever on a stalled connection.
    response = requests.get(
        'https://api.binance.com/api/v3/klines',
        params={'symbol': symbol, 'interval': interval, 'limit': limit},
        timeout=10,
    )
    # Fail here on an error status instead of feeding an error payload into
    # the DataFrame constructor below.
    response.raise_for_status()

    df = pd.DataFrame(response.json(), columns=kline_columns)
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
    df = df.set_index('timestamp')
    return df[['open', 'high', 'low', 'close', 'volume']].astype(float)

# 트위터에서 실시간 데이터 수집
def get_tweets(keyword, count=100):
    """Return the ``text`` of tweets found by searching for ``keyword``.

    The search goes through the module-level ``api`` client, is restricted to
    ``lang="en"``, and iterates ``Cursor.items(count)``.

    Args:
        keyword: Query string passed as ``q`` to the search endpoint.
        count: Item limit handed to the cursor.

    Returns:
        A list with the ``text`` attribute of every tweet yielded.
    """
    cursor = tweepy.Cursor(api.search_tweets, q=keyword, lang="en")
    texts = []
    for status in cursor.items(count):
        texts.append(status.text)
    return texts

# Example run: collect tweets that mention 'Bitcoin' and BTCUSDT candles from Binance
tweets = get_tweets(keyword='Bitcoin', count=100)
crypto_data = get_crypto_data(symbol='BTCUSDT')

## 데이터 전처리

In [None]:
import re

# `from nltk.corpus import stopwords` does not bind the name `nltk`, so the
# package itself must be imported for the `nltk.download` call below.
import nltk
from nltk.corpus import stopwords

# Make sure the stop-word corpus is available before it is read.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# 텍스트 전처리 함수 (트윗)
def preprocess_text(text):
    """Clean one tweet for sentiment analysis.

    Steps, in order: remove URLs (``http`` followed by non-space characters),
    remove every character that is neither an ASCII letter nor whitespace,
    lower-case the result, and drop tokens contained in the module-level
    ``stop_words`` set.

    Args:
        text: Raw tweet text.

    Returns:
        The remaining tokens joined by single spaces.
    """
    without_urls = re.sub(r'http\S+', '', text)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_urls)
    kept_tokens = []
    for token in letters_only.lower().split():
        if token not in stop_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Clean every collected tweet
preprocessed_tweets = list(map(preprocess_text, tweets))

# Normalise the closing prices to the [0, 1] range for the time-series model;
# the scaler expects a 2-D column, hence the (-1, 1) reshape.
scaler = MinMaxScaler(feature_range=(0, 1))
crypto_scaled = scaler.fit_transform(np.reshape(crypto_data['close'].to_numpy(), (-1, 1)))


## 감정 분석 모델 (BERT)

In [None]:
# The tokenizer must be loaded from the same checkpoint as the model: token ids
# are indices into that checkpoint's vocabulary, so ids produced with the
# 'bert-base-uncased' vocabulary would select the wrong embeddings in this
# multilingual model.
tokenizer = BertTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
model = BertForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

# BERT 감정 분석 수행
def get_sentiment_scores(texts):
    """Classify texts with the module-level BERT model.

    Args:
        texts: A string or a list of strings to score.

    Returns:
        A NumPy integer array with one entry per input text, holding the
        index of the largest logit. An empty list yields an empty array.
    """
    # An empty batch has nothing to tokenize; answer it directly.
    if not isinstance(texts, str) and len(texts) == 0:
        return np.empty(0, dtype=np.int64)

    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_classes = torch.argmax(outputs.logits, dim=-1)
    return predicted_classes.cpu().numpy()

# Score the cleaned tweets with the BERT sentiment classifier
sentiment_scores = get_sentiment_scores(texts=preprocessed_tweets)

## LSTM 모델 설계

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# LSTM 모델 구축
def build_lstm_model(time_step=60, n_features=1):
    """Build and compile a two-layer LSTM regressor.

    Args:
        time_step: Length of each input window. The default matches the
            default window length of ``create_dataset``.
        n_features: Number of values per time step.

    Returns:
        A compiled Keras ``Sequential`` model (Adam optimizer, MSE loss) that
        maps a ``(time_step, n_features)`` window to a single value.
    """
    model = Sequential()
    # input_shape is (timesteps, features). Using the scaled array's column
    # count as the time dimension would declare 1-step windows, while the
    # training windows are `time_step` long.
    model.add(LSTM(units=50, return_sequences=True, input_shape=(time_step, n_features)))
    model.add(Dropout(0.2))
    model.add(LSTM(units=50, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(units=1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# 학습 데이터를 위한 준비
def create_dataset(data, time_step=60):
    """Slice a series into sliding windows and their next-step targets.

    Args:
        data: 2-D array-like of shape ``(n, k)``; only column 0 is used.
        time_step: Number of consecutive values in each input window.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(n - time_step, time_step)`` and
        ``y[i]`` is the value that immediately follows window ``X[i]``.
        If ``data`` has no more than ``time_step`` rows, both arrays are empty
        and ``X`` still has ``time_step`` columns.
    """
    data = np.asarray(data)
    # Every start index i with i + time_step <= len(data) - 1 has a target,
    # so there are len(data) - time_step windows (the last one included).
    n_samples = len(data) - time_step
    if n_samples <= 0:
        # Keep X two-dimensional so callers can still reshape it.
        return (
            np.empty((0, time_step), dtype=data.dtype),
            np.empty((0,), dtype=data.dtype),
        )

    X = np.stack([data[i:i + time_step, 0] for i in range(n_samples)])
    y = data[time_step:, 0].copy()
    return X, y

# Train the LSTM: build the windows, append a trailing feature axis, then fit
X_train, y_train = create_dataset(crypto_scaled)
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

lstm_model = build_lstm_model()
lstm_model.fit(x=X_train, y=y_train, epochs=10, batch_size=32)


## 하이브리드 모델: LSTM과 BERT 결합

In [None]:
from sklearn.linear_model import LinearRegression

# 트랜스포머 감정 점수와 LSTM 예측 값 결합
def hybrid_model(sentiment_scores, lstm_predictions, targets=None):
    """Fit a linear regression on (sentiment score, LSTM prediction) pairs.

    Args:
        sentiment_scores: One sentiment value per sample.
        lstm_predictions: One LSTM prediction per sample.
        targets: Regression targets, one per sample. When omitted, the
            module-level ``y_train`` is used.

    Returns:
        The fitted ``LinearRegression`` model.

    Raises:
        ValueError: If the three inputs do not have the same number of samples.
    """
    if targets is None:
        targets = y_train

    sentiment_col = np.asarray(sentiment_scores).reshape(-1, 1)
    prediction_col = np.asarray(lstm_predictions).reshape(-1, 1)
    targets = np.asarray(targets)

    # Check up front so a mismatch is reported with the actual lengths.
    if not (len(sentiment_col) == len(prediction_col) == len(targets)):
        raise ValueError(
            "hybrid_model needs equally long inputs, got "
            f"{len(sentiment_col)} sentiment scores, "
            f"{len(prediction_col)} LSTM predictions and "
            f"{len(targets)} targets"
        )

    combined_input = np.hstack((sentiment_col, prediction_col))

    # A plain linear regression learns how to weight the two signals.
    reg_model = LinearRegression()
    reg_model.fit(combined_input, targets)
    return reg_model

# Predictions of the trained LSTM on the training windows
lstm_predictions = lstm_model.predict(X_train)

# Fit the hybrid regressor, using as many sentiment scores as there are predictions
hybrid_reg_model = hybrid_model(sentiment_scores[:lstm_predictions.shape[0]], lstm_predictions)


## 성능 평가 및 급등락 확률 계산

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# RMSE 및 R2 평가
def evaluate_model(y_true, y_pred):
    """Compute RMSE and R2 for a set of predictions.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values.

    Returns:
        A ``(rmse, r2)`` tuple, where ``rmse`` is the square root of the mean
        squared error.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse), r2_score(y_true, y_pred)

# Score the LSTM predictions against the training targets (RMSE and R2)
lstm_rmse, lstm_r2 = evaluate_model(y_true=y_train, y_pred=lstm_predictions)
print(f'LSTM RMSE: {lstm_rmse}, R2: {lstm_r2}')

# 급등락 확률 계산
def calculate_volatility_probability(predictions, threshold=0.2):
    """Return the fraction of predictions whose z-score exceeds ``threshold``.

    Each prediction is standardised with the mean and standard deviation of
    the whole ``predictions`` array.

    Args:
        predictions: Array-like of predicted values.
        threshold: Z-score a prediction must exceed to be counted.

    Returns:
        A value in ``[0, 1]``. Empty input yields ``0.0``. If all predictions
        are equal, every z-score is treated as 0.

    NOTE(review): only upward deviations are counted (``z > threshold``). If
    sharp drops should count as well, compare ``np.abs(z_scores)`` instead —
    confirm the intent.
    """
    predictions = np.asarray(predictions, dtype=float)
    if predictions.size == 0:
        # Nothing to evaluate; avoid the NaN and warnings from empty means.
        return 0.0

    spread = np.std(predictions)
    if spread == 0:
        # No spread means no deviation from the mean; avoid dividing by zero.
        z_scores = np.zeros_like(predictions)
    else:
        z_scores = (predictions - np.mean(predictions)) / spread
    return np.mean(z_scores > threshold)

# Share of LSTM predictions whose z-score exceeds the default threshold
volatility_prob = calculate_volatility_probability(predictions=lstm_predictions)
print(f'급등락 확률: {volatility_prob}')
