# 최종 코드

In [None]:

# Parse comments one sentence at a time
import ast
import pandas as pd
from collections import OrderedDict
from transformers import pipeline, TFAutoModelForSequenceClassification, AutoTokenizer

def expand_comments_post(df, stock_name):
    """Explode each post's stringified comment list into one row per sentence.

    Every cell of ``df['Comments']`` is expected to hold the text form of a
    Python list of comment strings (e.g. ``"['Nice. Buy now', 'meh']"``).
    Each comment is split on ``'. '`` and every resulting piece becomes one
    output row that carries the ``'Created Time'`` of the post it came from.

    Args:
        df: DataFrame with a ``'Comments'`` column (stringified lists) and a
            ``'Created Time'`` column parseable by ``pd.to_datetime``.
        stock_name: Value written to the ``'stock'`` column of every row.

    Returns:
        DataFrame with columns ``'date'``, ``'stock'`` and ``'comment'`` and a
        fresh 0..n-1 index, one row per sentence.

    Raises:
        ValueError, SyntaxError: If a non-blank cell is not a valid Python
            literal (propagated from ``ast.literal_eval``).
    """

    def _leaf_strings(node):
        # Yield every string found inside (possibly nested) lists/tuples.
        # Non-string leaves are ignored because they cannot be split.
        if isinstance(node, str):
            yield node
        elif isinstance(node, (list, tuple)):
            for child in node:
                yield from _leaf_strings(child)

    def _sentences(raw):
        # Missing cells (NaN) and blank strings contribute no comments.
        if not isinstance(raw, str) or not raw.strip():
            return []
        try:
            # Parse the literal as written. Stripping the brackets first
            # turned a one-comment list into a bare string (which was then
            # iterated character by character) and made "[]" unparseable.
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            # Fall back to the previous behaviour for inputs that only parse
            # once the outer brackets are removed.
            parsed = ast.literal_eval(raw.strip('[]'))
        return [
            piece
            for text in _leaf_strings(parsed)
            for piece in text.split('. ')
        ]

    expanded_comments = []
    row_positions = []  # positional row number of the source post, per sentence
    for position, raw in enumerate(df['Comments']):
        sentences = _sentences(raw)
        expanded_comments.extend(sentences)
        row_positions.extend([position] * len(sentences))

    # Positional lookup keeps this correct even when df has a non-unique index.
    created = df['Created Time'].iloc[row_positions]

    expanded_df = pd.DataFrame({
        'date': pd.to_datetime(created).reset_index(drop=True),
        'stock': stock_name,
        'comment': expanded_comments
    })

    return expanded_df

# End-to-end Reddit functions (load CSV -> split comments -> sentiment scoring)
def reddit_posts(
    stock_name,
    data_dir='/Users/jiheelee/Desktop/2024-1/Stock-info-archive/data/reddit_post_name',
    batch_size=5,
):
    """Run sentiment analysis on the comment sentences of one stock's posts.

    Reads ``{data_dir}/reddit_posts_{stock_name}.csv``, expands the comments
    into sentences with ``expand_comments_post``, drops duplicate sentences
    (keeping the first occurrence) and classifies each remaining sentence with
    the ``distilbert-base-uncased-finetuned-sst-2-english`` model through a
    TensorFlow ``sentiment-analysis`` pipeline.

    Args:
        stock_name: Ticker used in the CSV file name and in the ``'stock'``
            column of the result.
        data_dir: Directory containing the CSV. Defaults to the location the
            notebook was originally run from.
        batch_size: Number of sentences handed to the pipeline per call.

    Returns:
        DataFrame with columns ``'date'``, ``'stock'``, ``'sent'`` (predicted
        label) and ``'score'``, one row per unique sentence. The index is the
        one left by ``drop_duplicates`` (not reset).
    """
    file_path = f'{data_dir}/reddit_posts_{stock_name}.csv'
    data = pd.read_csv(file_path)
    expanded_data = expand_comments_post(data, stock_name)

    # Keep the first occurrence of every sentence; the list below is therefore
    # row-aligned with final_data.
    final_data = expanded_data.drop_duplicates(subset=['comment'])
    unique_comments = final_data['comment'].tolist()

    # Load the model and tokenizer.
    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

    # Build the TensorFlow-based sentiment-analysis pipeline.
    sentiment_pipeline = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer, framework='tf')

    # Per-sentence outputs, in the same order as unique_comments.
    scores = []
    sent = []

    # Iterate over the de-duplicated sentences only. Ranging over the full
    # (duplicated) list sent empty trailing batches to the pipeline.
    for start in range(0, len(unique_comments), batch_size):
        batch = unique_comments[start:start + batch_size]
        for prediction in sentiment_pipeline(batch):
            scores.append(prediction['score'])
            sent.append(prediction['label'])

    result = pd.DataFrame({'date': final_data['date'], 'stock': stock_name, 'sent': sent, 'score': scores})

    return result

def expand_comments_key(df, stock_name):
    """Explode keyword-search comment lists into one row per sentence.

    Every cell of ``df['Comments']`` is expected to hold the text form of a
    Python list of comment strings. Each comment is split on ``'. '`` and
    every resulting piece becomes one output row that carries the
    ``'Created Time'`` and ``'Query'`` of the post it came from.

    Args:
        df: DataFrame with ``'Comments'`` (stringified lists),
            ``'Created Time'`` (parseable by ``pd.to_datetime``) and
            ``'Query'`` columns.
        stock_name: Value written to the ``'stock'`` column of every row.

    Returns:
        DataFrame with columns ``'date'``, ``'stock'``, ``'query'`` and
        ``'comment'`` and a fresh 0..n-1 index, one row per sentence.

    Raises:
        ValueError, SyntaxError: If a non-blank cell is not a valid Python
            literal (propagated from ``ast.literal_eval``).
    """

    def _leaf_strings(node):
        # Yield every string found inside (possibly nested) lists/tuples.
        # Non-string leaves are ignored because they cannot be split.
        if isinstance(node, str):
            yield node
        elif isinstance(node, (list, tuple)):
            for child in node:
                yield from _leaf_strings(child)

    def _sentences(raw):
        # Missing cells (NaN) and blank strings contribute no comments.
        if not isinstance(raw, str) or not raw.strip():
            return []
        try:
            # Parse the literal as written. Stripping the brackets first
            # turned a one-comment list into a bare string (which was then
            # iterated character by character) and made "[]" unparseable.
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            # Fall back to the previous behaviour for inputs that only parse
            # once the outer brackets are removed.
            parsed = ast.literal_eval(raw.strip('[]'))
        return [
            piece
            for text in _leaf_strings(parsed)
            for piece in text.split('. ')
        ]

    expanded_comments = []
    row_positions = []  # positional row number of the source post, per sentence
    for position, raw in enumerate(df['Comments']):
        sentences = _sentences(raw)
        expanded_comments.extend(sentences)
        row_positions.extend([position] * len(sentences))

    # Positional lookup keeps this correct even when df has a non-unique index.
    created = df['Created Time'].iloc[row_positions]
    queries = df['Query'].iloc[row_positions]

    expanded_df = pd.DataFrame({
        'date': pd.to_datetime(created).reset_index(drop=True),
        'stock': stock_name,
        'query': queries.reset_index(drop=True),
        'comment': expanded_comments
    })

    return expanded_df


def reddit_keys(
    stock_name,
    data_dir='/Users/jiheelee/Desktop/2024-1/Stock-info-archive/data/reddit_post_keyword',
    batch_size=5,
):
    """Run sentiment analysis on comment sentences from keyword-search posts.

    Reads ``{data_dir}/reddit_posts_{stock_name}.csv``, expands the comments
    into sentences with ``expand_comments_key``, truncates every sentence to
    its first 350 characters, drops duplicate sentences (keeping the first
    occurrence) and classifies each remaining sentence with the
    ``distilbert-base-uncased-finetuned-sst-2-english`` model through a
    TensorFlow ``sentiment-analysis`` pipeline.

    Args:
        stock_name: Ticker used in the CSV file name and in the ``'stock'``
            column of the result.
        data_dir: Directory containing the CSV. Defaults to the location the
            notebook was originally run from.
        batch_size: Number of sentences handed to the pipeline per call.

    Returns:
        DataFrame with columns ``'date'``, ``'stock'``, ``'sent'`` (predicted
        label) and ``'score'``, one row per unique sentence. The index is the
        one left by ``drop_duplicates`` (not reset). The ``'query'`` column
        produced by ``expand_comments_key`` is not part of the result.
    """
    file_path = f'{data_dir}/reddit_posts_{stock_name}.csv'
    data = pd.read_csv(file_path)
    expanded_data = expand_comments_key(data, stock_name)
    # Cap each sentence at 350 characters before de-duplication and scoring.
    expanded_data['comment'] = expanded_data['comment'].apply(lambda x: x[:350])

    # Keep the first occurrence of every sentence; the list below is therefore
    # row-aligned with final_data.
    final_data = expanded_data.drop_duplicates(subset=['comment'])
    unique_comments = final_data['comment'].tolist()

    # Load the model and tokenizer.
    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

    # Build the TensorFlow-based sentiment-analysis pipeline.
    sentiment_pipeline = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer, framework='tf')

    # Per-sentence outputs, in the same order as unique_comments.
    scores = []
    sent = []

    # Iterate over the de-duplicated sentences only. Ranging over the full
    # (duplicated) list sent empty trailing batches to the pipeline.
    for start in range(0, len(unique_comments), batch_size):
        batch = unique_comments[start:start + batch_size]
        for prediction in sentiment_pipeline(batch):
            scores.append(prediction['score'])
            sent.append(prediction['label'])

    result = pd.DataFrame({'date': final_data['date'], 'stock': stock_name, 'sent': sent, 'score': scores})

    return result



In [None]:
# reddit_posts_AAPL = reddit_posts('AAPL')
# reddit_posts_GOOGL = reddit_posts('GOOGL')
# reddit_posts_AMZN = reddit_posts('AMZN')
# reddit_posts_META = reddit_posts('META')
# reddit_posts_MSFT = reddit_posts('MSFT')
# reddit_posts_NVDA = reddit_posts('NVDA')
# reddit_posts_TSLA = reddit_posts('TSLA')
# reddit_keys_AAPL = reddit_keys('AAPL')
# reddit_keys_GOOGL = reddit_keys('GOOGL')
# reddit_keys_AMZN = reddit_keys('AMZN')
# reddit_keys_META = reddit_keys('META')
# reddit_keys_MSFT = reddit_keys('MSFT')
# reddit_keys_NVDA = reddit_keys('NVDA')
# reddit_keys_TSLA = reddit_keys('TSLA')

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.

In [None]:
# reddit_posts_AAPL.to_csv('result/reddit_post_AAPL_result.csv')
# reddit_posts_AMZN.to_csv('result/reddit_post_AMZN_result.csv')
# reddit_posts_GOOGL.to_csv('result/reddit_post_GOOGL_result.csv')
# reddit_posts_META.to_csv('result/reddit_post_META_result.csv')
# reddit_posts_MSFT.to_csv('result/reddit_post_MSFT_result.csv')
# reddit_posts_NVDA.to_csv('result/reddit_post_NVDA_result.csv')
# reddit_posts_TSLA.to_csv('result/reddit_post_TSLA_result.csv')
# reddit_keys_AAPL.to_csv('result/reddit_key_AAPL_result.csv')
# reddit_keys_AMZN.to_csv('result/reddit_key_AMZN_result.csv')
# reddit_keys_GOOGL.to_csv('result/reddit_key_GOOGL_result.csv')
# reddit_keys_META.to_csv('result/reddit_key_META_result.csv')
# reddit_keys_MSFT.to_csv('result/reddit_key_MSFT_result.csv')
# reddit_keys_NVDA.to_csv('result/reddit_key_NVDA_result.csv')
# reddit_keys_TSLA.to_csv('result/reddit_key_TSLA_result.csv')