In [3]:
!pip install transformers torch --quiet

In [1]:
import boto3
import pandas as pd
import io

# Create the S3 client
s3 = boto3.client('s3')

# Location of the cleaned Parquet part file
bucket_name = 'aws-nlp-project'
key = 'cleaned_csv/part-00000-7c869441-b25d-4d84-8189-4bdfe52112a4-c000.snappy.parquet'

# Download the object and buffer its full contents in memory
obj = s3.get_object(Bucket=bucket_name, Key=key)
raw_bytes = obj['Body'].read()
data = io.BytesIO(raw_bytes)

# Parse the in-memory buffer into a pandas DataFrame
df = pd.read_parquet(data, engine='pyarrow')  # or engine='fastparquet'

# Sanity check: show the first rows
preview = df.head()
print(preview)


           created_utc                                          full_text  \
0  2025-03-08T14:47:17  Time to Shake Things Up in Our SubGot Ideas Sh...   
1  2025-04-18T07:50:00  of ultrasound examinations performed by traine...   
2  2025-04-17T11:02:16  This College Protester Isnt Real Its an AIPowe...   
3  2025-04-17T15:37:28  What are some of your biggest fears regarding ...   
4  2025-04-17T19:56:52  An AI bot just used the name I use on other pl...   

                                      filtered_words  
0  [Time, Shake, Things, SubGot, Ideas, Share, Th...  
1  [ultrasound, examinations, performed, trained,...  
2  [College, Protester, Isnt, Real, AIPowered, Un...  
3  [biggest, fears, regarding, exponential, growt...  
4  [AI, bot, used, name, use, platforms, previous...  


In [4]:
from transformers import pipeline

# 1. Sentiment (POSITIVE/NEGATIVE).
# The model and revision are pinned explicitly; they are the same ones the
# library falls back to when no model is given, so results are unchanged but
# stay reproducible if the library's default for this task ever changes.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)

# 2. Emotion classifier (joy, anger, sadness, fear, etc.)
emotion_pipeline = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    return_all_scores=False  # keep only the top-scoring label per text
)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cpu


In [5]:
# Requires the `full_text` column; every value is cast to str before inference
texts = df['full_text'].astype(str).tolist()

# Run both classifiers over the same texts, truncating inputs that are too long:
# first the positive/negative sentiment model, then the emotion model
sentiment_results = sentiment_pipeline(texts, truncation=True)
emotion_results = emotion_pipeline(texts, truncation=True)

# Attach each model's predicted label and score to the DataFrame as new columns
for label_col, score_col, predictions in (
    ('sentiment_label', 'sentiment_score', sentiment_results),
    ('emotion_label', 'emotion_score', emotion_results),
):
    df[label_col] = [pred['label'] for pred in predictions]
    df[score_col] = [pred['score'] for pred in predictions]

In [6]:
# Preview the first five rows, including the newly added prediction columns
df.head(n=5)

Unnamed: 0,created_utc,full_text,filtered_words,sentiment_label,sentiment_score,emotion_label,emotion_score
0,2025-03-08T14:47:17,Time to Shake Things Up in Our SubGot Ideas Sh...,"[Time, Shake, Things, SubGot, Ideas, Share, Th...",POSITIVE,0.997226,neutral,0.46173
1,2025-04-18T07:50:00,of ultrasound examinations performed by traine...,"[ultrasound, examinations, performed, trained,...",NEGATIVE,0.991565,neutral,0.472881
2,2025-04-17T11:02:16,This College Protester Isnt Real Its an AIPowe...,"[College, Protester, Isnt, Real, AIPowered, Un...",NEGATIVE,0.998037,anger,0.4438
3,2025-04-17T15:37:28,What are some of your biggest fears regarding ...,"[biggest, fears, regarding, exponential, growt...",NEGATIVE,0.987808,fear,0.983329
4,2025-04-17T19:56:52,An AI bot just used the name I use on other pl...,"[AI, bot, used, name, use, platforms, previous...",NEGATIVE,0.998116,neutral,0.78796


In [8]:
# Write the annotated frame to a local CSV without the index column.
# NOTE(review): `filtered_words` appears to hold list-like values; CSV will
# store them as their string representation — confirm downstream readers expect that.
df.to_csv(path_or_buf="reddit_sentiments.csv", index=False)

In [9]:
s3 = boto3.client("s3")
bucket_name = "aws-nlp-project"

# Upload the local CSV to the sentiment_output/ prefix of the bucket
s3.upload_file(
    Filename="reddit_sentiments.csv",
    Bucket=bucket_name,
    Key="sentiment_output/reddit_sentiments.csv",
)