In [None]:
import pandas as pd
import numpy as np
import torch
import transformers as tfm
import spacy

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Load the English stopwords.
# NOTE: this rebinds `stopwords` from the NLTK corpus reader imported above to
# a plain set of words. The name is kept because the filter below uses it
# (and later cells may too -- confirm before renaming).
stopwords = set(stopwords.words('english'))

# Load the news category dataset (JSON Lines: one record per line)
df = pd.read_json('/content/Sample_News_Category.json', lines=True)

# Preprocess the text data: headline + description, lower-cased, punctuation
# stripped, stop words removed.
# Missing headlines/descriptions are treated as empty strings; otherwise a
# single NaN would make the whole text NaN and crash `x.split()` below.
df['text'] = df['headline'].fillna('') + ". " + df['short_description'].fillna('')
df['text'] = df['text'].str.lower()
# `regex=True` must be explicit: the implicit default is what the recorded
# FutureWarning is about, and with regex=False the pattern would be matched
# literally, leaving all punctuation in place. The raw string avoids the
# invalid `\w` / `\s` escape sequences.
df['text'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)
df['text'] = df['text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stopwords)
)

# Aspect extraction: every noun chunk spaCy finds in the cleaned text is
# treated as one aspect of that article.
nlp = spacy.load("en_core_web_sm")


def _noun_chunk_texts(text):
    """Return the surface text of each noun chunk spaCy detects in `text`."""
    parsed = nlp(text)
    return [chunk.text for chunk in parsed.noun_chunks]


df['aspects'] = df['text'].apply(_noun_chunk_texts)

# BERT tokenization of the cleaned article texts: sequences are padded to a
# common length, truncated to at most 256 tokens, and returned as PyTorch
# tensors together with their attention masks.
tokenizer = tfm.BertTokenizer.from_pretrained('bert-base-uncased')
encode_options = dict(
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=256,
    return_tensors='pt',
)
encoded_data = tokenizer.batch_encode_plus(df['text'].values, **encode_options)

# Load the pre-trained BERT encoder with a 3-class sequence-classification head.
# NOTE: the 'bert-base-uncased' checkpoint does not contain that head, so it is
# newly initialised (the load log reports weights "not initialized from the
# model checkpoint"). Its predictions are not meaningful until the model has
# been fine-tuned on labelled sentiment data.
model = tfm.BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# `transformers.AdamW` is deprecated in favour of PyTorch's implementation.
# weight_decay=0.0 preserves the default of the transformers class
# (torch.optim.AdamW would otherwise default to 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)

# Predict the sentiment for each aspect.
# For every article, each aspect is classified on its own (argmax over the
# model's logits gives a class id) and the article's score is the mean of
# those class ids.
model.eval()  # inference only: make sure dropout is disabled

sentiments = []
for aspects in df['aspects']:
    if not aspects:
        # No noun chunks -> no score. NaN is what np.mean([]) would yield,
        # but without the "mean of empty slice" RuntimeWarning.
        sentiments.append(np.nan)
        continue

    # Encode all aspects of this article as one padded batch. Calling the
    # tokenizer adds the [CLS]/[SEP] special tokens that BERT's classification
    # head expects (it pools the first token), and the attention mask it
    # returns hides the padding positions.
    batch = tokenizer(
        list(aspects),
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors='pt',
    )
    with torch.no_grad():
        logits = model(**batch)[0]

    aspect_sentiments = logits.argmax(dim=-1).tolist()
    sentiments.append(np.mean(aspect_sentiments))

# Add the sentiments to the DataFrame
df['sentiment'] = sentiments


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
  df['text'] = df['text'].str.replace('[^\w\s]','')


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# Show the extracted aspects, one article's list per line.
for aspect_list in df['aspects'].tolist():
  print(aspect_list)

['4 million americans', 'sleeves', 'covid boosters health experts', 'demand', '171 million doses', 'us', 'fall']
['american airlines flyer', 'banned life punching flight attendant video subdued passengers crew', 'back aircraft confrontation', 'us attorneys office los angeles']
['23 funniest tweets cats dogs', '1723 dog']
['funniest tweets parents', 'grownup toothpaste toddlers', 'toothbrush', 'teeth carolina reaper', 'tabasco sauce']
['woman', 'cops', 'black birdwatcher', 'lawsuit exemployer amy cooper', 'investment firm franklin templeton', 'branding racist video central park encounter']
['cleaner dead belk bathroom 4 days body', 'police 63yearold woman', 'south carolina store', 'dead monday family', 'missing authorities']
['reporter', 'adorable surprise boyfriend', 'live tv', 'who', 'anchor new yorks pix11', 'journalist michelle ross']
['puerto ricans', 'water hurricane fionas', 'half million people', 'water service', 'three days storm', 'us territory']
['complexity child immigrants 

In [None]:
# Show the mean aspect sentiment score of every article, one per line.
for score in df['sentiment'].tolist():
  print(score)

1.5714285714285714
1.5
2.0
1.4
1.6666666666666667
1.2
1.1666666666666667
1.0
2.0
1.4
1.6
1.4
1.4
1.6666666666666667
1.1666666666666667
1.7142857142857142
1.5
1.4
1.1666666666666667
1.8
1.3333333333333333
1.6
1.2
2.0
1.6
1.3333333333333333
1.25
1.6
1.8
1.6666666666666667
1.625
1.6666666666666667
1.5
1.3333333333333333
1.0
1.25
1.6666666666666667
1.3333333333333333
1.25
2.0
1.4285714285714286
1.5714285714285714
1.3333333333333333
1.75
1.5
1.4
1.8333333333333333
1.0
1.2857142857142858
1.6


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m71.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m99.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1


In [None]:
# Show the cleaned text of every article, one per line.
for cleaned_text in df['text'].tolist():
  print(cleaned_text)

4 million americans roll sleeves omicrontargeted covid boosters health experts said early predict whether demand would match 171 million doses new boosters us ordered fall
american airlines flyer charged banned life punching flight attendant video subdued passengers crew fled back aircraft confrontation according us attorneys office los angeles
23 funniest tweets cats dogs week sept 1723 dog dont understand could eaten
funniest tweets parents week sept 1723 accidentally put grownup toothpaste toddlers toothbrush screamed like cleaning teeth carolina reaper dipped tabasco sauce
woman called cops black birdwatcher loses lawsuit exemployer amy cooper accused investment firm franklin templeton unfairly firing branding racist video central park encounter went viral
cleaner dead belk bathroom 4 days body found police 63yearold woman seen working south carolina store thursday found dead monday family reported missing authorities said
reporter gets adorable surprise boyfriend live tv whos behi