In [33]:
!pip install datasets



In [34]:
import pandas as pd
import torch
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, AutoModel, AutoTokenizer, AutoModelForSequenceClassification

nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
data = pd.read_csv('/content/news_data_sentence.csv', encoding='latin-1')
data.head()

Unnamed: 0,sentences,aspects,labels
0,"Ben Horowitz, A16z cofounder, said in July tha...",Donald Trump,2
1,An email to A16z employees now reveals that he...,Donald Trump,0
2,The investor wrote that he and his wife will b...,Donald Trump,0
3,Sign up to get the inside scoop on todayâs b...,Donald Trump,1
4,Read preview Thanks for signing up!,Donald Trump,1


In [53]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentences  1000 non-null   object
 1   aspects    1000 non-null   object
 2   labels     1000 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 23.6+ KB


In [54]:
for i in range(21, 30):
  print(data.loc[i, 'sentences'])

Read the email below:
Ben Horowitz, A16z cofounder, said in July that he plans to support former President Donald Trump's campaign.
An email to A16z employees now reveals that he plans to donate to Vice President Kamala Harris.
The investor wrote that he and his wife will be making a "significant donation" to support Harris.
Sign up to get the inside scoop on todayâs biggest stories in markets, tech, and business â delivered daily.
Read preview Thanks for signing up!
Go to newsletter preferences Thanks for signing up!
Access your favorite topics in a personalized feed while you're on the go.
download the app Email address Sign up By clicking âSign Upâ, you accept our Terms of Service and Privacy Policy .


In [55]:
data['sentences'] = data['sentences'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
data['sentences'] = data['sentences'].apply(lambda x: x.lower())

In [56]:
for i in range(21, 30):
  print(data.loc[i, 'sentences'])

read the email below
ben horowitz a16z cofounder said in july that he plans to support former president donald trumps campaign
an email to a16z employees now reveals that he plans to donate to vice president kamala harris
the investor wrote that he and his wife will be making a significant donation to support harris
sign up to get the inside scoop on todayâs biggest stories in markets tech and business â delivered daily
read preview thanks for signing up
go to newsletter preferences thanks for signing up
access your favorite topics in a personalized feed while youre on the go
download the app email address sign up by clicking âsign upâ you accept our terms of service and privacy policy 


In [57]:
from torch.utils.data import Dataset

class AspectSentimentDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        text, aspect, sentiment = item["text"], item["aspect"], item["sentiment"]
        inputs = self.tokenizer(
            f"{text} [SEP] {aspect}",
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": torch.tensor(sentiment, dtype=torch.long),
        }

# Create dataset
# dataset = AspectSentimentDataset(dataset, tokenizer, max_length=128)

def compute_metrics(p):
    predictions, labels = p
    # Convert logits to predicted class
    pred = predictions.argmax(axis=-1)
    # Compute precision, recall, and F1 score
    precision, recall, f1, _ = precision_recall_fscore_support(labels, pred, average='weighted')

    # Return the metrics in a dictionary
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


In [58]:
from transformers import AutoTokenizer

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [59]:
dataset = []

for index, row in data.iloc[:1000].iterrows():
    text = row['sentences']
    aspect = row['aspects']
    sentiment = row['labels']
    dataset.append({'text': text, 'aspect': aspect, 'sentiment': sentiment})

In [60]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, random_split

# Split data into train (80%), validation (10%), and test (10%)
train_data, temp_data = train_test_split(dataset, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

train_dataset = AspectSentimentDataset(train_data, tokenizer, max_length=512)
val_dataset = AspectSentimentDataset(val_data, tokenizer, max_length=512)
test_dataset = AspectSentimentDataset(test_data, tokenizer, max_length=512)

len(train_data), len(val_data), len(test_data)

(800, 100, 100)

In [61]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    # weight_decay=0.01,
    learning_rate=5e-5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,1.0576,0.959,0.624429,0.61,0.614993
2,0.8066,0.86777,0.614676,0.6,0.604354
3,0.5964,0.840031,0.650883,0.65,0.650355


TrainOutput(global_step=75, training_loss=0.8328425852457683, metrics={'train_runtime': 260.3213, 'train_samples_per_second': 9.219, 'train_steps_per_second': 0.288, 'total_flos': 631472202547200.0, 'train_loss': 0.8328425852457683, 'epoch': 3.0})

In [62]:
# After training, you can evaluate the model on the test set
eval_results = trainer.evaluate(test_dataset)
print(f"Precision: {eval_results['eval_precision']}")
print(f"Recall: {eval_results['eval_recall']}")
print(f"F1 Score: {eval_results['eval_f1']}")

Precision: 0.7462507934675977
Recall: 0.74
F1 Score: 0.7403808095952025


Beberapa percobaan yang dilakukan:

1400 data batch 32 epoch 10<br>
Precision: 0.5802468487394958 <br>
Recall: 0.5785714285714286 <br>
F1 Score: 0.5783197754626326 <br><br>

1700 data batch 32 epoch 10<br>
Precision: 0.6016708072629126 <br>
Recall: 0.5847953216374269 <br>
F1 Score: 0.5887521620953793 <br><br>

2000 data batch 32 epoch 10<br>
Precision: 0.6556288936627283 <br>
Recall: 0.655 <br>
F1 Score: 0.6532557759863087 <br><br>

2000 data batch 16 epoch 3 <br>
Precision: 0.6617769871106337 <br>
Recall: 0.65 <br>
F1 Score: 0.6496530784538384 <br><br>

2000 data batch 16 epoch 3 weight decay 0.01<br>
Precision: 0.6634589160839162 <br>
Recall: 0.66 <br>
F1 Score: 0.6608911976911976 <br><br>

1000 data batch 32 epoch 3 weight decay 0.01 <br>
Precision: 0.7163939061688499 <br>
Recall: 0.71 <br>
F1 Score: 0.710528635682159 <br><br>

best model <br>
1000 data batch 32 epoch 3 <br>
Precision: 0.7462507934675977 <br>
Recall: 0.74 <br>
F1 Score: 0.7403808095952025 <br><br>





In [63]:
trainer.save_model("./sentiment_analysis")
tokenizer.save_pretrained("./sentiment_analysis")

('./sentiment_analysis/tokenizer_config.json',
 './sentiment_analysis/special_tokens_map.json',
 './sentiment_analysis/vocab.txt',
 './sentiment_analysis/added_tokens.json')

# Predict using finetuned model

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
import torch

model_name = "/content/best model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Define function to predict sentiment for a single sentence
def predict_sentence(sentences, aspect, max_length=128):
    results = []
    for sentence in sentences:
        # Tokenize the sentence with aspect
        inputs = tokenizer(
            f"{sentence} [SEP] {aspect}",
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt",
        )

        # Make prediction
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            predicted_label = torch.argmax(logits, dim=-1).item()
            results.append(predicted_label)

    return results

In [65]:
from collections import Counter

# Aggregate results using majority voting
def aggregate_predictions(predictions):
    # Count frequency of each sentiment
    sentiment_counts = Counter(predictions)
    # Find the most frequent sentiment
    document_sentiment = sentiment_counts.most_common(1)[0][0]
    return document_sentiment

# Define function to predict sentiment for a document
def predict_document(text, aspect):
    result_dict = {0: 'Negative',
                   1: 'Neutral',
                   2: 'Positive'}

    # Split the text into sentences
    sentences = nltk.sent_tokenize(text)

    # Get sentence-level predictions
    sentence_predictions = predict_sentence(sentences, aspect)
    sentence_prediction_map = [result_dict[x] for x in sentence_predictions]

    # Aggregate sentence-level predictions to document-level
    document_sentiment = aggregate_predictions(sentence_predictions)

    return {
        "sentence_predictions": sentence_prediction_map,
        "document_sentiment": result_dict[document_sentiment],
    }


In [70]:
# Example news text and aspect
news_text = """
Donald Trump has demonstrated a strong ability to connect with a significant portion of the American electorate, using his business acumen and direct communication style to resonate with many voters. His administration's focus on deregulation and tax reform received praise from supporters for spurring economic growth in certain sectors.

On the other hand, Kamala Harris has faced criticism for perceived inconsistencies in her policy positions, which some argue undermine her credibility. Her record as a prosecutor has been scrutinized for decisions that critics say disproportionately impacted marginalized communities. Additionally, detractors have pointed to her communication style as occasionally lacking clarity, potentially causing misunderstandings about her policy intentions.
"""

# aspect = "Donald Trump"
aspect = "Kamala Harris"

# Predict sentiment for the document
result = predict_document(news_text, aspect)
print(f"Aspect: {aspect}")
print("Sentence-Level Predictions:", result["sentence_predictions"])
print("News-Level Sentiment:", result["document_sentiment"])


Aspect: Kamala Harris
Sentence-Level Predictions: ['Positive', 'Positive', 'Negative', 'Negative', 'Negative']
News-Level Sentiment: Negative
