In [None]:
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

df = pd.read_csv("/content/final_train.csv")

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^a-z\s]', '', text)
        words = word_tokenize(text)
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
        return ' '.join(words)
    else:
        return ''

df['cleaned_post_body'] = df['Post Body'].apply(preprocess_text)
df.to_csv('lemmatized_final_train.csv', index=False)

print("Preprocessing complete! Results saved to 'lemmatized_final_train.csv'.")

Preprocessing complete! Results saved to 'lemmatized_final_train.csv'.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
import pandas as pd
from transformers import pipeline

df = pd.read_csv('lemmatized_final_train.csv')

if 'label' not in df.columns:
    df['label'] = df['cleaned_post_body'].apply(lambda x: 'positive' if isinstance(x, str) and 'good' in x else 'negative')

models = {
    "bert-base-uncased": "BERT",
    "roberta-base": "RoBERTa",
    "distilbert-base-uncased-finetuned-sst-2-english": "DistilBERT"
}

accuracy_results = {}

for model_name in models:
    classifier = pipeline("sentiment-analysis", model=model_name)
    filtered_df = df[df['cleaned_post_body'].apply(lambda x: isinstance(x, str) and len(x) > 0)]
    predictions = classifier(filtered_df['cleaned_post_body'].tolist())
    pred_labels = [1 if pred['label'] == 'POSITIVE' else 0 for pred in predictions]
    true_labels = filtered_df['label'].apply(lambda x: 1 if x == 'positive' else 0).tolist()
    correct = sum(true == pred for true, pred in zip(true_labels, pred_labels))
    accuracy = correct / len(true_labels) if len(true_labels) > 0 else 0
    accuracy_results[model_name] = accuracy
    print(f"{model_name} accuracy: {accuracy:.4f}")

best_model = max(accuracy_results, key=accuracy_results.get)
print(f"\nBest model: {best_model} with accuracy {accuracy_results[best_model]:.4f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


bert-base-uncased accuracy: 0.9444
roberta-base accuracy: 0.9444
distilbert-base-uncased-finetuned-sst-2-english accuracy: 0.6889

Best model: bert-base-uncased with accuracy 0.9444


In [None]:
import pandas as pd
from transformers import pipeline

df = pd.read_csv('lemmatized_final_train.csv')

classifier = pipeline('sentiment-analysis', model="bert-base-uncased")

df['sentiment'] = df['cleaned_post_body'].apply(lambda x: classifier(x)[0]['label'] if isinstance(x, str) and x else None)

df.to_csv('bert_sentiment_results.csv', index=False)

print("Sentiment analysis complete! Results saved to 'bert_sentiment_results.csv'.")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Sentiment analysis complete! Results saved to 'bert_sentiment_results.csv'.
