In [None]:
!pip install torch
!pip install transformers
!pip install tqdm
!pip install tiktoken
!pip install blobfile
!pip install sentencepiece


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
!pip install numpy==1.24.4 --force-reinstall


Collecting numpy==1.24.4
  Downloading numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
blosc2 3.2.1 requires numpy>=1.26, but you have numpy 1.24.4 which is incompatible.
jax 0.5.2 requires numpy>=1.25, but you have numpy 1.24.4 which is incompatible.
pymc 5.21.1 requires numpy>=1.25.0, but you have numpy 1.24.4 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.

In [None]:
# Reinstall gensim and transformers together with all of their dependencies.
# NOTE(review): per the install log below, this re-resolves numpy to gensim's
# `numpy<2.0,>=1.18.5` range (numpy 1.26.4 is collected), which would supersede
# the numpy pin from the previous cell -- confirm that pin is still needed.
!pip install --upgrade --force-reinstall gensim transformers


Collecting gensim
  Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting transformers
  Using cached transformers-4.51.0-py3-none-any.whl.metadata (38 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Using cached scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Using cached smart_open-7.1.0-py3-none-any.whl.metadata (24 kB)
Collecting filelock (from transformers)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Using cached huggingface_hub-0.30.1-py3-none-any.whl.metadata (13 kB)
Collecting packaging>=20.0 (from transformers)
  Using cached packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Collecting pyyaml>=5.1 (fro

In [None]:
import logging

# Only let ERROR-and-above records through the Hugging Face `transformers`
# logger so informational messages and warnings do not clutter cell output.
_transformers_logger = logging.getLogger("transformers")
_transformers_logger.setLevel(logging.ERROR)

In [None]:
# code to augment data
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm

# 1. Load the labelled corpus and draw the rows that will be paraphrased.
df = pd.read_csv("final_sentiment_analysis_with_labels.csv")
sentiment = df["Sentiment Label"]
# Sampling is done with replacement, so a source row can be drawn more than once.
df_pos = df.loc[sentiment == "Positive"].sample(n=12500, replace=True, random_state=42)
df_neu = df.loc[sentiment == "Neutral"].sample(n=11667, replace=True, random_state=42)

# 2. Paraphrasing model and tokenizer, moved to the GPU when one is available.
model_name = "ramsrigouthamg/t5_paraphraser"  # or a smaller fine‑tuned T5
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model.to(device)
model.eval()  # switch the module to evaluation mode; it is only used for generation

# Mixed precision is used on CUDA only; the autocast helper is imported on demand.
use_fp16 = device.type == "cuda"
if use_fp16:
    from torch.cuda.amp import autocast
# 3. Batched paraphrasing
def paraphrase_batch(texts, label, num_return_sequences=2, batch_size=16,
                     max_length=60, num_beams=4):
    """Generate paraphrases for ``texts`` in mini-batches with the notebook's T5 model.

    Relies on the module-level ``tokenizer``, ``model``, ``device`` and
    ``use_fp16`` objects created earlier in this cell.

    Args:
        texts: Iterable of source sentences. Entries that are not strings, or
            that are blank after stripping (e.g. NaN read from a CSV), are
            skipped instead of crashing on ``.strip()``.
        label: Value written to the ``"Sentiment Label"`` column of every
            generated row; also shown in the progress-bar description.
        num_return_sequences: Number of generated sequences kept per input.
        batch_size: Number of sentences passed to each ``generate`` call.
        max_length: Token limit applied to both the encoded prompt and the
            generated sequence.
        num_beams: Beam width used by beam search.

    Returns:
        pandas.DataFrame with the columns ``"text"`` and ``"Sentiment Label"``,
        one row per generated sequence. The columns are present even when no
        rows were generated, so the result can always be concatenated.
    """
    sentences = [t.strip() for t in texts if isinstance(t, str) and t.strip()]

    # `temperature` is deliberately not passed: it only affects sampling, and
    # without do_sample=True generate() runs plain beam search and ignores it.
    generation_kwargs = dict(
        max_length=max_length,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        early_stopping=True,
    )

    rows = []
    for start in tqdm(range(0, len(sentences), batch_size), desc=f"Paraphrasing {label}"):
        batch = sentences[start:start + batch_size]
        prompts = ["paraphrase: " + sentence + " </s>" for sentence in batch]
        enc = tokenizer(prompts, padding=True, truncation=True,
                        max_length=max_length, return_tensors="pt")
        input_ids = enc.input_ids.to(device)
        attention_mask = enc.attention_mask.to(device)

        # torch.autocast(enabled=False) is a no-op, so one code path serves both
        # CPU and GPU and the deprecated torch.cuda.amp.autocast is not needed.
        with torch.no_grad(), torch.autocast(device_type=device.type, enabled=use_fp16):
            outputs = model.generate(
                input_ids,
                attention_mask=attention_mask,
                **generation_kwargs,
            )

        for paraphrase in tokenizer.batch_decode(outputs, skip_special_tokens=True):
            rows.append({"text": paraphrase.strip(), "Sentiment Label": label})

    return pd.DataFrame(rows, columns=["text", "Sentiment Label"])

# 4. Paraphrase the sampled rows: two variants per positive text, three per neutral.
positive_texts = df_pos["text"].tolist()
neutral_texts = df_neu["text"].tolist()
synthetic_pos = paraphrase_batch(positive_texts, "Positive", num_return_sequences=2)
synthetic_neu = paraphrase_batch(neutral_texts, "Neutral", num_return_sequences=3)

# 5. Append the synthetic rows to the original frame and write the result to disk.
frames_to_merge = (df, synthetic_pos, synthetic_neu)
df_final = pd.concat(frames_to_merge, ignore_index=True)
df_final.to_csv("augmented_sentiment_dataset.csv", index=False)
print("Saved to augmented_sentiment_dataset.csv")


  with autocast():
Paraphrasing Positive: 100%|██████████| 782/782 [23:55<00:00,  1.84s/it]
Paraphrasing Neutral: 100%|██████████| 730/730 [23:17<00:00,  1.91s/it]


Saved to augmented_sentiment_dataset.csv


In [None]:
#model training
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

from gensim.models import Word2Vec

# Load dataset
df = pd.read_csv("merged_augmented_sentiment_dataset.csv")
df.dropna(subset=["text", "Sentiment Label"], inplace=True)

# Label Encoding: map the string sentiment labels to integer class ids
label_enc = LabelEncoder()
df["label"] = label_enc.fit_transform(df["Sentiment Label"])

# Train/Val/Test split (64% / 16% / 20% of the rows). Both splits are
# stratified so every subset keeps the class proportions of the full dataset,
# matching the stratified splits used by the other evaluation cells.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.2, stratify=df["label"], random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)
# NOTE(review): the dataset presumably contains paraphrases of resampled source
# rows; a random row-level split can then put near-duplicates of a training text
# into the test set and inflate accuracy -- consider splitting by source sentence.

results = {}

### TF-IDF Features ###
# The vocabulary/IDF weights are fitted on the training split only and then
# applied unchanged to the validation and test splits.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)
X_test_tfidf = tfidf.transform(X_test)

# Stochastic estimators are seeded so that a re-run reproduces the table below.
models_tfidf = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Linear SVM": LinearSVC(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gaussian Naive Bayes": GaussianNB()
}

# The loop variable is named `clf` (not `model`) so it does not overwrite the
# global `model` that other cells of this notebook read.
for name, clf in models_tfidf.items():
    print(f"Training {name} with TF-IDF...")
    if name == "Gaussian Naive Bayes":
        # GaussianNB cannot consume scipy sparse matrices, so densify for it only.
        clf.fit(X_train_tfidf.toarray(), y_train)
        y_pred = clf.predict(X_test_tfidf.toarray())
    else:
        clf.fit(X_train_tfidf, y_train)
        y_pred = clf.predict(X_test_tfidf)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"{name} Accuracy: {acc:.4f}")

### Word2Vec + XGBoost ###
# Whitespace-tokenise every document; Word2Vec is fitted on the training split only.
X_train_tok = list(map(str.split, X_train))
X_test_tok = list(map(str.split, X_test))

w2v_model = Word2Vec(
    sentences=X_train_tok,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)

def get_avg_w2v(tokens_list, model, vector_size=100):
    """Embed each tokenised document as the mean of its known word vectors.

    Args:
        tokens_list: Iterable of token lists, one per document.
        model: Object exposing ``model.wv``, which must support ``word in wv``
            and ``wv[word]`` (e.g. a trained gensim ``Word2Vec``).
        vector_size: Dimensionality of the word vectors; used for the all-zero
            fallback vector and for the shape of an empty result.

    Returns:
        numpy.ndarray of shape ``(len(tokens_list), vector_size)``. Documents
        with no in-vocabulary token are represented by a zero vector. An empty
        ``tokens_list`` yields an array of shape ``(0, vector_size)`` rather
        than a 1-D empty array, so downstream code always sees a 2-D matrix.
    """
    keyed_vectors = model.wv
    zero_vector = np.zeros(vector_size)  # shared fallback; np.array() copies it below

    doc_vectors = []
    for tokens in tokens_list:
        known = [keyed_vectors[word] for word in tokens if word in keyed_vectors]
        doc_vectors.append(np.mean(known, axis=0) if known else zero_vector)

    if not doc_vectors:
        return np.empty((0, vector_size))
    return np.array(doc_vectors)

# Average-pooled Word2Vec features for the train and test splits.
X_train_w2v, X_test_w2v = (
    get_avg_w2v(tokenised, w2v_model) for tokenised in (X_train_tok, X_test_tok)
)

print("Training XGBoost with Word2Vec features...")
xgb_params = {"use_label_encoder": False, "eval_metric": 'mlogloss'}
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train_w2v, y_train)
y_pred_xgb = xgb_model.predict(X_test_w2v)
acc_xgb = accuracy_score(y_test, y_pred_xgb)
results["XGBoost + Word2Vec"] = acc_xgb
print(f"XGBoost + Word2Vec Accuracy: {acc_xgb:.4f}")

### Save to CSV ###
# One row per model, in the order the models were evaluated.
results_df = pd.DataFrame(
    {"Model": list(results.keys()), "Accuracy": list(results.values())}
)
results_df.to_csv("non_bert_model_accuracies.csv", index=False)
print("\nSaved to non_bert_model_accuracies.csv")
print(results_df)


Training Logistic Regression with TF-IDF...
Logistic Regression Accuracy: 0.7224
Training Linear SVM with TF-IDF...
Linear SVM Accuracy: 0.7234
Training Decision Tree with TF-IDF...
Decision Tree Accuracy: 0.7880
Training Random Forest with TF-IDF...
Random Forest Accuracy: 0.8465
Training Gaussian Naive Bayes with TF-IDF...
Gaussian Naive Bayes Accuracy: 0.5801
Training XGBoost with Word2Vec features...
XGBoost + Word2Vec Accuracy: 0.6859

Saved to non_bert_model_accuracies.csv
                  Model  Accuracy
0   Logistic Regression  0.722411
1            Linear SVM  0.723421
2         Decision Tree  0.788011
3         Random Forest  0.846542
4  Gaussian Naive Bayes  0.580102
5    XGBoost + Word2Vec  0.685935


In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Load dataset
df = pd.read_csv("merged_augmented_sentiment_dataset.csv")
df.dropna(subset=["text", "Sentiment Label"], inplace=True)

# Label Encoding: map the string sentiment labels to integer class ids
label_enc = LabelEncoder()
df["label"] = label_enc.fit_transform(df["Sentiment Label"])

X = df["text"].values
y = df["label"].values

# K-Fold setup: 5 shuffled folds that each preserve the class proportions
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-model list of fold accuracies, filled in by the loop below
results = {
    "Logistic Regression": [],
    "Linear SVM": [],
    "Decision Tree": [],
    "Random Forest": [],
    "Gaussian Naive Bayes": []
}

for fold, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print(f"\n--- Fold {fold} ---")
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # TF-IDF is re-fitted inside every fold so the held-out fold never
    # influences the vocabulary or the IDF weights.
    tfidf = TfidfVectorizer(max_features=5000)
    X_train_tfidf = tfidf.fit_transform(X_train)
    X_test_tfidf = tfidf.transform(X_test)

    # Fresh estimators per fold; stochastic ones are seeded for reproducibility.
    models_tfidf = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Linear SVM": LinearSVC(random_state=42),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "Gaussian Naive Bayes": GaussianNB()
    }

    # The loop variable is named `clf` (not `model`) so it does not overwrite
    # the global `model` that other cells of this notebook read.
    for name, clf in models_tfidf.items():
        print(f"Training {name}...")
        if name == "Gaussian Naive Bayes":
            # GaussianNB cannot consume scipy sparse matrices, so densify for it only.
            clf.fit(X_train_tfidf.toarray(), y_train)
            y_pred = clf.predict(X_test_tfidf.toarray())
        else:
            clf.fit(X_train_tfidf, y_train)
            y_pred = clf.predict(X_test_tfidf)
        results[name].append(accuracy_score(y_test, y_pred))

# Average Results: mean accuracy over the folds for every model
results_avg = {name: np.mean(accs) for name, accs in results.items()}
results_df = pd.DataFrame(list(results_avg.items()), columns=["Model", "Average Accuracy"])
results_df.to_csv("kfold_non_xgb_model_accuracies.csv", index=False)
print("\nSaved to kfold_non_xgb_model_accuracies.csv")
print(results_df)



--- Fold 1 ---
Training Logistic Regression...
Training Linear SVM...
Training Decision Tree...
Training Random Forest...
Training Gaussian Naive Bayes...

--- Fold 2 ---
Training Logistic Regression...
Training Linear SVM...
Training Decision Tree...
Training Random Forest...
Training Gaussian Naive Bayes...

--- Fold 3 ---
Training Logistic Regression...
Training Linear SVM...
Training Decision Tree...
Training Random Forest...
Training Gaussian Naive Bayes...

--- Fold 4 ---
Training Logistic Regression...
Training Linear SVM...
Training Decision Tree...
Training Random Forest...
Training Gaussian Naive Bayes...

--- Fold 5 ---
Training Logistic Regression...
Training Linear SVM...
Training Decision Tree...
Training Random Forest...
Training Gaussian Naive Bayes...

Saved to kfold_non_xgb_model_accuracies.csv
                  Model  Average Accuracy
0   Logistic Regression          0.730451
1            Linear SVM          0.731339
2         Decision Tree          0.803049
3      

In [None]:
# Install required packages
!pip install -U transformers
!pip install -U datasets



In [None]:
#bert
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Load dataset
df = pd.read_csv("merged_augmented_sentiment_dataset.csv")
df.dropna(subset=["text", "Sentiment Label"], inplace=True)

# Encode sentiment labels as integer class ids
label_enc = LabelEncoder()
df["label"] = label_enc.fit_transform(df["Sentiment Label"])

# Train-validation-test split (64% / 16% / 20% of the rows). Both splits are
# stratified so every subset keeps the class proportions of the full dataset,
# consistent with the stratified evaluation used elsewhere in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.2, stratify=df["label"], random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

# Custom Dataset
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes all texts up front and serves tensors.

    Args:
        texts: Texts to encode. May be a pandas Series / NumPy array (anything
            with ``.tolist()``) or any other iterable such as a plain list.
        labels: Class ids aligned with ``texts``; same accepted container types.
        tokenizer: Callable invoked as
            ``tokenizer(list_of_texts, truncation=True, padding=True, max_length=max_len)``
            that returns a mapping from field name to per-example sequences.
        max_len: Maximum sequence length forwarded to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.encodings = tokenizer(
            self._as_list(texts), truncation=True, padding=True, max_length=max_len
        )
        self.labels = self._as_list(labels)

    @staticmethod
    def _as_list(values):
        """Return ``values`` as a Python list, using ``.tolist()`` when available."""
        to_list = getattr(values, "tolist", None)
        return to_list() if callable(to_list) else list(values)

    def __getitem__(self, idx):
        # One tensor per tokenizer field, plus the label under the key 'labels'.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Prepare tokenizer and datasets
print("Preparing tokenizer and datasets...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = SentimentDataset(X_train, y_train, tokenizer)
val_dataset = SentimentDataset(X_val, y_val, tokenizer)
test_dataset = SentimentDataset(X_test, y_test, tokenizer)

# Load BERT model. The size of the classification head is taken from the fitted
# label encoder instead of a hard-coded 3, so it always matches the number of
# distinct sentiment labels present in the data.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=len(label_enc.classes_)
)

# Define training arguments.
# No evaluation/logging/save strategy is set, so the installed transformers
# version's defaults apply for those.
training_args = TrainingArguments(
    output_dir='./bert_results',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir='./logs'  # Kept logging_dir for basic logging
)

# Trainer
# NOTE(review): with no evaluation strategy configured, `eval_dataset` is
# presumably only used if `trainer.evaluate()` is called explicitly -- confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train
print("Training BERT...")
trainer.train()

# Evaluate on test set: the predicted class is the arg-max over the logits
print("Evaluating BERT on test set...")
preds = trainer.predict(test_dataset)
y_pred = np.argmax(preds.predictions, axis=1)
acc = accuracy_score(y_test, y_pred)
print(f"BERT Test Accuracy: {acc:.4f}")

# Save results
pd.DataFrame([["BERT", acc]], columns=["Model", "Accuracy"]).to_csv("bert_model_accuracy.csv", index=False)
print("Saved to bert_model_accuracy.csv")

In [None]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4313502 sha256=d6b3850584707644ba37d34a7408af75b119ab1e43fb0f43cc7dcad9e4a70acd
  Stored in directory: /root/.cache/pip/wheels/65/4f/35/5057db0249224e9ab55a51

In [None]:
#xgboost
# Colab-only: mount Google Drive at /content/drive so the pretrained fastText
# binary stored under MyDrive can be read further down in this cell.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import fasttext

# Read the dataset, drop rows lacking a text or a label, and add integer class ids.
label_enc = LabelEncoder()
df = (
    pd.read_csv("merged_augmented_sentiment_dataset.csv")
    .dropna(subset=["text", "Sentiment Label"])
)
df = df.assign(label=label_enc.fit_transform(df["Sentiment Label"]))

X = df["text"].to_list()
y = df["label"].to_list()

# Stratified 80/20 hold-out split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# ================== LOAD FASTTEXT ==================
print("Loading fastText model from Google Drive...")
ft_model_path = "/content/drive/MyDrive/cc.en.300.bin"
ft = fasttext.load_model(ft_model_path)

# ================== 1. Doc2Vec ==================
print("Training Doc2Vec...")
# Every training document is tagged with its position in X_train (as a string).
tagged_data = [
    TaggedDocument(words=document.split(), tags=[str(position)])
    for position, document in enumerate(X_train)
]
doc2vec_model = Doc2Vec(
    tagged_data,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    epochs=40,
)

def get_doc2vec_embeddings(texts, model):
    """Infer one document vector per text.

    Each text is split on whitespace and the resulting tokens are passed to
    ``model.infer_vector``. Returns a list holding the inferred vectors in the
    same order as ``texts``.
    """
    embeddings = []
    for document in texts:
        tokens = document.split()
        embeddings.append(model.infer_vector(tokens))
    return embeddings

# Inferred Doc2Vec vectors for the train split first, then the test split.
X_train_doc2vec, X_test_doc2vec = (
    get_doc2vec_embeddings(split_texts, doc2vec_model)
    for split_texts in (X_train, X_test)
)

print("Training XGBoost on Doc2Vec...")
doc2vec_xgb_params = {"use_label_encoder": False, "eval_metric": "mlogloss"}
xgb_doc2vec = XGBClassifier(**doc2vec_xgb_params)
xgb_doc2vec.fit(X_train_doc2vec, y_train)
y_pred_doc2vec = xgb_doc2vec.predict(X_test_doc2vec)
acc_doc2vec = accuracy_score(y_true=y_test, y_pred=y_pred_doc2vec)

# ================== 2. fastText ==================
print("Generating fastText embeddings...")
def get_fasttext_avg_vector(texts, ft_model):
    """Embed each text as the mean fastText vector of its alphabetic tokens.

    Args:
        texts: Iterable of strings; each is split on whitespace.
        ft_model: Object exposing ``get_word_vector(word)`` and
            ``get_dimension()`` (e.g. a loaded fastText model).

    Returns:
        numpy.ndarray of shape ``(len(texts), ft_model.get_dimension())``.
        Tokens for which ``str.isalpha()`` is false (digits, tokens with
        attached punctuation, ...) are ignored; a text with no remaining token
        is represented by a zero vector. An empty ``texts`` yields an array of
        shape ``(0, dim)`` rather than a 1-D empty array, so downstream code
        always sees a 2-D matrix.
    """
    dim = ft_model.get_dimension()
    zero_vector = np.zeros(dim)  # shared fallback; np.array() copies it below

    vectors = []
    for text in texts:
        vecs = [ft_model.get_word_vector(word) for word in text.split() if word.isalpha()]
        vectors.append(np.mean(vecs, axis=0) if vecs else zero_vector)

    if not vectors:
        return np.empty((0, dim))
    return np.array(vectors)

# Average-pooled fastText features for the train split first, then the test split.
X_train_ft, X_test_ft = (
    get_fasttext_avg_vector(split_texts, ft) for split_texts in (X_train, X_test)
)

# NOTE(review): the recorded run reports ~0.335 accuracy for this model, which is
# about chance level if the three classes are roughly balanced -- verify that the
# fastText vectors are finite and not all zero before trusting this number.
print("Training XGBoost on fastText...")
fasttext_xgb_params = {"use_label_encoder": False, "eval_metric": "mlogloss"}
xgb_ft = XGBClassifier(**fasttext_xgb_params)
xgb_ft.fit(X_train_ft, y_train)
y_pred_ft = xgb_ft.predict(X_test_ft)
acc_ft = accuracy_score(y_true=y_test, y_pred=y_pred_ft)

# ================== SAVE RESULTS ==================
# One row per embedding type, Doc2Vec first.
results_df = pd.DataFrame(
    {
        "Model": ["XGBoost + Doc2Vec", "XGBoost + fastText"],
        "Accuracy": [acc_doc2vec, acc_ft],
    }
)

results_df.to_csv("xgboost_doc2vec_fasttext_results.csv", index=False)

print("\n✅ Accuracy Results:")
print(results_df)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading fastText model from Google Drive...
Training Doc2Vec...
Training XGBoost on Doc2Vec...
Generating fastText embeddings...
Training XGBoost on fastText...

✅ Accuracy Results:
                Model  Accuracy
0   XGBoost + Doc2Vec  0.667677
1  XGBoost + fastText  0.335272


In [None]:
import pandas as pd

# Load the combined results table (update the path if the file lives elsewhere)
df = pd.read_csv("combined_model_accuracies.csv")

# Coalesce the 'Average Accuracy' and 'Accuracy' columns into a single
# 'Accuracy' column, taking 'Average Accuracy' wherever it is present and
# falling back to 'Accuracy' otherwise. Either column may be missing from the
# file; if both are missing, the column selection below raises KeyError.
if "Average Accuracy" in df.columns:
    if "Accuracy" in df.columns:
        df['Accuracy'] = df['Average Accuracy'].combine_first(df['Accuracy'])
    else:
        df['Accuracy'] = df['Average Accuracy']

# Keep only the two columns of interest; this drops 'Average Accuracy'
# together with any other column in the file.
df = df[['Model', 'Accuracy']]

# Save cleaned version
df.to_csv("cleaned_model_accuracies.csv", index=False)

print("✅ Cleaned file saved as 'cleaned_model_accuracies.csv'")
print(df)


✅ Cleaned file saved as 'cleaned_model_accuracies.csv'
                  Model  Accuracy
0   Logistic Regression  0.730451
1            Linear SVM  0.731339
2         Decision Tree  0.803049
3         Random Forest  0.858302
4  Gaussian Naive Bayes  0.592137
5     XGBoost + Doc2Vec  0.667677
6    XGBoost + fastText  0.335272
