In [None]:
!pip install torch
!pip install transformers
!pip install tqdm
!pip install tiktoken
!pip install blobfile
!pip install sentencepiece


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
!pip install numpy==1.24.4 --force-reinstall


Collecting numpy==1.24.4
  Using cached numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Using cached numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.24.4
    Uninstalling numpy-1.24.4:
      Successfully uninstalled numpy-1.24.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pymc 5.21.2 requires numpy>=1.25.0, but you have numpy 1.24.4 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.24.4 which is incompatible.
jax 0.5.2 requires numpy>=1.25, but you have numpy 1.24.4 which is incompatible.
blosc2 3.2.1 requires numpy>=1.26, but you have numpy 1.24.4 which is incompatible.
jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.24.4 which

In [None]:
!pip install --upgrade --force-reinstall gensim transformers


Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting transformers
  Downloading transformers-4.51.1-py3-none-any.whl.metadata (38 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.1.0-py3-none-any.whl.metadata (24 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hu

In [None]:
import logging

from transformers.utils import logging as hf_logging

# Silence everything below ERROR from the transformers library.
# This goes through the library's own verbosity API rather than
# logging.getLogger("transformers").setLevel(...): with the stdlib call the
# tokenizer's "legacy behaviour" warning was still printed when the model
# was loaded later in this notebook.
hf_logging.set_verbosity(logging.ERROR)

In [None]:
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm

# 1. Load the original dataset
df = pd.read_csv("sentiment_analysis_mahakumbh.csv")

# 2. Define target sample size per class
target_per_class = 8000

# 3. Count existing samples (each class is filtered once and reused in step 5)
positive_rows = df[df['Sentiment Label'] == 'Positive']
negative_rows = df[df['Sentiment Label'] == 'Negative']
count_pos = len(positive_rows)
count_neg = len(negative_rows)

# 4. Calculate how many more samples to generate.
# Clamp at zero: a class that already meets or exceeds the target needs no
# augmentation, and a negative size would make DataFrame.sample raise.
needed_pos = max(target_per_class - count_pos, 0)
needed_neg = max(target_per_class - count_neg, 0)

# 5. Sample half the number of texts (since each will generate 2 paraphrases).
# Floor division means an odd shortfall ends up one row short of the target.
# Sampling with replacement is used only when a class has fewer rows than are
# needed; otherwise every source text is distinct, which avoids feeding the
# same sentence to the paraphraser several times.
n_pos_sources = needed_pos // 2
n_neg_sources = needed_neg // 2
df_pos = positive_rows.sample(n_pos_sources, random_state=42, replace=n_pos_sources > count_pos)
df_neg = negative_rows.sample(n_neg_sources, random_state=42, replace=n_neg_sources > count_neg)

# 6. Load paraphrasing model & tokenizer
model_name = "ramsrigouthamg/t5_paraphraser"
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the weights, move them to the chosen device and switch to eval mode.
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device).eval()

# Mixed precision is only enabled when running on CUDA; the autocast context
# manager is imported lazily so CPU-only sessions never touch it.
use_fp16 = (device.type == "cuda")
if use_fp16:
    from torch.cuda.amp import autocast

# 7. Define paraphrasing function
def paraphrase_batch(texts, label, num_return_sequences=2, batch_size=16,
                     max_length=60, num_beams=4):
    """Generate beam-search paraphrases for a list of texts.

    Relies on the notebook-level ``tokenizer``, ``model``, ``device`` and
    ``use_fp16`` objects, and on ``tqdm`` for the progress bar.

    Args:
        texts: Iterable of source sentences. Entries that are not strings
            (e.g. NaN from a CSV) or that are blank after stripping are skipped.
        label: Value written to the "Sentiment Label" column of every
            generated row; also shown in the progress-bar description.
        num_return_sequences: Paraphrases to keep per source sentence.
        batch_size: Number of source sentences sent to the model at once.
        max_length: Token limit applied both to the tokenized prompt
            (truncation) and to the generated output.
        num_beams: Beam width used by ``model.generate``.

    Returns:
        pandas.DataFrame with columns "text" and "Sentiment Label", one row
        per generated paraphrase. The columns are present even when nothing
        was generated.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if
            ``num_return_sequences`` exceeds ``num_beams`` (beam search cannot
            return more sequences than it has beams).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if num_return_sequences > num_beams:
        raise ValueError("num_return_sequences must not exceed num_beams")

    columns = ["text", "Sentiment Label"]

    # Keep only usable inputs; calling .strip() on a NaN would raise.
    sources = [t.strip() for t in texts if isinstance(t, str) and t.strip()]
    if not sources:
        return pd.DataFrame(columns=columns)

    rows = []
    for i in tqdm(range(0, len(sources), batch_size), desc=f"Paraphrasing {label}"):
        prompts = ["paraphrase: " + t + " </s>" for t in sources[i:i + batch_size]]
        enc = tokenizer(prompts, padding=True, truncation=True,
                        max_length=max_length, return_tensors="pt")
        input_ids = enc.input_ids.to(device)
        attention_mask = enc.attention_mask.to(device)

        # torch.autocast replaces the deprecated torch.cuda.amp.autocast; with
        # enabled=False (CPU runs) it is a no-op, so one generate call serves
        # both the mixed-precision and the full-precision path.
        with torch.no_grad(), torch.autocast(device_type=device.type, enabled=use_fp16):
            # No `temperature` here: it only affects sampling-based decoding,
            # and this call uses plain beam search.
            outputs = model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
                num_beams=num_beams,
                num_return_sequences=num_return_sequences,
                early_stopping=True,
            )

        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        rows.extend({"text": sentence.strip(), "Sentiment Label": label} for sentence in decoded)

    return pd.DataFrame(rows, columns=columns)

# 8. Generate synthetic data (positive class first, then negative)
synthetic_pos, synthetic_neg = (
    paraphrase_batch(source_df["text"].tolist(), sentiment, num_return_sequences=2)
    for source_df, sentiment in ((df_pos, "Positive"), (df_neg, "Negative"))
)

# 9. Combine original and augmented data into one frame with a fresh index
frames = [df, synthetic_pos, synthetic_neg]
df_final = pd.concat(frames, ignore_index=True)

# 10. Save to CSV
df_final.to_csv("balanced_augmented_sentiment_mahakumbh.csv", index=False)
print("✅ Saved to balanced_augmented_sentiment_mahakumbh.csv")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]


  with autocast():

Paraphrasing Positive:   8%|▊         | 1/12 [00:05<00:58,  5.31s/it][A
Paraphrasing Positive:  17%|█▋        | 2/12 [00:07<00:32,  3.22s/it][A
Paraphrasing Positive:  25%|██▌       | 3/12 [00:08<00:22,  2.55s/it][A
Paraphrasing Positive:  33%|███▎      | 4/12 [00:11<00:19,  2.42s/it][A
Paraphrasing Positive:  42%|████▏     | 5/12 [00:12<00:15,  2.16s/it][A
Paraphrasing Positive:  50%|█████     | 6/12 [00:14<00:12,  2.02s/it][A
Paraphrasing Positive:  58%|█████▊    | 7/12 [00:16<00:09,  1.92s/it][A
Paraphrasing Positive:  67%|██████▋   | 8/12 [00:17<00:07,  1.85s/it][A
Paraphrasing Positive:  75%|███████▌  | 9/12 [00:19<00:05,  1.81s/it][A
Paraphrasing Positive:  83%|████████▎ | 10/12 [00:21<00:03,  1.86s/it][A
Paraphrasing Positive:  92%|█████████▏| 11/12 [00:23<00:01,  1.93s/it][A
Paraphrasing Positive: 100%|██████████| 12/12 [00:25<00:00,  2.11s/it]
Paraphrasing Negative: 100%|██████████| 190/190 [05:54<00:00,  1.86s/it]

✅ Saved to balanced_augmented_sentiment_mahakumbh.csv





In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Seed shared by the splitter and every estimator that accepts one, so a
# Restart & Run All reproduces the same accuracies.
SEED = 42

# Load dataset.
# Rows with a missing text or label are dropped. Exact duplicate
# (text, label) rows are removed as well, so an identical row cannot sit in
# both the training and the test part of a fold.
# NOTE(review): the CSV has no column linking a paraphrase to its source
# sentence, so near-duplicates (a sentence and its paraphrase) can still be
# split across folds. The accuracies below are therefore likely optimistic;
# confirm on a hold-out set of original, non-augmented rows.
df = (
    pd.read_csv("balanced_augmented_sentiment_mahakumbh.csv")
    .dropna(subset=["text", "Sentiment Label"])
    .drop_duplicates(subset=["text", "Sentiment Label"])
)

# Label Encoding
label_enc = LabelEncoder()
df["label"] = label_enc.fit_transform(df["Sentiment Label"])

X = df["text"].values
y = df["label"].values

# K-Fold setup
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# Per-model list of fold accuracies
results = {
    "Logistic Regression": [],
    "Linear SVM": [],
    "Decision Tree": [],
    "Random Forest": [],
    "Gaussian Naive Bayes": []
}

for fold, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print(f"\n--- Fold {fold} ---")
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # TF-IDF is fitted on the training part only, then applied to the test part
    tfidf = TfidfVectorizer(max_features=5000)
    X_train_tfidf = tfidf.fit_transform(X_train)
    X_test_tfidf = tfidf.transform(X_test)

    # Fresh estimators for every fold so nothing carries over between folds
    models_tfidf = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Linear SVM": LinearSVC(random_state=SEED),
        "Decision Tree": DecisionTreeClassifier(random_state=SEED),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=SEED),
        "Gaussian Naive Bayes": GaussianNB()
    }

    for name, model in models_tfidf.items():
        print(f"Training {name}...")
        if name == "Gaussian Naive Bayes":
            # GaussianNB is given dense arrays, so the sparse TF-IDF matrices
            # are densified for this model only
            model.fit(X_train_tfidf.toarray(), y_train)
            y_pred = model.predict(X_test_tfidf.toarray())
        else:
            model.fit(X_train_tfidf, y_train)
            y_pred = model.predict(X_test_tfidf)
        results[name].append(accuracy_score(y_test, y_pred))

# Average Results
results_avg = {model: np.mean(accs) for model, accs in results.items()}
results_df = pd.DataFrame(list(results_avg.items()), columns=["Model", "Average Accuracy"])
results_df.to_csv("kfold_non_xgb_model_mahakumbh_accuracies.csv", index=False)
print("\n✅ Saved to kfold_non_xgb_model_mahakumbh_accuracies.csv")
print(results_df)



--- Fold 1 ---
Training Logistic Regression...
Training Linear SVM...
Training Decision Tree...
Training Random Forest...
Training Gaussian Naive Bayes...

--- Fold 2 ---
Training Logistic Regression...
Training Linear SVM...
Training Decision Tree...
Training Random Forest...
Training Gaussian Naive Bayes...

--- Fold 3 ---
Training Logistic Regression...
Training Linear SVM...
Training Decision Tree...
Training Random Forest...
Training Gaussian Naive Bayes...

--- Fold 4 ---
Training Logistic Regression...
Training Linear SVM...
Training Decision Tree...
Training Random Forest...
Training Gaussian Naive Bayes...

--- Fold 5 ---
Training Logistic Regression...
Training Linear SVM...
Training Decision Tree...
Training Random Forest...
Training Gaussian Naive Bayes...

✅ Saved to kfold_non_xgb_model_mahakumbh_accuracies.csv
                  Model  Average Accuracy
0   Logistic Regression          0.995964
1            Linear SVM          0.996274
2         Decision Tree          0.99