# 🧠 Superapp Sentiment Model Trainer
This notebook trains Naive Bayes and SVM sentiment models on the Google Play Store user review dataset, then fine-tunes a BERT classifier on the same train/test split.

In [None]:

import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder

# Upload the dataset manually if running in Colab
from google.colab import files
uploaded = files.upload()

# `uploaded` is empty when the upload dialog is dismissed; fail with a clear
# message instead of the bare StopIteration that next(iter(...)) would raise.
if not uploaded:
    raise ValueError("No file was uploaded. Re-run this cell and select the reviews CSV.")
df = pd.read_csv(next(iter(uploaded)))

# Both columns are needed below: the review text is the feature, the
# sentiment string is the target.
required_columns = ["Translated_Review", "Sentiment"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Uploaded CSV is missing required columns: {missing_columns}")

# Drop NA values in relevant columns (new frame rather than in-place mutation;
# .copy() so the column assignment below writes to an independent frame)
df = df.dropna(subset=required_columns).copy()

# Encode labels: sentiment strings -> integer class ids
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["Sentiment"])
# The saved models predict integer ids, so show how they map back to names.
print(f"Label encoding: {dict(enumerate(label_encoder.classes_.tolist()))}")

# Split dataset (80% train / 20% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    df["Translated_Review"], df["label"], test_size=0.2, random_state=42
)

# Vectorize with TF-IDF. The vocabulary is fitted on the training split only;
# the test split is transformed with that same vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_vec, y_train)

# Train SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_vec, y_train)

# Evaluate on the held-out split so the saved models come with a quality check
# (previously the test split was created but never used in this cell).
print(f"Naive Bayes test accuracy: {nb_model.score(X_test_vec, y_test):.3f}")
print(f"SVM test accuracy: {svm_model.score(X_test_vec, y_test):.3f}")

# Save models and vectorizer
joblib.dump(nb_model, "naive_bayes_model.pkl")
joblib.dump(svm_model, "svm_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
print("✅ Models saved successfully.")


## ⬇️ Download Model Files

In [None]:

from google.colab import files

# Trigger a browser download for each pickled artifact, one after another.
for artifact_name in (
    "naive_bayes_model.pkl",
    "svm_model.pkl",
    "tfidf_vectorizer.pkl",
):
    files.download(artifact_name)


## 🤖 Fine-tune BERT for Sentiment Classification

In [None]:

!pip install transformers

from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Tokenize input
def encode_data(texts, tokenizer, max_len=128):
    """Tokenize a collection of texts with padding and truncation.

    Args:
        texts: The texts to encode. May be any object exposing ``.tolist()``
            (the only form the original implementation accepted), any other
            iterable of strings, or a single string (treated as one text).
        tokenizer: Callable invoked as
            ``tokenizer(list_of_texts, padding=True, truncation=True,
            max_length=max_len, return_tensors='pt')``.
        max_len: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        Whatever ``tokenizer`` returns for the given texts.
    """
    if isinstance(texts, str):
        # A bare string is one document, not an iterable of characters.
        text_list = [texts]
    elif hasattr(texts, "tolist"):
        text_list = texts.tolist()
    else:
        text_list = list(texts)

    return tokenizer(
        text_list, padding=True, truncation=True, max_length=max_len, return_tensors='pt'
    )

# Tokenize both splits (train first, then test) with the shared tokenizer.
train_encodings, test_encodings = (
    encode_data(split, tokenizer) for split in (X_train, X_test)
)

# Integer class ids as tensors, in the same train/test order.
train_labels, test_labels = (
    torch.tensor(split.tolist()) for split in (y_train, y_test)
)

# Each training item is an (input_ids, attention_mask, label) triple;
# batches of 16 are drawn in shuffled order.
train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    train_labels,
)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)

# Train the model
# Prefer the GPU when one is visible to torch; otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
optimizer = AdamW(model.parameters(), lr=1e-5)
model.to(device)

model.train()
for epoch in range(2):  # Use 2 epochs for Colab runtime
    for batch in train_loader:
        # Each batch is (input_ids, attention_mask, labels); move all three to the device.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        # Passing labels makes the model return the loss alongside its outputs.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

# Save BERT model
# Model first, then tokenizer, both written into the same output directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("bert-finetuned-superapp")
print("✅ BERT model saved to 'bert-finetuned-superapp'")


### 📦 Download BERT Model Folder

In [None]:

import shutil

# Zip the saved model directory into "bert-finetuned-superapp.zip",
# then hand that archive to the browser for download.
shutil.make_archive(
    base_name="bert-finetuned-superapp",
    format='zip',
    root_dir="bert-finetuned-superapp",
)
files.download("bert-finetuned-superapp.zip")
