In [1]:
# Colab cell 1 - installing deps
!pip install -q transformers datasets accelerate scikit-learn joblib flask_ngrok

import torch
print("Torch:", torch.__version__, "CUDA available:", torch.cuda.is_available())


Torch: 2.9.0+cu126 CUDA available: False


In [2]:
# Mount Google Drive (optional, but recommended so saved models outlive the session)
import os
from google.colab import drive

drive.mount('/content/drive')

# Folder on Drive that receives the saved model files; created if it does not exist yet
MODEL_BASE = "/content/drive/MyDrive/smartwatch_sentiment/models"
os.makedirs(MODEL_BASE, exist_ok=True)
print("Models will be saved to:", MODEL_BASE)


Mounted at /content/drive
Models will be saved to: /content/drive/MyDrive/smartwatch_sentiment/models


In [1]:
from google.colab import files
import pandas as pd
# Upload the Excel file through the Colab file picker
uploaded = files.upload()

# Stop with a readable message instead of an IndexError on the lookup below
# (presumably the mapping is empty when the upload dialog is cancelled - TODO confirm)
if not uploaded:
    raise RuntimeError("No file was uploaded - re-run this cell and choose an Excel file.")

# Name of the first uploaded file (e.g., data.xlsx)
file_name = next(iter(uploaded))
print("Uploaded file:", file_name)

# Read the Excel file into a DataFrame
df = pd.read_excel(file_name)

# Preview the first rows and the overall size
print(df.head())
print(df.shape)


ModuleNotFoundError: No module named 'google'

In [8]:
# Colab cell 4 - quick prep
import pandas as pd

SHORT_COL = 'short_comment'   # change if different
LONG_COL  = 'long_comment'    # change if different
LABEL_COL = 'sentiment'       # change if different

# Show what was actually loaded so a column-name mismatch is easy to spot
print("Columns:", df.columns.tolist())


def _text_column(frame, column):
    """Return `column` of `frame` as strings, with missing values replaced by "".

    If the column is absent, an all-empty Series on the same index is returned so
    that the other text column can still be used on its own.
    """
    if column in frame.columns:
        # astype(str) keeps the " " concatenation below working for non-text cells
        return frame[column].fillna("").astype(str)
    return pd.Series("", index=frame.index, dtype=object)


# df.get(col, "") returns a plain str when the column is missing, and str has no
# .fillna(); the fallback therefore has to be a Series (see _text_column).
if SHORT_COL not in df.columns and LONG_COL not in df.columns:
    # Also reached when this cell is re-run: `df` has already been reduced to
    # combined_text + label, so the raw comment columns are gone.
    raise ValueError(
        f"Neither {SHORT_COL!r} nor {LONG_COL!r} is present in df "
        f"(columns: {df.columns.tolist()}). Reload the data or fix the column names."
    )

# Create combined text: "<short> <long>", keeping only the text and the label
combined_text = _text_column(df, SHORT_COL) + " " + _text_column(df, LONG_COL)
df = (
    df.assign(combined_text=combined_text)[['combined_text', LABEL_COL]]
    .dropna()               # drops rows without a label (the text is never NaN here)
    .reset_index(drop=True)
)
df[LABEL_COL] = df[LABEL_COL].astype(str).str.lower().str.strip()  # normalize labels

df.head()


Columns: ['short_comment', 'long_comment', 'sentiment']


Unnamed: 0,combined_text,sentiment
0,Good 👍 Good,positive
1,Good Product Nice Product...But Cost is very H...,positive
2,Bluetooth connectivity issues didn't connect t...,negative
3,Report Translated from Indonesian by Amazon,neutral
4,Watch Switch defective Item defective and not ...,negative


In [9]:
# Colab cell 5 - classical training
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib, os

# Inputs are the merged comment text; targets are the normalised sentiment labels
X = df['combined_text'].values
y = df[LABEL_COL].values

# 80/20 split, stratified on the label, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Uni- and bi-gram TF-IDF features; the vocabulary is fitted on the training split only
vectorizer = TfidfVectorizer(
    max_features=20000,
    stop_words='english',
    ngram_range=(1, 2),
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Logistic regression with balanced class weights
clf = LogisticRegression(max_iter=1000, class_weight='balanced')
clf.fit(X_train_tfidf, y_train)

# Score the held-out split
y_pred = clf.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_pred)
print("Classical accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))

# Persist the vectorizer and the classifier together as one dict
MODEL_BASE = os.environ.get("MODEL_BASE", "/content/drive/MyDrive/smartwatch_sentiment/models")
os.makedirs(MODEL_BASE, exist_ok=True)
CLASSICAL_OUT = os.path.join(MODEL_BASE, "classical_model.joblib")
model_bundle = {"vectorizer": vectorizer, "clf": clf}
joblib.dump(model_bundle, CLASSICAL_OUT)
print("Saved classical model to:", CLASSICAL_OUT)


Classical accuracy: 0.8666666666666667
              precision    recall  f1-score   support

    negative       1.00      0.69      0.82        13
     neutral       1.00      1.00      1.00         2
    positive       0.79      1.00      0.88        15

    accuracy                           0.87        30
   macro avg       0.93      0.90      0.90        30
weighted avg       0.89      0.87      0.86        30

Saved classical model to: /content/drive/MyDrive/smartwatch_sentiment/models/classical_model.joblib


In [10]:
# Colab cell 6 - build a Hugging Face sentiment pipeline (SST-2 model) to run on the same test set
from transformers import pipeline

bert_pipe = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# Bring Hugging Face labels into the lowercase form used for the dataset labels
def hf_label_to_standard(hf_label):
    """Return `hf_label` converted to lowercase (e.g. 'POSITIVE' -> 'positive')."""
    standard_label = hf_label.lower()
    return standard_label

# Evaluate on a small subset (do not evaluate too large in Colab free)
sample_texts = X_test[:200]  # adjust size if you want

# Let the tokenizer truncate to the model's maximum sequence length instead of
# slicing to 512 characters: a character slice is not a token limit (it throws away
# most of a long review, and 512 single-character tokens plus the special tokens can
# still overflow the model). The whole sample is scored in one batched call.
hf_inputs = [str(text) for text in sample_texts]
hf_outputs = bert_pipe(hf_inputs, truncation=True) if hf_inputs else []
hf_preds = [hf_label_to_standard(output['label']) for output in hf_outputs]

# If your dataset labels are 'positive'/'negative' use same naming, else adapt
# NOTE(review): the recorded predictions contain only 'positive'/'negative' while the
# dataset also has 'neutral' rows - account for that before comparing with y_test.
print("Sample HF preds:", hf_preds[:10])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


Sample HF preds: ['positive', 'negative', 'positive', 'negative', 'positive', 'negative', 'negative', 'positive', 'positive', 'positive']


In [11]:
# Colab cell 7 - helpful: list saved files
!ls -la /content/drive/MyDrive/smartwatch_sentiment/models


total 140
-rw------- 1 root root 142841 Nov 30 08:07 classical_model.joblib


In [14]:
import joblib

# Write a second copy of the fitted vectorizer + classifier, this time to a relative
# path (the notebook's working directory)
tfidf_bundle = {"vectorizer": vectorizer, "clf": clf}
joblib.dump(tfidf_bundle, "tfidf_model.pkl")
print("Saved tfidf_model.pkl")


Saved tfidf_model.pkl


In [17]:
from google.colab import files
import joblib
import zipfile
import os

# -----------------------------------------------------------
# 1) DOWNLOAD classical model (TF-IDF + Logistic)
# -----------------------------------------------------------
# tfidf_model.pkl must already exist; if it does not, create it first with:
# joblib.dump({"vectorizer": vectorizer, "clf": clf}, "tfidf_model.pkl")

classical_artifact = "tfidf_model.pkl"
files.download(classical_artifact)
print("Downloaded tfidf_model.pkl")


# ============================================================
# 1. DOWNLOAD AND SAVE BERT MODEL LOCALLY
# ============================================================
# os, zipfile and files are already imported at the top of this cell
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"
SAVE_DIR = "bert_model"

# Make sure the target folder exists before anything is written into it
os.makedirs(SAVE_DIR, exist_ok=True)

# Load the tokenizer and the sequence-classification model for MODEL_NAME
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Write both into SAVE_DIR (tokenizer first, then the model)
for artifact in (tokenizer, model):
    artifact.save_pretrained(SAVE_DIR)

print("BERT model saved in folder:", SAVE_DIR)


# ============================================================
# 2. CHECK CONTENTS (IMPORTANT)
# ============================================================
# Show which files ended up in the folder before it is zipped
print("Files inside bert_model/:")
saved_files = os.listdir(SAVE_DIR)
print(saved_files)


# ============================================================
# 3. ZIP THE MODEL FOLDER CORRECTLY
# ============================================================
ZIP_NAME = "bert_model.zip"

# Entries are stored with paths relative to SAVE_DIR, so the archive holds the model
# files directly at its root rather than inside a "bert_model/" folder.
with zipfile.ZipFile(ZIP_NAME, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
    for folder, _subfolders, filenames in os.walk(SAVE_DIR):
        for filename in filenames:
            source_path = os.path.join(folder, filename)
            archive.write(source_path, os.path.relpath(source_path, SAVE_DIR))

print("Zipped as:", ZIP_NAME)


# ============================================================
# 4. DOWNLOAD ZIP TO YOUR LAPTOP
# ============================================================
files.download(ZIP_NAME)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded tfidf_model.pkl
BERT model saved in folder: bert_model
Files inside bert_model/:
['tokenizer.json', 'model.safetensors', 'vocab.txt', 'special_tokens_map.json', 'tokenizer_config.json', 'config.json']
Zipped as: bert_model.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>