In [10]:
!pip install -q transformers datasets torch sentence-transformers nltk spacy faker prophet streamlit
import nltk
nltk.download('stopwords')
!python -m spacy download en_core_web_sm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [11]:
from faker import Faker
import pandas as pd, random
fake = Faker()
rows = []
for i in range(1200):
    text = fake.sentence(nb_words=random.randint(6,30))
    r = random.random()
    if r < 0.12:
        text += " Delivery was late and support didn't respond."
    elif r < 0.22:
        text += " Product quality was poor."
    elif r < 0.33:
        text += " Very satisfied with the purchase."
    rows.append({'id': i, 'feedback': text})
df = pd.DataFrame(rows)
df.to_csv('/content/feedback_raw.csv', index=False)
print("Saved /content/feedback_raw.csv  — rows:", len(df))
df.head(5)

Saved /content/feedback_raw.csv  — rows: 1200


Unnamed: 0,id,feedback
0,0,Whose senior total firm capital bed throw dark...
1,1,Oil five wife something view beautiful actuall...
2,2,Coach fish pass wear enough section letter inf...
3,3,Family why trial maybe almost foot.
4,4,Court per PM Mr go beautiful audience head blu...


In [12]:
import re, pandas as pd
import spacy
from nltk.corpus import stopwords
nlp = spacy.load("en_core_web_sm")
stop = set(stopwords.words('english'))

df = pd.read_csv('/content/feedback_raw.csv')

def preprocess(text):
    text = str(text).strip()
    text = re.sub(r'[^A-Za-z0-9 ,.?!]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    doc = nlp(text.lower())
    tokens = [t.lemma_ for t in doc if not t.is_punct and not t.is_space and t.lemma_ not in stop]
    return " ".join(tokens)

df['clean'] = df['feedback'].apply(preprocess)
df = df.drop_duplicates(subset=['clean']).reset_index(drop=True)
df.to_csv('/content/feedback_cleaned.csv', index=False)
print("Saved /content/feedback_cleaned.csv — rows:", len(df))
df.head(5)

Saved /content/feedback_cleaned.csv — rows: 1200


Unnamed: 0,id,feedback,clean
0,0,Whose senior total firm capital bed throw dark...,whose senior total firm capital bed throw dark...
1,1,Oil five wife something view beautiful actuall...,oil five wife something view beautiful actuall...
2,2,Coach fish pass wear enough section letter inf...,coach fish pass wear enough section letter inf...
3,3,Family why trial maybe almost foot.,family trial maybe almost foot
4,4,Court per PM Mr go beautiful audience head blu...,court per pm mr go beautiful audience head blu...


In [13]:
import numpy as np, pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

df = pd.read_csv('/content/feedback_cleaned.csv')
np.random.seed(42)
labels = np.random.choice([0,1,2], size=len(df), p=[0.3,0.3,0.4])
ds = Dataset.from_pandas(pd.DataFrame({'text': df['clean'], 'label': labels}))
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tok(batch):
    return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=128)

ds = ds.map(tok, batched=True)
ds = ds.train_test_split(test_size=0.15, seed=42)
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

training_args = TrainingArguments(output_dir='/content/distilbert_demo', num_train_epochs=1, per_device_train_batch_size=16, logging_steps=20, save_strategy="no")
trainer = Trainer(model=model, args=training_args, train_dataset=ds['train'], eval_dataset=ds['test'])
trainer.train()
model.save_pretrained('/content/distilbert_demo')
tokenizer.save_pretrained('/content/distilbert_demo')
print("Model saved to /content/distilbert_demo")

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
20,1.1052
40,1.0839
60,1.0953


Model saved to /content/distilbert_demo


In [14]:
import pickle, torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
class SentimentWrapper:
    def __init__(self, model_dir):
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    def predict(self, texts):
        inputs = self.tokenizer(texts, truncation=True, padding=True, return_tensors='pt')
        with torch.no_grad():
            logits = self.model(**inputs).logits
        preds = logits.argmax(dim=-1).tolist()
        return preds

w = SentimentWrapper('/content/distilbert_demo')
pickle.dump(w, open('/content/sentiment_model.pkl','wb'))
print("Saved /content/sentiment_model.pkl")

Saved /content/sentiment_model.pkl


In [15]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

def summarize(text, max_len=40):
    input_text = "summarize: " + text
    input_ids = tokenizer.encode(input_text, return_tensors='pt', truncation=True, max_length=512)
    out = model.generate(input_ids, max_length=max_len, num_beams=3, early_stopping=True)
    return tokenizer.decode(out[0], skip_special_tokens=True)

print(summarize("The product arrived late and packaging was damaged. Customer support took long to reply and issue not resolved. I am unhappy."))

the product arrived late and packaging was damaged. customer support took long to reply and issue not resolved.


In [16]:
import pandas as pd
from collections import Counter
df = pd.read_csv('/content/feedback_cleaned.csv')
all_words = " ".join(df['clean']).split()
counter = Counter(all_words)
top10 = counter.most_common(10)
print("Top 10 tokens:", top10)

import numpy as np
dates = pd.date_range(end=pd.Timestamp.today(), periods=90)
np.random.seed(0)
scores = np.clip((np.sin(np.linspace(0,6,90)) * 5 + 70 + np.random.randn(90)*3), 40, 100)
df_ts = pd.DataFrame({'ds': dates, 'y': scores})
df_ts.to_csv('/content/simulated_scores.csv', index=False)
print("Saved /content/simulated_scores.csv — sample:")
df_ts.tail()

Top 10 tokens: [('late', 170), ('respond', 170), ('support', 166), ('satisfied', 151), ('purchase', 151), ('delivery', 146), ('quality', 134), ('product', 132), ('poor', 127), ('think', 49)]
Saved /content/simulated_scores.csv — sample:


Unnamed: 0,ds,y
85,2025-10-26 10:33:54.890631,73.062101
86,2025-10-27 10:33:54.890631,71.203384
87,2025-10-28 10:33:54.890631,67.430482
88,2025-10-29 10:33:54.890631,65.07043
89,2025-10-30 10:33:54.890631,71.766278
