# Import Libraries

Installs necessary libraries like praw (Reddit API), google-api-python-client (YouTube API), pytrends (Google Trends API), python-dotenv (for environment variables).
Imports required Python libraries for data processing (pandas, re, nltk, etc.).
Installs and imports machine learning libraries (transformers, torch, datasets, scikit-learn).
Uses DistilBERT from Hugging Face for sentiment analysis.

In [1]:
!pip install praw
!pip install google-api-python-client
!pip install pytrends
!pip install python-dotenv
import praw
import pandas as pd
from googleapiclient.discovery import build
from pytrends.request import TrendReq
import re
import nltk
import os
from nltk.corpus import stopwords
!pip install pandas nltk transformers torch datasets scikit-learn seaborn
!pip install transformers[torch] accelerate -U
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer
import transformers
import torch
import accelerate
from nltk.corpus import stopwords
import numpy as np
import torch
from torch.utils.data import Dataset

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable




##### Checks and prints the versions of transformers, torch, and accelerate to ensure compatibility

In [2]:
print("Transformers Version:", transformers.__version__)
print("Torch Version:", torch.__version__)
print("Accelerate Version:", accelerate.__version__)

Transformers Version: 4.48.3
Torch Version: 2.6.0+cpu
Accelerate Version: 1.3.0


##### Loads API credentials from the .env file.
##### Prints environment variables to verify that the credentials are loaded correctly.

In [None]:
from dotenv import load_dotenv
load_dotenv()
print(os.getenv("REDDIT_CLIENT_ID"))
print(os.getenv("REDDIT_CLIENT_ID"))
print(os.getenv("REDDIT_CLIENT_SECRET"))
print(os.getenv("REDDIT_USER_AGENT"))
print(os.getenv("YOUTUBE_API_KEY"))

# Reddit
##### Initializes the Reddit API client in read-only mode.
##### Defines get_reddit_posts() function to fetch top posts from specific subreddits related to smartwatches.
##### Stores retrieved data in a Pandas DataFrame with columns title, score, and url

In [4]:
# Reddit API Credentials
reddit = praw.Reddit(
    client_id=os.getenv("REDDIT_CLIENT_ID"),
    client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
    user_agent=os.getenv("REDDIT_USER_AGENT")
)

print(reddit.read_only) 

def get_reddit_posts(subreddits, query="smartwatch", limit=1500):
    """Fetch top posts from multiple subreddits"""
    posts = []
    
    for subreddit in subreddits:
        for post in reddit.subreddit(subreddit).search(query, sort="hot", limit=limit):
            posts.append([post.title, post.score, post.url])
    
    df = pd.DataFrame(posts, columns=["title", "score", "url"])
    return df

# Fetch Reddit smartwatch discussions
subreddits = ["smartwatch", "wearables", "watches"]
reddit_data = get_reddit_posts(subreddits)
print(reddit_data.head())

True
                                               title  score  \
0                              Suggest me smartwatch      2   
1  What is the best Smartwatch for iPhone widely ...      2   
2                        Smartwatch for working out?      2   
3            Is there a market for old Smartwatches?      4   
4             Best smartwatch for $150 - $200 budget      2   

                                                 url  
0  https://www.reddit.com/r/smartwatch/comments/1...  
1  https://www.reddit.com/r/smartwatch/comments/1...  
2  https://www.reddit.com/r/smartwatch/comments/1...  
3             https://www.reddit.com/gallery/1ijcxxm  
4  https://www.reddit.com/r/smartwatch/comments/1...  


# Fetch YouTube Trending Videos

##### Uses YouTube Data API to fetch trending videos related to smartwatches.
##### Stores video_id, title, and channel name in a Pandas DataFrame.


In [5]:
API_KEY = os.getenv("YOUTUBE_API_KEY")

youtube = build("youtube", "v3", developerKey=API_KEY)

def get_youtube_trending(query="smartwatch", max_results=1000):
    """Fetch trending YouTube videos related to smartwatches"""
    request = youtube.search().list(
        q=query, part="snippet", maxResults=max_results, type="video"
    )
    response = request.execute()

    videos = []
    for item in response["items"]:
        video_id = item["id"]["videoId"]
        title = item["snippet"]["title"]
        channel = item["snippet"]["channelTitle"]
        videos.append([video_id, title, channel])

    return pd.DataFrame(videos, columns=["video_id", "title", "channel"])

# Fetch YouTube trending smartwatch videos
youtube_data = get_youtube_trending()
print(youtube_data.head())

      video_id                                              title  \
0  7ieKOKwUYoo  Smartwatches von Apple, Samsung, Huawei &amp; ...   
1  oFZ2nSozoTs  😍Sim + WiFi 4G LTE Android Watch Tk4 Ultra Fir...   
2  BxoSN7FMe-4  Die Besten Smartwatches für Sport und Fitness!...   
3  F2DDlM7LIb0  How to add apple logo in smart watch series 7 ...   
4  DUFBsdGmZPY  Apple Watch Ultra waterproof test #shorts #app...   

             channel  
0     ARD Marktcheck  
1  Perfect Gadget BD  
2      Sport Technik  
3    Trend Yourstyle  
4             iWatch  


# Clean the Data

##### Downloads NLTK stopwords.
##### Defines clean_text() function to remove URLs, special characters, and stopwords.
##### Cleans title text from Reddit and YouTube datasets.

In [6]:
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Remove URLs, special characters, and stopwords from text"""
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-zA-Z ]", "", text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    text = " ".join(word for word in text.split() if word not in stop_words)  # Remove stopwords
    return text

# Apply cleaning function
reddit_data["title"] = reddit_data["title"].apply(clean_text)
youtube_data["title"] = youtube_data["title"].apply(clean_text)

[nltk_data] Downloading package stopwords to C:\Users\Shahzad
[nltk_data]     Iqbal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Merging csv

##### Merges Reddit and YouTube data.
##### Saves the final dataset as a CSV file in the user’s Downloads folder.

In [7]:
# Merge data from all sources
final_data = pd.concat([
    reddit_data.assign(source="Reddit"),
    youtube_data.assign(source="YouTube")
], ignore_index=True)

# Save to CSV
username = os.getlogin()

# Define the Downloads folder path
downloads_folder = f"C:\\Users\\{username}\\Downloads"

# Define full file path
file_path = os.path.join(downloads_folder, "trending_smartwatch_data.csv")

# Save CSV file to Downloads folder
final_data.to_csv(file_path, index=False)

print(f"All data saved to: {file_path}")
print("All data saved to trending_smartwatch_data.csv!")

All data saved to: C:\Users\Shahzad Iqbal\Downloads\trending_smartwatch_data.csv
All data saved to trending_smartwatch_data.csv!


# Preprocess & Assign Sentiment Labels
##### Remove URLs, special characters, and stopwords.
##### Assign sentiment labels manually based on score:
##### Positive (score ≥ 3).
##### Neutral (score = 2).
##### Negative (score ≤ 1).

In [8]:


# Download NLTK stopwords
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Clean text function
def clean_text(text):
    """Removes URLs, special characters, and stopwords."""
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-zA-Z ]", "", text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    text = " ".join(word for word in text.split() if word not in stop_words)  # Remove stopwords
    return text

df = final_data
# Apply text cleaning
df["cleaned_text"] = df["title"].apply(lambda x: clean_text(str(x)))

# Assign sentiment labels based on score
df["sentiment"] = np.where(df["score"] >= 3, "Positive",
                           np.where(df["score"] == 2, "Neutral", "Negative"))

# Drop missing values
df = df.dropna(subset=["cleaned_text", "sentiment"])

# Show cleaned dataset
df.head()

[nltk_data] Downloading package stopwords to C:\Users\Shahzad
[nltk_data]     Iqbal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,title,score,url,source,video_id,channel,cleaned_text,sentiment
0,suggest smartwatch,2.0,https://www.reddit.com/r/smartwatch/comments/1...,Reddit,,,suggest smartwatch,Neutral
1,best smartwatch iphone widely loved currently,2.0,https://www.reddit.com/r/smartwatch/comments/1...,Reddit,,,best smartwatch iphone widely loved currently,Neutral
2,smartwatch working,2.0,https://www.reddit.com/r/smartwatch/comments/1...,Reddit,,,smartwatch working,Neutral
3,market old smartwatches,4.0,https://www.reddit.com/gallery/1ijcxxm,Reddit,,,market old smartwatches,Positive
4,best smartwatch budget,2.0,https://www.reddit.com/r/smartwatch/comments/1...,Reddit,,,best smartwatch budget,Neutral


# Encode Sentiment Labels & Split Data

##### Encodes sentiment labels into numerical values.
##### Splits data into training and testing sets.

In [9]:
# Encode sentiment labels
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["sentiment"])  # 0=Negative, 1=Neutral, 2=Positive

# Train-Test Split
train_texts, test_texts, train_labels, test_labels = train_test_split(df["cleaned_text"], df["label"], test_size=0.2, random_state=42)

print(f"Training size: {len(train_texts)}, Testing size: {len(test_texts)}")

Training size: 511, Testing size: 128


# Load Pre-Trained DistilBERT Tokenizer

In [10]:
# Load Pre-trained Tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize Data
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=512)

# Convert Data into PyTorch Dataset

In [11]:
class SentimentDataset(Dataset):
    """Custom Dataset for Tokenized Sentiment Data"""
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Create PyTorch Dataset
train_dataset = SentimentDataset(train_encodings, train_labels.tolist())
test_dataset = SentimentDataset(test_encodings, test_labels.tolist())

# Train a Multimodal LLM (DistilBERT)

In [12]:
# Load Pretrained Model for Sentiment Classification
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

# Trainer Object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train the Model
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.818787
2,No log,0.741625
3,No log,0.802121


TrainOutput(global_step=192, training_loss=0.6638236840566, metrics={'train_runtime': 489.7124, 'train_samples_per_second': 3.13, 'train_steps_per_second': 0.392, 'total_flos': 17848489187970.0, 'train_loss': 0.6638236840566, 'epoch': 3.0})

# Evaluate & Save the Model

In [13]:
# Evaluate Model
trainer.evaluate()

# Save Model
model.save_pretrained("sentiment_model")
tokenizer.save_pretrained("sentiment_model")

print("Model trained and saved successfully!")

Model trained and saved successfully!


# Load the Trained Model & Tokenizer

In [14]:
# Load the trained model and tokenizer
model = DistilBertForSequenceClassification.from_pretrained("sentiment_model")
tokenizer = DistilBertTokenizer.from_pretrained("sentiment_model")

# Set model to evaluation mode
model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


# Define a Function for Sentiment Prediction

In [15]:
def predict_sentiment(text):
    """Predict sentiment of a given text using the trained model."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()
    
    sentiment_labels = {0: "Negative", 1: "Neutral", 2: "Positive"}
    return sentiment_labels[predicted_class]

# Test with Sample Sentences

In [16]:
sample_texts = [
    "This smartwatch is amazing! The battery lasts forever.",
    "The watch is okay, but I expected better features."
]

for text in sample_texts:
    print(f"Text: {text} → Sentiment: {predict_sentiment(text)}")


Text: This smartwatch is amazing! The battery lasts forever. → Sentiment: Neutral
Text: The watch is okay, but I expected better features. → Sentiment: Positive
