In [3]:
import pandas as pd
import re
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [4]:
pd.set_option("display.width", 200)
pd.set_option("display.max_columns", None)
import warnings
warnings.filterwarnings("ignore")

In [19]:
# Downloading NLTK Resources

nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\samra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\samra\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\samra\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [5]:
file_path = 'dataset.csv'
df = pd.read_csv(file_path, header=0)
print(df.head())
print("Shape : ",df.shape)

       textID                                               text sentiment
0  cb774db0d1                I`d have responded, if I were going   neutral
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!  negative
2  088c60f138                          my boss is bullying me...  negative
3  9642c003ef                     what interview! leave me alone  negative
4  358bd9e861   Sons of ****, why couldn`t they put them on t...  negative
Shape :  (31015, 3)


### Preprocessing

In [6]:
if "textID" in df.columns:
    df = df.drop(columns=["textID"])
df.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [22]:
print(df.isnull().sum())

text         1
sentiment    0
dtype: int64


In [23]:
df = df.dropna(subset=["text"])
print(df.isnull().sum())

text         0
sentiment    0
dtype: int64


In [24]:
print(df.sentiment.unique())

['neutral' 'negative' 'positive']


In [25]:
print(df['sentiment'].value_counts())

sentiment
neutral     12547
positive     9685
negative     8782
Name: count, dtype: int64


In [26]:
duplicates = df.duplicated(subset=["text", "sentiment"], keep=False)
dup_rows = df[duplicates].sort_values(by=["text", "sentiment"])
dup_count = dup_rows.shape[0]
print(f"Total Repeated Pairs : {dup_count}")

Total Repeated Pairs : 0


In [27]:
df.shape

(31014, 2)

In [28]:
df = df.drop_duplicates(subset=["text", "sentiment"], keep="first").reset_index(drop=True)
print(f"After dropping duplicates: {df.shape}")

After dropping duplicates: (31014, 2)


### Cleaning Data

In [29]:
# Lowercasing

df['text'] = df['text'].str.lower()
print(df.head())

                                                text sentiment
0                i`d have responded, if i were going   neutral
1      sooo sad i will miss you here in san diego!!!  negative
2                          my boss is bullying me...  negative
3                     what interview! leave me alone  negative
4   sons of ****, why couldn`t they put them on t...  negative


In [30]:
# Removing Noise

def clean_text(t):
    t = re.sub(r"http\S+|www\S+|https\S+", "", t)
    t = re.sub(r"@\w+", "", t)
    t = re.sub(r"#\w+", "", t)
    t = re.sub(r"[^a-zA-Z\s]", "", t)
    return t

df['text'] = df['text'].apply(clean_text)
print(df.head())

                                                text sentiment
0                  id have responded if i were going   neutral
1         sooo sad i will miss you here in san diego  negative
2                             my boss is bullying me  negative
3                      what interview leave me alone  negative
4   sons of  why couldnt they put them on the rel...  negative


In [31]:
# Emojis to Words

def convert_emojis(t):
    return emoji.demojize(t, delimiters=(" ", " "))

df["text"] = df["text"].apply(convert_emojis)
print(df.head())

                                                text sentiment
0                  id have responded if i were going   neutral
1         sooo sad i will miss you here in san diego  negative
2                             my boss is bullying me  negative
3                      what interview leave me alone  negative
4   sons of  why couldnt they put them on the rel...  negative


In [32]:
# Tokenization

df["tokens"] = df["text"].apply(lambda x: x.split())
print(df.head())

                                                text sentiment                                             tokens
0                  id have responded if i were going   neutral          [id, have, responded, if, i, were, going]
1         sooo sad i will miss you here in san diego  negative  [sooo, sad, i, will, miss, you, here, in, san,...
2                             my boss is bullying me  negative                       [my, boss, is, bullying, me]
3                      what interview leave me alone  negative                [what, interview, leave, me, alone]
4   sons of  why couldnt they put them on the rel...  negative  [sons, of, why, couldnt, they, put, them, on, ...


In [33]:
# Stopword Removal

stop_words = set(stopwords.words("english"))
df["tokens"] = df["tokens"].apply(lambda words: [w for w in words if w not in stop_words])
print(df.head())

                                                text sentiment                                           tokens
0                  id have responded if i were going   neutral                           [id, responded, going]
1         sooo sad i will miss you here in san diego  negative                    [sooo, sad, miss, san, diego]
2                             my boss is bullying me  negative                                 [boss, bullying]
3                      what interview leave me alone  negative                        [interview, leave, alone]
4   sons of  why couldnt they put them on the rel...  negative  [sons, couldnt, put, releases, already, bought]


In [34]:
# Lemmatization

lemmatizer = WordNetLemmatizer()
df["tokens"] = df["tokens"].apply(lambda words: [lemmatizer.lemmatize(w) for w in words])
print(df.head())

                                                text sentiment                                         tokens
0                  id have responded if i were going   neutral                         [id, responded, going]
1         sooo sad i will miss you here in san diego  negative                  [sooo, sad, miss, san, diego]
2                             my boss is bullying me  negative                                [bos, bullying]
3                      what interview leave me alone  negative                      [interview, leave, alone]
4   sons of  why couldnt they put them on the rel...  negative  [son, couldnt, put, release, already, bought]


In [35]:
# Joining Tokens into Text for Vectorization

df["clean_text"] = df["tokens"].apply(lambda x: " ".join(x))
print(df.head())

                                                text sentiment                                         tokens                              clean_text
0                  id have responded if i were going   neutral                         [id, responded, going]                      id responded going
1         sooo sad i will miss you here in san diego  negative                  [sooo, sad, miss, san, diego]                 sooo sad miss san diego
2                             my boss is bullying me  negative                                [bos, bullying]                            bos bullying
3                      what interview leave me alone  negative                      [interview, leave, alone]                   interview leave alone
4   sons of  why couldnt they put them on the rel...  negative  [son, couldnt, put, release, already, bought]  son couldnt put release already bought


In [36]:
vectorizer = TfidfVectorizer(max_features=10000,ngram_range=(1,3),min_df=2,max_df=0.9)

x = vectorizer.fit_transform(df["clean_text"])
print("Final shape:", x.shape)


Final shape: (31014, 10000)


### Train Test Split and Training Model

In [37]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [38]:
le = LabelEncoder()
y = le.fit_transform(df["sentiment"])

print("Classes:", le.classes_)
print("First 10 labels:", y[:10])

Classes: ['negative' 'neutral' 'positive']
First 10 labels: [1 0 0 0 0 1 2 1 1 2]


In [39]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

print("Train:", x_train.shape, y_train.shape)
print("Test:", x_test.shape, y_test.shape)

Train: (24811, 10000) (24811,)
Test: (6203, 10000) (6203,)


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, C=2, solver="liblinear", class_weight="balanced", n_jobs=-1),
    "Naive Bayes": MultinomialNB(alpha=0.5),
    "Linear SVM": LinearSVC(C=2, dual=False, max_iter=3000, class_weight="balanced"),
    "Decision Tree": DecisionTreeClassifier(max_depth=10),
    "KNN": KNeighborsClassifier(n_neighbors=3)
}
results = []
for name, model in models.items():
    print(f"Evaluating: {name}")
    acc_scores = cross_val_score(model, x_train, y_train, cv=3, scoring="accuracy", n_jobs=-1)
    precisions = cross_val_score(model, x_train, y_train, cv=3, scoring="precision_macro", n_jobs=-1)
    recalls = cross_val_score(model, x_train, y_train, cv=3, scoring="recall_macro", n_jobs=-1)

    results.append({
        "Model": name,
        "Mean Accuracy": acc_scores.mean(),
        "Acc Std Dev": acc_scores.std(),
        "Mean Precision": precisions.mean(),
        "Mean Recall": recalls.mean()
    })

df_results = pd.DataFrame(results).sort_values(by="Mean Accuracy", ascending=False)
print(df_results)

‚è≥ Evaluating: Logistic Regression
‚è≥ Evaluating: Naive Bayes
‚è≥ Evaluating: Linear SVM
‚è≥ Evaluating: Decision Tree
‚è≥ Evaluating: KNN
                 Model  Mean Accuracy  Acc Std Dev  Mean Precision  Mean Recall
0  Logistic Regression       0.683447     0.001989        0.690788     0.680262
2           Linear SVM       0.638225     0.003871        0.641582     0.639129
1          Naive Bayes       0.628632     0.002371        0.658528     0.614047
3        Decision Tree       0.542582     0.004987        0.679280     0.493168
4                  KNN       0.468421     0.002419        0.538769     0.422283


## DistilBert Model

In [41]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
from datasets import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import TrainingArguments, Trainer

In [None]:
label2id = {"negative": 0, "neutral": 1, "positive": 2}
id2label = {0: "negative", 1: "neutral", 2: "positive"}
df["label"] = df["sentiment"].map(label2id)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])
train_dataset = Dataset.from_pandas(train_df[["text", "label"]])
test_dataset = Dataset.from_pandas(test_df[["text", "label"]])
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)
train_dataset = train_dataset.map(tokenize, batched=True)

test_dataset = test_dataset.map(tokenize, batched=True)
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased",num_labels=3,id2label=id2label,label2id=label2id)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    prec, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    return {"accuracy": acc, "precision": prec, "recall": recall, "f1": f1}

training_args = TrainingArguments(output_dir="./results",eval_steps=500,save_steps=500,save_total_limit=2,learning_rate=2e-5,
                                  per_device_train_batch_size=16,per_device_eval_batch_size=16,num_train_epochs=3,weight_decay=0.01,
                                  logging_dir="./logs",logging_steps=50)

trainer = Trainer(model=model,args=training_args,train_dataset=train_dataset,eval_dataset=test_dataset,tokenizer=tokenizer,
                  compute_metrics=compute_metrics)

trainer.train()

print(trainer.evaluate())

model.save_pretrained("./Model/distilbert-sentiment")
tokenizer.save_pretrained("./Model/distilbert-sentiment")

![Output](./Model/distilbert-sentiment/ss1.png)
...
![Output](./Model/distilbert-sentiment/ss2.png)

### Loading Model To Test

In [None]:
model_path = "./Model/distilbert-sentiment"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)

label2id = {"negative": 0, "neutral": 1, "positive": 2}
id2label = {0: "negative", 1: "neutral", 2: "positive"}

df["label"] = df["sentiment"].map(label2id)

train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["label"]
)

train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, shuffle=True
)

test_dataset = Dataset.from_pandas(test_df)

def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

test_dataset = test_dataset.map(tokenize, batched=True)
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    prec, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    return {"accuracy": acc, "precision": prec, "recall": recall, "f1": f1}

trainer = Trainer(model=model, tokenizer=tokenizer)
preds_output = trainer.predict(test_dataset)
metrics = compute_metrics(preds_output)
print("Evaluation Metrics:", metrics)

Map:   0%|          | 0/6203 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Evaluation Metrics: {'accuracy': 0.889085926164759, 'precision': 0.8892436836996468, 'recall': 0.889085926164759, 'f1': 0.8891503188422494}


In [53]:
sample_texts = [
    "Just copped the new Nike Air Zooms üî•üî• best running shoes I‚Äôve had in years!",
    "Nike really nailed it with their sustainable collection üëè eco-friendly AND stylish.",
    "That Nike ad gave me chills‚Ä¶ they know how to inspire athletes at every level üíØ.",
    "Nike shoes are overpriced now, not worth what they used to be üòí.",
    "Ordered from Nike‚Äôs site and shipping is taking forever‚Ä¶ super disappointing.",
    "The quality of Nike hoodies has gone down. My old one lasted years, new one faded in months.",
    "Nike just announced a new collab with an NBA player.",
    "Saw Nike‚Äôs latest campaign during the game last night.",
    "Nike store at the mall is getting renovated, opening next week.",
    "Nike has been in the sportswear market since 1964."
]

inputs = tokenizer(sample_texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()
preds = np.argmax(probs, axis=1)

for text, pred, prob in zip(sample_texts, preds, probs):
    prob_dict = {id2label[i]: round(float(p), 3) for i, p in enumerate(prob)}
    print(f"\nTweet: {text}")
    print(f"Predicted Sentiment: {id2label[pred]} (probs={prob_dict})")


Tweet: Just copped the new Nike Air Zooms üî•üî• best running shoes I‚Äôve had in years!
Predicted Sentiment: positive (probs={'negative': 0.004, 'neutral': 0.01, 'positive': 0.987})

Tweet: Nike really nailed it with their sustainable collection üëè eco-friendly AND stylish.
Predicted Sentiment: positive (probs={'negative': 0.004, 'neutral': 0.008, 'positive': 0.988})

Tweet: That Nike ad gave me chills‚Ä¶ they know how to inspire athletes at every level üíØ.
Predicted Sentiment: positive (probs={'negative': 0.009, 'neutral': 0.047, 'positive': 0.944})

Tweet: Nike shoes are overpriced now, not worth what they used to be üòí.
Predicted Sentiment: negative (probs={'negative': 0.966, 'neutral': 0.029, 'positive': 0.006})

Tweet: Ordered from Nike‚Äôs site and shipping is taking forever‚Ä¶ super disappointing.
Predicted Sentiment: negative (probs={'negative': 0.99, 'neutral': 0.007, 'positive': 0.003})

Tweet: The quality of Nike hoodies has gone down. My old one lasted years, new 