<a href="https://colab.research.google.com/github/sashank1326/ML_Practice/blob/main/ML_Assign_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Imports
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import (
    RandomizedSearchCV,
    StratifiedKFold,
    cross_val_predict,
    train_test_split,
)
from sklearn.pipeline import make_pipeline

# *Load Dataset*

In [None]:

# Read the labelled text CSV into a DataFrame.
# NOTE(review): the path assumes Google Drive is already mounted at
# /content/drive -- edit it if your copy of the file lives elsewhere.
csv_path = "/content/drive/MyDrive/ML_LAB/eng_dataset.csv"
df = pd.read_csv(csv_path)

# Sanity check: dimensions first, then a peek at the leading rows
n_rows_cols = df.shape
print("Dataset shape:", n_rows_cols)
print(df.head())


Dataset shape: (7102, 3)
      ID sentiment                                            content
0  10941     anger  At the point today where if someone says somet...
1  10942     anger  @CorningFootball  IT'S GAME DAY!!!!      T MIN...
2  10943     anger  This game has pissed me off more than any othe...
3  10944     anger  @spamvicious I've just found out it's Candice ...
4  10945     anger  @moocowward @mrsajhargreaves @Melly77 @GaryBar...


# **Baseline**

In [None]:

# Baseline: bag-of-words counts + logistic regression on one hold-out split.

# Features (raw text) and labels
X = df["content"]
y = df["sentiment"]

# Split the raw text first (70% train / 30% test, stratified by label) so the
# vocabulary is learned from the training portion only. A fixed seed keeps the
# split -- and therefore the reported scores -- reproducible across runs.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Bag-of-Words (no TF-IDF, just counts): fit on train, reuse on test
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Train Logistic Regression. The default of 100 iterations is frequently not
# enough for the default solver on unscaled count features, so allow more.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Predict
y_pred = clf.predict(X_test)

# Report
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

       anger       0.80      0.75      0.77      1191
        fear       0.74      0.84      0.79      1577
         joy       0.86      0.79      0.82      1131
     sadness       0.75      0.70      0.72      1073

    accuracy                           0.78      4972
   macro avg       0.79      0.77      0.78      4972
weighted avg       0.78      0.78      0.78      4972

Accuracy: 0.7787610619469026


# **Using TF-IDF**

In [None]:
# TF-IDF + logistic regression, evaluated with stratified 5-fold CV.
X = df["content"]
y = df["sentiment"]

# TF-IDF vectorizer
vectorizer = TfidfVectorizer(lowercase=True, stop_words="english")

# Model: the vectorizer and the classifier are chained in one pipeline so that
# cross-validation re-fits the vocabulary / IDF weights on each training fold.
# Fitting the vectorizer once on the full dataset would leak statistics of the
# held-out texts into every fold.
model = make_pipeline(
    vectorizer,
    LogisticRegression(max_iter=100, solver="liblinear", random_state=42),
)

# Stratified 5-fold CV (seeded shuffle so the folds are repeatable)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold predictions for every row, computed on the raw text
y_pred = cross_val_predict(model, X, y, cv=cv)

# Evaluation
print("📊 Classification Report (5-fold CV):")
print(classification_report(y, y_pred))
print("Accuracy:", accuracy_score(y, y_pred))


📊 Classification Report (5-fold CV):
              precision    recall  f1-score   support

       anger       0.90      0.84      0.87      1701
        fear       0.77      0.93      0.84      2252
         joy       0.94      0.87      0.90      1616
     sadness       0.85      0.71      0.78      1533

    accuracy                           0.85      7102
   macro avg       0.86      0.84      0.85      7102
weighted avg       0.86      0.85      0.85      7102

Accuracy: 0.8476485497043087


# **Using RandomizedSearchCV**

In [None]:
# Hyper-parameter search for logistic regression on TF-IDF features.

# 1. Features / labels and a reproducible stratified 70/30 hold-out split
X = df["content"]
y = df["sentiment"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# 2. TF-IDF Vectorizer (fit on the training texts only)
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 3. RandomizedSearchCV
param_dist = {
    "C": np.logspace(-3, 3, 7),       # regularization strength
    "penalty": ["l2"],                # keep it simple (l1 also possible with saga)
    "solver": ["liblinear", "saga"],  # solvers compatible with l2
}

# max_iter is raised from 100 because saga in particular tends to stop before
# converging at that budget; both random_state values make the sampled
# candidates and the solver's data shuffling repeatable.
rand = RandomizedSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    param_dist,
    n_iter=5,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
    random_state=42,
)
rand.fit(X_train_vec, y_train)

# Evaluating the refit best estimator on the held-out test split
print("Best Params:", rand.best_params_)

best_model = rand.best_estimator_
y_pred = best_model.predict(X_test_vec)

print("\n📊 Classification Report:")
print(classification_report(y_test, y_pred))
print("✅ Accuracy:", accuracy_score(y_test, y_pred))


Best Params: {'solver': 'liblinear', 'penalty': 'l2', 'C': np.float64(1000.0)}

📊 Classification Report:
              precision    recall  f1-score   support

       anger       0.88      0.87      0.87       510
        fear       0.84      0.87      0.86       676
         joy       0.93      0.94      0.93       485
     sadness       0.81      0.76      0.78       460

    accuracy                           0.86      2131
   macro avg       0.86      0.86      0.86      2131
weighted avg       0.86      0.86      0.86      2131

✅ Accuracy: 0.8615673392773345


# **Using SVM**

In [None]:
# Imports
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score


# Linear SVM on unigram + bigram TF-IDF features.
X = df["content"]
y = df["sentiment"]

# Split into train and test (70/30, stratified). The seed matters beyond this
# cell: when the notebook is run top to bottom, the two voting-ensemble cells
# that follow reuse X_train_vec / X_test_vec / y_train / y_test from here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(
    max_features=50000,       # limit features for speed
    ngram_range=(1, 2),       # unigrams + bigrams
    min_df=2                  # ignore terms seen in fewer than 2 documents
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train Linear SVM
svm_clf = LinearSVC(max_iter=1000, random_state=42)
svm_clf.fit(X_train_vec, y_train)

# Evaluate
y_pred_svm = svm_clf.predict(X_test_vec)

print("📊 Linear SVM Classification Report:")
print(classification_report(y_test, y_pred_svm))
print("✅ Accuracy:", accuracy_score(y_test, y_pred_svm))


📊 Linear SVM Classification Report:
              precision    recall  f1-score   support

       anger       0.90      0.84      0.87       510
        fear       0.86      0.89      0.88       676
         joy       0.89      0.91      0.90       485
     sadness       0.83      0.81      0.82       460

    accuracy                           0.87      2131
   macro avg       0.87      0.86      0.87      2131
weighted avg       0.87      0.87      0.87      2131

✅ Accuracy: 0.8676677616142656


# **Ensemble: SVM, Naive Bayes, Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, accuracy_score

# Hard-voting ensemble of logistic regression, linear SVM and naive Bayes.
# NOTE: this cell creates no features of its own. It trains on X_train_vec /
# y_train and scores on X_test_vec / y_test exactly as they were last assigned
# in the kernel (the Linear SVM cell when the notebook is run top to bottom).

# 1. Define Base Models
# random_state is set on the liblinear-based models so repeated runs agree.
log_reg = LogisticRegression(
    C=1000, penalty="l2", solver="liblinear", max_iter=1000, random_state=42
)
svm = LinearSVC(random_state=42)   # Linear SVM
nb = MultinomialNB()               # Naive Bayes

# 2. Create Voting Ensemble
voting_clf = VotingClassifier(
    estimators=[
        ("log_reg", log_reg),
        ("svm", svm),
        ("nb", nb)
    ],
    voting="hard"   # try "soft" if all classifiers have predict_proba
)


# 3. Train Ensemble
voting_clf.fit(X_train_vec, y_train)

# 4. Evaluate
y_pred_vote = voting_clf.predict(X_test_vec)

print("📊 Voting Classifier Report:")
print(classification_report(y_test, y_pred_vote))
print("✅ Accuracy:", accuracy_score(y_test, y_pred_vote))


📊 Voting Classifier Report:
              precision    recall  f1-score   support

       anger       0.88      0.88      0.88       510
        fear       0.84      0.91      0.87       676
         joy       0.94      0.94      0.94       485
     sadness       0.85      0.75      0.79       460

    accuracy                           0.87      2131
   macro avg       0.88      0.87      0.87      2131
weighted avg       0.87      0.87      0.87      2131

✅ Accuracy: 0.8723603941811356


# **Ensemble-SGD and SVM**

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, accuracy_score

# Hard-voting ensemble of an SGD-trained logistic model and a linear SVM.
# NOTE: this cell creates no features of its own. It trains on X_train_vec /
# y_train and scores on X_test_vec / y_test exactly as they were last assigned
# in the kernel (the Linear SVM cell when the notebook is run top to bottom).

# 1. Define Base Models
# Both estimators shuffle / sample the data internally, so seed them to make
# the reported scores repeatable.
sgd = SGDClassifier(loss="log_loss", max_iter=1000, random_state=42)
svm = LinearSVC(random_state=42)

# 2. Create Voting Ensemble
# NOTE(review): with only two hard voters every disagreement is a tie, and per
# the VotingClassifier docs ties go to the class that sorts first -- not to
# the more confident model. Consider adding a third voter.
voting_clf = VotingClassifier(
    estimators=[
        ("sgd", sgd),
        ("svm", svm)
    ],
    voting="hard"   # LinearSVC has no predict_proba, so soft voting is not possible
)

# 3. Train Ensemble
voting_clf.fit(X_train_vec, y_train)

# 4. Evaluate
y_pred_vote = voting_clf.predict(X_test_vec)

print("📊 SGD + SVM Voting Classifier Report:")
print(classification_report(y_test, y_pred_vote))
print("✅ Accuracy:", accuracy_score(y_test, y_pred_vote))


📊 SGD + SVM Voting Classifier Report:
              precision    recall  f1-score   support

       anger       0.89      0.88      0.89       510
        fear       0.86      0.92      0.89       676
         joy       0.92      0.91      0.92       485
     sadness       0.84      0.77      0.80       460

    accuracy                           0.88      2131
   macro avg       0.88      0.87      0.87      2131
weighted avg       0.88      0.88      0.87      2131

✅ Accuracy: 0.8751759737212577


# **Supervised Learning with Unsupervised Feature Augmentation**

In [None]:
# Imports

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, accuracy_score
import scipy.sparse as sp

# Linear SVM on TF-IDF features augmented with KMeans cluster membership.

# Split dataset (80/20, stratified, seeded for repeatable results)
X = df["content"]
y = df["sentiment"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# TF-IDF Vectorization (uni- to tri-grams, fit on the training texts only)
vectorizer = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1, 3),
    min_df=2
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Clustering (KMeans), fit on the training vectors only. n_init and
# random_state are pinned so the cluster assignment does not vary between
# runs or scikit-learn versions.
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
clusters_train = kmeans.fit_predict(X_train_vec)
clusters_test = kmeans.predict(X_test_vec)

# Add cluster membership as new features.
# Cluster ids are arbitrary category labels, so they are one-hot encoded
# (one indicator column per cluster). A single integer column would impose a
# meaningless ordering (0 < 1 < ... < 4) on a linear model and would sit on a
# different scale from the TF-IDF weights.
cluster_indicator = np.eye(n_clusters)
X_train_aug = sp.hstack(
    [X_train_vec, sp.csr_matrix(cluster_indicator[clusters_train])], format="csr"
)
X_test_aug = sp.hstack(
    [X_test_vec, sp.csr_matrix(cluster_indicator[clusters_test])], format="csr"
)


# Train Linear SVM on augmented features
svm_clf = LinearSVC(max_iter=6000, random_state=42)
svm_clf.fit(X_train_aug, y_train)


# Evaluate
y_pred = svm_clf.predict(X_test_aug)
print("📊 Linear SVM + Clustering Features Classification Report:")
print(classification_report(y_test, y_pred))
print("✅ Accuracy:", accuracy_score(y_test, y_pred))


📊 Linear SVM + Clustering Features Classification Report:
              precision    recall  f1-score   support

       anger       0.89      0.89      0.89       340
        fear       0.90      0.90      0.90       451
         joy       0.92      0.93      0.93       323
     sadness       0.83      0.80      0.82       307

    accuracy                           0.89      1421
   macro avg       0.88      0.88      0.88      1421
weighted avg       0.89      0.89      0.89      1421

✅ Accuracy: 0.8859957776213934
