# Text Classification using Logistic Regression — SOLVED

This notebook provides a complete, **offline-friendly** solution for binary sentiment classification using scikit‑learn's **LogisticRegression**. It tries to load `twitter-sa.csv` from the working directory; if not found, it falls back to a small in‑notebook dataset so you can run everything without downloads.

## 1. Imports

In [None]:
import re
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

import matplotlib.pyplot as plt
import seaborn as sns

# For saving artifacts (optional)
import joblib


## 2. Load data
The original tutorial uses Sentiment140 (`twitter-sa.csv`). We will attempt to read it. If it's not available, we'll generate a small labeled dataset inline.

In [None]:
from pathlib import Path

def load_or_create_dataset(path='twitter-sa.csv', sample_size=None):
    """Return a sentiment DataFrame with a `text` column and a binary `target`.

    If `path` exists it is read as a headerless, ISO-8859-1 encoded CSV with
    the six Sentiment140 columns (target, id, date, flag, user, text). Only
    rows labelled 0 (negative) or 4 (positive) are kept and mapped to 0 / 1;
    rows with any other label (for example the neutral label 2 that some
    Sentiment140 files contain) are dropped rather than being silently
    counted as negative.

    If `path` does not exist, a small built-in balanced dataset
    (20 positive + 20 negative sentences) is returned instead.

    Parameters
    ----------
    path : str or pathlib.Path
        Location of the CSV file.
    sample_size : int or None
        CSV branch only: when given, at most this many rows are drawn at
        random (fixed seed 42) and the index is reset. Ignored by the
        fallback dataset.

    Returns
    -------
    pandas.DataFrame
        Columns `target` (int, 0 = negative, 1 = positive) and `text`.
    """
    p = Path(path)
    if p.exists():
        # Dataset columns in many Sentiment140 dumps: target, id, date, flag, user, text
        df = pd.read_csv(p, encoding='ISO-8859-1', header=None,
                         names=['target','id','date','flag','user','text'])
        # keep only class and text
        df = df[['target','text']].dropna()
        # In Sentiment140: 0=negative, 4=positive. Keep just those two classes
        # so that other labels are not folded into the negative class.
        is_negative = df['target'] == 0
        is_positive = df['target'] == 4
        df = df.loc[is_negative | is_positive].copy()
        # Normalize target to {0,1}
        df['target'] = (df['target'] == 4).astype(int)
        if sample_size is not None:
            df = df.sample(min(sample_size, len(df)), random_state=42).reset_index(drop=True)
        return df
    else:
        # Fallback mini dataset (balanced, 40 examples: 20 positive + 20 negative)
        pos = [
            "I love this movie so much!", "What a wonderful day!", "Best service ever, highly recommend.",
            "Absolutely fantastic experience.", "Great vibes, feeling amazing!", "This is awesome!",
            "I am very happy with the results.", "The product works perfectly.", "Such a pleasant surprise.",
            "Everything went smoothly and I’m satisfied.", "Brilliant performance!", "The food was delicious.",
            "So proud of myself today.", "I like this approach a lot.", "Smiling all day long.",
            "Superb quality and great support.", "The lecture was inspiring!", "Thank you, I'm grateful.",
            "I enjoyed every minute of it.", "This app is very helpful."
        ]
        neg = [
            "I hate this, worst experience.", "Terrible service and rude staff.", "Absolutely disappointed.",
            "This is awful!", "I'm very unhappy with the product.", "What a waste of time.",
            "Nothing works as expected.", "So frustrating and annoying.", "Bad quality, not worth it.",
            "Everything went wrong... again.", "Horrible performance.", "The food was disgusting.",
            "I feel sad and exhausted.", "I don't like this at all.", "Crying all night.",
            "Poor quality and no support.", "The lecture was boring.", "This is unacceptable.",
            "I regret buying this.", "The app is useless."
        ]
        df = pd.DataFrame({
            'text': pos + neg,
            'target': [1]*len(pos) + [0]*len(neg)
        })
        return df

# Cap the row count in case the full CSV is large, then take a first look at
# the data and at how the two classes are distributed.
df = load_or_create_dataset(sample_size=50000)
preview = df.head()
class_counts = df['target'].value_counts()
print(preview, "\n")
print("Class balance:\n", class_counts)


## 3. Preprocessing
We'll implement a light text cleaner and use `CountVectorizer`.

In [None]:
class SimpleCleaner(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that normalizes raw tweet-like text.

    Each document is lower-cased, stripped of URLs and @mentions, has the
    '#' removed from hashtags (the tag word itself is kept), has every
    character other than a-z and whitespace replaced by a space, and finally
    has runs of whitespace collapsed to a single space.
    """

    def __init__(self):
        # Precompile regexes for speed
        self.url_re = re.compile(r'https?://\S+|www\.\S+')
        self.mention_re = re.compile(r'@[A-Za-z0-9_]+')
        self.hashtag_re = re.compile(r'#([A-Za-z0-9_]+)')
        self.non_alpha_re = re.compile(r'[^a-z\s]')

    def fit(self, X, y=None):
        """Do nothing (there is nothing to learn) and return self."""
        return self

    def transform(self, X):
        """Clean every document in X.

        Parameters
        ----------
        X : pandas.Series or other 1-D sequence of texts
            Each item is converted with `str` before cleaning.

        Returns
        -------
        pandas.Series
            Cleaned strings in input order, with a fresh default index.
        """
        cleaned = []
        # Wrapping in pd.Series lets plain lists/arrays be passed as well.
        for text in pd.Series(X).astype(str):
            t = text.lower()
            t = self.url_re.sub(' ', t)
            t = self.mention_re.sub(' ', t)
            # keep the word, drop '#'. The replacement MUST be a raw string:
            # in a normal literal '\1' is the control character chr(1), not a
            # backreference, which would erase the hashtag word entirely.
            t = self.hashtag_re.sub(r' \1 ', t)
            t = self.non_alpha_re.sub(' ', t)
            t = re.sub(r'\s+', ' ', t).strip()
            cleaned.append(t)
        return pd.Series(cleaned)

# Quick smoke test: the URL and the mention disappear, the hashtag word stays.
_smoke = SimpleCleaner().fit_transform(pd.Series(["Check this out: https://x.y", "@user #HappyDay!!!"])).tolist()
assert _smoke == ['check this out', 'happyday'], _smoke


## 4. Train / Test Split

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# positive/negative ratio the same in both parts; the fixed seed makes the
# split repeatable.
texts, labels = df['text'], df['target']
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)
# Sizes of the two parts and the share of positive labels in each
len(X_train), len(X_test), y_train.mean(), y_test.mean()


## 5. Vectorize + Logistic Regression (Pipeline)

In [None]:
# Chain the three stages in one Pipeline so that exactly the same cleaning and
# vectorization are applied at fit time and at predict time:
#   text cleaning -> unigram/bigram counts -> logistic regression
cleaning_step = ('clean', SimpleCleaner())
vectorizing_step = (
    'vec',
    CountVectorizer(min_df=2, max_df=0.9, ngram_range=(1, 2), stop_words='english'),
)
classifier_step = ('clf', LogisticRegression(max_iter=1000, n_jobs=None))
pipe = Pipeline(steps=[cleaning_step, vectorizing_step, classifier_step])

# Train on the training split only, then score on the held-out split.
pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f"Accuracy: {acc:.4f}\n")
print(classification_report(y_test, pred, digits=4))


## 6. Confusion Matrix

In [None]:
# Rows are the true classes, columns the predicted ones; fixing labels=[0, 1]
# pins the row/column order to negative, positive.
class_names = ['neg(0)', 'pos(1)']
cm = confusion_matrix(y_test, pred, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
print(df_cm)

# Same table as an annotated heatmap, drawn on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(4, 3))
sns.heatmap(df_cm, annot=True, fmt='d', cbar=False, ax=ax)
ax.set_title("Confusion Matrix")
ax.set_ylabel("True")
ax.set_xlabel("Predicted")
fig.tight_layout()
plt.show()


## 7. Inference examples

In [None]:
# A few unseen sentences to sanity-check the fitted pipeline by eye.
examples = [
    "Life is a journey, enjoy it.",
    "Mondays are the worst...",
    "I am so happy with this new phone!",
    "This service is terrible.",
    "Absolutely fantastic performance tonight!",
    "I regret buying this."
]
pred_ex = pipe.predict(pd.Series(examples))
for sentence, label in zip(examples, pred_ex):
    # The model outputs 1 for positive; anything else is reported as negative.
    if label == 1:
        sentiment = 'positive'
    else:
        sentiment = 'negative'
    print(f"{sentence}  ->  {sentiment}")


## 8. Save artifacts (optional)
This lets you load and use the model later from a script.

In [None]:
# Persist the whole fitted pipeline (cleaner + vectorizer + classifier).
# '/mnt/data' only exists in some hosted environments; when it is missing,
# save next to the notebook instead so that "Run All" does not fail here.
model_dir = Path('/mnt/data')
if not model_dir.is_dir():
    model_dir = Path.cwd()
model_path = model_dir / 'logreg_text_clf.joblib'
joblib.dump(pipe, str(model_path))
print(f"Saved model to {model_path}")
