# Twitter Sentiment Project: Shared Utilities
This notebook provides shared code (cleaning, feature engineering, and pipeline builders) that other notebooks will `%run`.

**How to use:** In any notebook, run:
```python
%run ./00_shared_utils.ipynb
```

In [3]:
%pip install -q scikit-learn imbalanced-learn xgboost shap emoji
%pip install -q sentence-transformers faiss-cpu

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [4]:
import re, html, json, math, random
import numpy as np
import pandas as pd
from dataclasses import dataclass
from typing import Any, Dict, Optional
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.validation import check_is_fitted
from scipy import sparse

# Default seed used by set_seeds() when no explicit seed is given.
RANDOM_STATE = 42


def set_seeds(seed=RANDOM_STATE):
    """Seed Python's ``random`` module and NumPy's global random state.

    Args:
        seed: Value handed to both ``random.seed`` and ``numpy.random.seed``.
            Defaults to ``RANDOM_STATE``.
    """
    # Imported locally so the function works even if the caller's namespace
    # has rebound the module-level names.
    import random as _stdlib_random
    import numpy as _numpy

    _stdlib_random.seed(seed)
    _numpy.random.seed(seed)

# Contraction -> expansion table. Keys are lowercase and written with an ASCII
# apostrophe; the generic "n't" entry covers every negation that has no
# dedicated entry (e.g. "don't" -> "do not").
_contractions = {
    "can't": "can not", "won't": "will not", "n't": " not",
    "i'm": "i am", "it's": "it is", "that's": "that is",
    "what's": "what is", "there's": "there is", "i've": "i have",
    "you're": "you are", "they're": "they are", "we're": "we are",
    "i'll": "i will", "you'll": "you will", "they'll": "they will",
    "i'd": "i would", "you'd": "you would", "they'd": "they would"
}

# Characters accepted in place of the apostrophe inside a contraction:
# ASCII ', right/left single quotation marks, backtick and acute accent.
# Without this, "don’t" or "i`d" is never expanded and the later character
# filter splits it into fragments such as "don t".
_APOSTROPHE_CLASS = "['\u2019\u2018`\u00b4]"


def _compile_contractions(table):
    """Compile *table* into one alternation regex plus a parallel tuple of expansions.

    Longer keys are tried first so that specific entries ("can't") win over
    the generic "n't". Each key is wrapped in its own capture group, so the
    matched entry can be recovered from ``Match.lastindex``.
    """
    ordered = sorted(table, key=len, reverse=True)
    pattern = "|".join(
        "(" + _APOSTROPHE_CLASS.join(re.escape(piece) for piece in key.split("'")) + ")"
        for key in ordered
    )
    return re.compile(pattern, re.IGNORECASE), tuple(table[key] for key in ordered)


# Built once at load time. NOTE: re-run _compile_contractions if _contractions
# is modified afterwards, otherwise the new entries are ignored.
_contraction_re, _contraction_expansions = _compile_contractions(_contractions)

url_re = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)
mention_re = re.compile(r"@\w+")
hashtag_re = re.compile(r"#(\w+)")
# Runs of three or more identical characters ("sooo", "!!!"). Digits are
# excluded so that numbers such as "1000" are not rewritten to "100"; newlines
# are excluded to match the behaviour of the previous "." based pattern.
elong_re = re.compile(r"([^\d\n])\1{2,}")
ws_re = re.compile(r"\s+")

emoji_re = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # miscellaneous symbols and pictographs
    "\U0001F680-\U0001F6FF"  # transport and map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicator symbols (flags)
    "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    "\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A
    "\u2600-\u27BF"          # miscellaneous symbols and dingbats
    "]",
    flags=re.UNICODE,
)


def expand_contractions(text: str) -> str:
    """Replace every known contraction in *text* with its expanded form.

    Matching is case-insensitive and not anchored to word boundaries; the
    inserted expansion is always lowercase. Any character in
    ``_APOSTROPHE_CLASS`` is accepted as the apostrophe. Text that contains
    no known contraction is returned unchanged.

    Args:
        text: The string to process.

    Returns:
        A new string with contractions expanded.
    """
    return _contraction_re.sub(
        lambda match: _contraction_expansions[match.lastindex - 1], text
    )

def normalize_text(t: str) -> Dict[str, Any]:
    """Clean one piece of text and collect simple counts along the way.

    ``None`` is treated as an empty string and any other non-string value is
    converted with ``str()``. The text is stripped, lowercased and
    HTML-unescaped; URLs and @mentions are replaced by spaces; hashtags keep
    their word but lose the ``#``; contractions are expanded; elongated
    character runs are shortened; characters outside ``a-z 0-9 ! ? . , '`` and
    whitespace are replaced by spaces; finally whitespace is collapsed.

    Returns:
        A dict with the keys ``text_clean``, ``url_count``, ``mention_count``,
        ``hashtag_count``, ``has_emoji`` (0 or 1), ``text_len`` (length of the
        cleaned text) and ``raw`` (the input after string coercion).
    """
    if t is None:
        t = ""
    elif not isinstance(t, str):
        t = str(t)
    original = t

    cleaned = html.unescape(t.strip().lower())

    # subn() reports how many replacements were made, which is the same
    # number of matches a separate findall() would return.
    cleaned, n_urls = url_re.subn(" ", cleaned)
    cleaned, n_mentions = mention_re.subn(" ", cleaned)
    cleaned, n_hashtags = hashtag_re.subn(r" \1 ", cleaned)

    # Emoji are detected before the character filter below removes them.
    emoji_flag = int(emoji_re.search(cleaned) is not None)

    cleaned = elong_re.sub(r"\1\1", expand_contractions(cleaned))
    cleaned = re.sub(r"[^a-z0-9\s!\?\.,' ]+", " ", cleaned)
    cleaned = ws_re.sub(" ", cleaned).strip()

    features: Dict[str, Any] = {}
    features["text_clean"] = cleaned
    features["url_count"] = n_urls
    features["mention_count"] = n_mentions
    features["hashtag_count"] = n_hashtags
    features["has_emoji"] = emoji_flag
    features["text_len"] = len(cleaned)
    features["raw"] = original
    return features

class CleaningTransformer(BaseEstimator, TransformerMixin):
    """Normalize raw text and derive numeric features for downstream selectors.

    Input is a DataFrame with a 'text' column and, optionally, a
    'selected_text' column. A Series is accepted and treated as the 'text'
    column. If no column is literally named 'text', the first column whose
    (string) name contains 'text', case-insensitively, is used instead.

    The output is a pandas DataFrame indexed like the input. It holds every
    key returned by ``normalize_text`` (including 'text_clean') plus
    'keyword_overlap': the number of cleaned-text tokens that also occur in
    the lowercased 'selected_text', divided by the number of 'selected_text'
    tokens. It is 0.0 when 'selected_text' is absent or either side is empty.
    """

    def fit(self, X, y=None):
        """Return self unchanged; the transformer learns nothing from the data."""
        return self

    @staticmethod
    def _overlap_ratio(selected_tokens, text_tokens):
        """Return hits / len(selected_tokens), or 0.0 if either token list is empty."""
        if not selected_tokens or not text_tokens:
            return 0.0
        wanted = set(selected_tokens)
        hits = sum(1 for token in text_tokens if token in wanted)
        # The tiny epsilon is kept from the original formula so that the
        # produced feature values stay numerically identical.
        return hits / (len(selected_tokens) + 1e-9)

    def transform(self, X):
        """Clean the text column of *X* and return the feature DataFrame.

        Raises:
            KeyError: If *X* has no 'text' column and no column whose name
                contains 'text'.
        """
        if isinstance(X, pd.Series):
            X = X.to_frame(name="text")
        df = X.copy()

        if 'text' not in df.columns:
            # Column labels may be ints or tuples; only string labels can be
            # inspected for the 'text' substring.
            fallback = next(
                (c for c in df.columns if isinstance(c, str) and 'text' in c.lower()),
                None,
            )
            if fallback is None:
                raise KeyError(
                    "CleaningTransformer needs a 'text' column (or a column whose "
                    "name contains 'text'); got columns: %s" % list(df.columns)
                )
            df = df.rename(columns={fallback: 'text'})

        results = [normalize_text(t) for t in df['text'].fillna("")]
        if results:
            out = pd.DataFrame(results, index=df.index)
        else:
            # With zero rows pandas cannot infer the columns from the records,
            # so take the schema from normalize_text itself.
            out = pd.DataFrame(index=df.index, columns=list(normalize_text("")))

        if 'selected_text' in df.columns:
            # astype(str) keeps the .str accessor usable when the column holds
            # non-string values such as numbers.
            sel = df['selected_text'].fillna("").astype(str).str.lower().str.split()
            txt = out['text_clean'].fillna("").str.split()
            out['keyword_overlap'] = [
                self._overlap_ratio(s, tks) for s, tks in zip(sel, txt)
            ]
        else:
            out['keyword_overlap'] = 0.0
        return out

def _select_single_column(df, name):
    """Pull column *name* out of *df*.

    A DataFrame yields the column's underlying array when the lookup gives a
    Series, and the lookup result itself otherwise. Anything that is not a
    DataFrame is returned untouched.
    """
    if not isinstance(df, pd.DataFrame):
        return df
    picked = df[name]
    if isinstance(picked, pd.Series):
        return picked.values
    return picked

def _select_multi_columns(df, names):
    """Return the columns *names* of *df* as a 2-D array.

    Inputs that are not DataFrames are handed back unchanged.
    """
    if not isinstance(df, pd.DataFrame):
        return df
    subset = df[names]
    return subset.values

def select_col(name):
    """Build a FunctionTransformer that extracts the single column *name*.

    Input validation is switched off so DataFrames reach the selector as-is.
    """
    extractor = FunctionTransformer(
        func=_select_single_column,
        validate=False,
        kw_args={'name': name},
    )
    return extractor

def select_cols(names):
    """Build a FunctionTransformer that extracts the columns listed in *names*.

    Input validation is switched off so DataFrames reach the selector as-is.
    """
    extractor = FunctionTransformer(
        func=_select_multi_columns,
        validate=False,
        kw_args={'names': names},
    )
    return extractor

class ToSparse(BaseEstimator, TransformerMixin):
    """Stateless transformer that converts its input to a SciPy CSR matrix."""

    def fit(self, X, y=None):
        """Return self unchanged; there is nothing to learn."""
        return self

    def transform(self, X):
        """Return *X* as a ``scipy.sparse.csr_matrix``.

        Input that is already sparse is converted with ``tocsr()``. Passing a
        sparse matrix through ``np.asarray`` would wrap it in a 0-d object
        array that ``csr_matrix`` cannot convert.
        """
        if sparse.issparse(X):
            return X.tocsr()
        return sparse.csr_matrix(np.asarray(X))

def build_feature_union(word_ng=(1,2), char_ng=(3,5), min_df=3):
    """Assemble the FeatureUnion applied to the cleaned DataFrame.

    Three branches are concatenated:
      * 'word': TF-IDF of 'text_clean' with n-gram range *word_ng*.
      * 'char': character-level TF-IDF of 'text_clean' with range *char_ng*.
      * 'num':  the numeric feature columns, converted to sparse and scaled
                with MaxAbsScaler.

    Args:
        word_ng: ``ngram_range`` of the word-level vectorizer.
        char_ng: ``ngram_range`` of the character-level vectorizer.
        min_df: ``min_df`` passed to both TF-IDF vectorizers.
    """
    numeric_columns = [
        'url_count', 'mention_count', 'hashtag_count',
        'has_emoji', 'text_len', 'keyword_overlap',
    ]

    def _text_branch(vectorizer):
        # Both TF-IDF branches read the same cleaned-text column.
        return Pipeline([
            ('sel', select_col('text_clean')),
            ('tfidf', vectorizer),
        ])

    word_branch = _text_branch(
        TfidfVectorizer(ngram_range=word_ng, min_df=min_df)
    )
    char_branch = _text_branch(
        TfidfVectorizer(analyzer='char', ngram_range=char_ng, min_df=min_df)
    )
    numeric_branch = Pipeline([
        ('sel', select_cols(numeric_columns)),
        ('tosparse', ToSparse()),
        ('scale', MaxAbsScaler()),
    ])

    branches = [
        ('word', word_branch),
        ('char', char_branch),
        ('num', numeric_branch),
    ]
    return FeatureUnion(branches)

def build_full_pipeline(classifier):
    """Chain cleaning, feature extraction and *classifier* into one Pipeline.

    The steps are named 'clean' (CleaningTransformer), 'features' (the union
    from ``build_feature_union()`` with its default arguments) and 'clf'
    (the estimator passed in).
    """
    steps = [
        ('clean', CleaningTransformer()),
        ('features', build_feature_union()),
        ('clf', classifier),
    ]
    return Pipeline(steps)

# Printed when this cell runs, as a visible confirmation that the helpers
# above have been defined.
print("Shared utilities loaded. Use build_full_pipeline(classifier) to create a text model pipeline.")


Shared utilities loaded. Use build_full_pipeline(classifier) to create a text model pipeline.
