# **1. Import Libraries**
 Here we load all the Python packages needed to tame our data and text—silencing warnings, wrangling tables with pandas & NumPy, visualizing with Plotly, cleaning language with spaCy & NLTK, building ML pipelines with scikit-learn, and even calling in our trusty DeepRage toolkit. Strap in!

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
import plotly.io as pio
pio.renderers.default = "kaggle"

import re
import spacy
from emoji import replace_emoji
import numpy as np
import nltk

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix

nltk.download('vader_lexicon', quiet=True)
from nltk.sentiment.vader import SentimentIntensityAnalyzer

!pip install --quiet wordfreq
%pip install --quiet git+https://github.com/iseedeep/deeprage.git@main
from deeprage.core import val_pie

# **2. Data Loading & EDA**
Here we’ll load our financial-sentiment dataset, peek at its structure, handle duplicates/nulls, 
  and visualize the sentiment distribution with a pie chart. Think of it as laying out the battle map before the charge!

In [None]:
# Load the financial-sentiment CSV from the Kaggle input directory
df = pd.read_csv('/kaggle/input/financial-sentiment-analysis/data.csv')
# Preview the first rows
df.head()

In [None]:
# Summarize the DataFrame: columns, dtypes and non-null counts
df.info()

In [None]:
# Count missing values in each column
df.isnull().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row
df.duplicated().sum()

In [None]:
# Remove the duplicate rows in place (the index is not reset here)
df.drop_duplicates(inplace=True)

In [None]:
# Plot the 'Sentiment' column with deeprage's val_pie helper
# (presumably a pie chart of the label counts — confirm against deeprage)
val_pie(df, 'Sentiment')

# **3. Text Processing**
 Time to clean and expand our text: strip HTML and emojis, convert crypto slang acronyms to full phrases, 
  lemmatize with spaCy, and compute Vader sentiment scores. This pipeline turns raw chatter into model-ready features!

In [None]:
# Finance / crypto chat acronyms mapped to their spelled-out phrases.
# Keys MUST be upper-case: the pattern below matches case-insensitively and
# the matched text is upper-cased before it is looked up in this table.
chat_word = {
    'FOMO':  'Fear Of Missing Out',
    'FUD':   'Fear Uncertainty Doubt',
    'DYOR':  'Do Your Own Research',
    'BTFD':  'Buy The Fucking Dip',
    'HODL':  'Hold On For Dear Life',
    'ATH':   'All Time High',
    'ATL':   'All Time Low',
    'IPO':   'Initial Public Offering',
    'ROI':   'Return On Investment',
    'EPS':   'Earnings Per Share',
    'P/E':   'Price To Earnings Ratio',
    'YTD':   'Year To Date',
    'YOY':   'Year Over Year',
    # Was 'QoQ': a mixed-case key can never be found by an upper-cased
    # lookup, so any text containing "QoQ" raised KeyError('QOQ').
    'QOQ':   'Quarter Over Quarter',
    'SL':    'Stop Loss',
    'TP':    'Take Profit',
    'PT':    'Price Target',
    'MCAP':  'Market Capitalization',
    'VOL':   'Trading Volume',
    'ETF':   'Exchange Traded Fund',
    'CFD':   'Contract For Difference',
    'MOON':  'To The Moon',
    'BEAR':  'Bearish Sentiment',
    'BULL':  'Bullish Sentiment',
}

# Single alternation of all (regex-escaped) acronyms, wrapped in \b so only
# whole words match; IGNORECASE lets "fomo" and "FOMO" both hit.
_acro_pat = re.compile(
    r'\b(' + '|'.join(re.escape(k) for k in chat_word) + r')\b',
    flags=re.IGNORECASE,
)

In [None]:
class AcronymExpander(BaseEstimator, TransformerMixin):
    """Stateless transformer that spells out known acronyms.

    Every match of the module-level ``_acro_pat`` regex is replaced by the
    corresponding phrase from the module-level ``chat_word`` table.
    """

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a list holding each text of ``X`` with acronyms expanded."""
        # Key the lookup by upper-cased acronym. The match is upper-cased
        # before the lookup, so a table key that is not fully upper-case
        # (e.g. 'QoQ') would otherwise raise KeyError.
        expansions = {
            acro.upper(): phrase for acro, phrase in chat_word.items()
        }

        def _expand(match):
            return expansions[match.group(1).upper()]

        return [_acro_pat.sub(_expand, txt) for txt in X]

In [None]:
# Shared spaCy pipeline; the parser and NER components are switched off.
nlp = spacy.load("en_core_web_sm", disable=["parser","ner"])


class SpacyCleaner(BaseEstimator, TransformerMixin):
    """Stateless transformer that reduces each text to filtered lemmas.

    Each input text is run through the module-level ``nlp`` pipeline and
    turned into one space-joined string of lower-cased lemmas.
    """

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return one cleaned string per input text, in input order."""
        cleaned_docs = []
        for parsed in nlp.pipe(X, batch_size=50):
            lemmas = []
            for token in parsed:
                # Skip stop words ("the", "and", ...) and punctuation.
                if token.is_stop or token.is_punct:
                    continue
                # Skip URLs and e-mail addresses.
                if token.like_url or token.like_email:
                    continue
                # Keep purely alphabetic tokens only, and drop the "-PRON-"
                # placeholder lemma.
                if not token.is_alpha or token.lemma_ == "-PRON-":
                    continue
                lemmas.append(token.lemma_.lower())
            cleaned_docs.append(" ".join(lemmas))
        return cleaned_docs

In [None]:
def strip_html_emoji(texts):
    """Return a list of the texts with ``<...>`` tags and emoji stripped.

    Each non-greedy ``<...>`` span is replaced by a single space first; the
    result is then passed through ``replace_emoji`` with an empty replacement.
    """
    return [
        replace_emoji(re.sub(r'<.*?>', ' ', raw), replace='')
        for raw in texts
    ]

In [None]:
# Shared VADER analyzer, created once and reused by vader_scores
sia = SentimentIntensityAnalyzer()


def vader_scores(texts):
    """Return the VADER compound score of each text as an (n, 1) array.

    Args:
        texts: iterable of strings.

    Returns:
        A 2-D float array with one row per text and a single column holding
        that text's ``'compound'`` polarity score.
    """
    compound = [sia.polarity_scores(text)['compound'] for text in texts]
    # reshape(-1, 1) keeps the result 2-D even for empty input; a nested-list
    # np.array() call would collapse that case to shape (0,).
    return np.array(compound, dtype=float).reshape(-1, 1)

In [None]:
# TF-IDF step. The upstream steps already emit clean, space-separated
# lemmas, so tokenizing is a plain whitespace split and the preprocessor is
# the identity function; token_pattern is unused once a tokenizer is given.
_tfidf_step = TfidfVectorizer(
    tokenizer=str.split,
    preprocessor=lambda x: x,
    token_pattern=None,
    min_df=1,    # keep any token appearing in at least 1 document
    max_df=1.0,  # keep tokens no matter how common
)

# Full text pipeline: strip markup/emoji -> expand acronyms -> lemmatize
# and filter -> TF-IDF.
pipe = Pipeline(steps=[
    ("strip", FunctionTransformer(strip_html_emoji, validate=False)),
    ("acro", AcronymExpander()),
    ("clean", SpacyCleaner()),
    ("tfidf", _tfidf_step),
])

# Fit the pipeline on all sentences and transform them into TF-IDF features.
# NOTE(review): fitting happens on the full dataset, i.e. before any
# train/test split, so the vocabulary and IDF weights also see test rows.
X_feats = pipe.fit_transform(df['Sentence'])

# **4. Modeling & Evaluation**
Finally, we’ll reduce features to 3D via PCA for a snazzy 3D scatter, then train and compare several classifiers 
  (RandomForest, NB variants, Logistic, SVC). We’ll conclude with detailed classification reports and confusion matrices 
  to crown the sentiment champion!

In [None]:
# Features and labels, then a stratified 80/20 train/test split
# (fixed seed so the split is reproducible)
X, y = X_feats, df["Sentiment"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y,
)

In [None]:
# Densify the feature matrix when it is sparse before handing it to PCA
X_array = X_feats.toarray() if hasattr(X_feats, "toarray") else X_feats

# Use PCA to project features into 3D for interactive exploration
pca     = PCA(n_components=3, random_state=42)
X_3d    = pca.fit_transform(X_array)

# Build a DataFrame for a 3D scatter of a 200-sample window: rows 100-299
# (not the first 200). On data with fewer than 300 rows the window is shifted
# down so it never indexes past the end.
_stop  = min(300, X_3d.shape[0])
_start = min(100, max(_stop - 200, 0))
idx = np.arange(_start, _stop)
df3 = pd.DataFrame(
    X_3d[idx],
    columns=["x","y","z"]
)
# Labels are taken positionally so they line up with the rows of X_3d
df3["label"] = np.array(y)[idx]

# Plot it with Plotly
fig = px.scatter_3d(df3, x="x", y="y", z="z", color="label",
                    title="3D PCA of Text Features")

# Annotate the first plotted point whose label is `target`, if there is one
target = "positive"
matches = df3.index[df3["label"] == target]
if len(matches):
    i = matches[0]
    ann = dict(
        x=df3.at[i,"x"], y=df3.at[i,"y"], z=df3.at[i,"z"],
        text=target, showarrow=True, arrowhead=1
    )
    fig.update_layout(scene=dict(annotations=[ann]))

fig.show(renderer='iframe')

In [None]:
# Define our classifier lineup and specify which need dense input
models = {
    "RandomForest" : RandomForestClassifier(n_estimators=100, random_state=42),
    "GaussianNB"   : GaussianNB(),
    "MultinomialNB": MultinomialNB(),
    "LogisticReg"  : LogisticRegression(max_iter=1000),
    "LinearSVC"    : LinearSVC(max_iter=10000),
}

# Only GaussianNB has to be fed a dense array. MultinomialNB accepts sparse
# matrices directly, so densifying the features for it only wasted memory.
_dense_needed = {"GaussianNB"}

# Train each model, print classification reports to compare performance
for name, clf in models.items():
    # Prepare data: densify only for the models that require it
    needs_dense = name in _dense_needed
    X_tr = X_train.toarray() if needs_dense else X_train
    X_te = X_test.toarray() if needs_dense else X_test

    # Train & predict
    clf.fit(X_tr, y_train)
    preds = clf.predict(X_te)

    # Report (zero_division=0 reports 0 for undefined metrics rather than warning)
    print(f"\n=== {name} ===")
    print(classification_report(y_test, preds, zero_division=0))

In [None]:
# Draw one annotated confusion-matrix heatmap per fitted classifier
for name, clf in models.items():
    # Models listed in _dense_needed get a dense copy of the test features
    X_te = X_test.toarray() if name in _dense_needed else X_test

    # Predict on the held-out set
    y_pred = clf.predict(X_te)

    # Confusion matrix with rows and columns in the classifier's label order
    labels = clf.classes_
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    cm_df = pd.DataFrame(cm, index=labels, columns=labels)

    # Heatmap: x axis = predicted label, y axis = actual label
    heatmap_options = dict(
        colorscale="Blues",
        showscale=True,
        reversescale=True,
    )
    fig = ff.create_annotated_heatmap(
        z=cm_df.values,
        x=list(labels),
        y=list(labels),
        **heatmap_options
    )

    layout_options = dict(
        title=f"{name} Confusion Matrix",
        xaxis_title="Predicted",
        yaxis_title="Actual",
        width=600,
        height=500,
    )
    fig.update_layout(**layout_options)
    fig.show(renderer="iframe_connected")