# Financial Sentiment Analysis

[Dataset source](https://www.kaggle.com/datasets/sbhatti/financial-sentiment-analysis)

In [None]:
# Start from a clean slate: delete any `tracing_database.json` left in the working
# directory by a previous run (`-f` keeps this silent when the file does not exist).
# NOTE(review): presumably this is the local store great-ai writes ground truth to - confirm.
!rm -f tracing_database.json
# Install great-ai into the kernel's environment; stdout is discarded to keep the cell output short.
%pip install great-ai > /dev/null

In [None]:
import pandas as pd

# Allow up to 30 rows and 30 columns to be rendered before pandas truncates the output.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 30)

# Load the dataset from the working directory and preview its first 30 rows.
data = pd.read_csv('data.csv')
data.head(30)

In [None]:
from great_ai import add_ground_truth, delete_ground_truth

# Pull the raw sentences (inputs) and their sentiment labels (expected outputs)
# out of the DataFrame as plain Python lists.
X, y = (data[column].to_list() for column in ('Sentence', 'Sentiment'))

# Hand the labelled samples to great-ai, requesting an 85% / 15% train/test split.
# NOTE(review): presumably the samples are tagged 'train'/'test' for later querying - confirm.
add_ground_truth(
    X,
    y,
    train_split_ratio=0.85,
    test_split_ratio=0.15,
)

In [None]:
from great_ai import query_ground_truth

# Fetch the stored ground-truth samples for each split, train first and then test.
train_split, test_split = (query_ground_truth(tag) for tag in ('train', 'test'))

In [None]:
from great_ai.utilities import clean, simple_parallel_map
import re
from great_ai import Trace

def normalize(text: str) -> str:
    """Reduce raw text to lowercase ASCII letters separated by single spaces.

    The text is first passed through great-ai's `clean` helper with ASCII
    conversion enabled and lowercased; afterwards every run of characters
    outside `a-z` (digits, punctuation, whitespace, ...) is collapsed into
    one space.

    Args:
        text: The raw input sentence.

    Returns:
        The normalised string, ready to be fed to the vectoriser.
    """
    non_letters = re.compile(r"[^a-z]+")
    lowered = clean(text, convert_to_ascii=True).lower()
    return non_letters.sub(" ", lowered)

# Normalise the input text of every sample; `simple_parallel_map` applies
# `normalize` to each element of the given list.
X_train = simple_parallel_map(
    normalize,
    [sample.input for sample in train_split],
)
X_test = simple_parallel_map(
    normalize,
    [sample.input for sample in test_split],
)

# The expected outputs (sentiment labels) are used as-is.
y_train = [sample.output for sample in train_split]
y_test = [sample.output for sample in test_split]

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer


def create_pipeline(random_state: int = 42) -> Pipeline:
    """Build an unfitted TF-IDF + linear SGD text-classification pipeline.

    The vectoriser uses word 1- to 3-grams with sublinear term-frequency
    scaling, ignoring terms that occur in fewer than 5 documents or in more
    than 30% of them. The classifier is an `SGDClassifier` with an
    elastic-net penalty.

    Args:
        random_state: Seed passed to `SGDClassifier` so that its shuffling of
            the training data - and therefore the fitted weights - is
            repeatable between runs.

    Returns:
        A new `Pipeline` whose steps are named `tfidfvectorizer` and
        `sgdclassifier` (the names `make_pipeline` derives from the classes).
    """
    return make_pipeline(
        TfidfVectorizer(min_df=5, max_df=0.3, ngram_range=(1, 3), sublinear_tf=True),
        SGDClassifier(
            max_iter=10000,
            tol=1e-4,
            penalty="elasticnet",
            random_state=random_state,
        ),
    )

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats

# Randomised hyper-parameter search over the classifier's regularisation settings.
# `scipy.stats.uniform(loc, scale)` samples from [loc, loc + scale], so:
#   - alpha    is drawn from [0.00005, 0.01005]
#   - l1_ratio is drawn from [0.5, 0.9]
# 150 candidates are each scored with 4-fold cross-validation on macro-averaged F1,
# using all available CPU cores.
optimisation_pipeline = RandomizedSearchCV(
    create_pipeline(),
    {
        "sgdclassifier__alpha": scipy.stats.uniform(0.00005, 0.01),
        "sgdclassifier__l1_ratio": scipy.stats.uniform(0.5, 0.4),
    },
    cv=4,
    n_iter=150,
    verbose=1,
    scoring='f1_macro',
    n_jobs=-1,
    # Fix the seed used to sample candidates so the same 150 settings are
    # evaluated on every Restart & Run All.
    random_state=42,
)

optimisation_pipeline.fit(X_train, y_train)

# Show the 20 best candidates, best rank first.
results = pd.DataFrame(optimisation_pipeline.cv_results_)
results.sort_values("rank_test_score").head(20)

In [None]:
# Retrain on the whole training split with the best hyper-parameters found by the
# search, but with a larger iteration budget and a tighter stopping tolerance
# than the ones used while searching.
final_params = {
    **optimisation_pipeline.best_params_,
    "sgdclassifier__max_iter": 100000,
    "sgdclassifier__tol": 1e-5,
}

model = create_pipeline()
model.set_params(**final_params)
model.fit(X_train, y_train)

In [None]:
import matplotlib.pyplot as plt
from sklearn import metrics

%matplotlib inline
plt.rcParams["figure.figsize"] = (10, 10)
plt.rcParams["font.size"] = 16

y_predicted = model.predict(X_test)

print(metrics.classification_report(y_test, y_predicted))
metrics.ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=y_predicted,
    xticks_rotation="vertical",
    normalize="pred",
    values_format=".2f",
)
None

In [None]:
# List, for every class, how many features carry a non-zero weight and which
# (up to 15) features have the largest weights.
vectorizer = model.named_steps["tfidfvectorizer"]
classifier = model.named_steps["sgdclassifier"]
features = vectorizer.get_feature_names_out()

# Indexing `coef_` by class position assumes one weight row per class, which
# scikit-learn provides when there are more than two classes.
for i, name in enumerate(classifier.classes_):
    weight = classifier.coef_[i]
    non_zero_count = (weight != 0).sum()

    print(f'There are {non_zero_count} features for the `{name}` class.')

    # Sort by weight, largest first, and stop at the first zero weight so that
    # features dropped by the regularisation are not listed.
    for w, f in sorted(zip(weight, features), reverse=True)[:15]:
        if w == 0:
            break
        print(f"  {f}: {w:.4f}")

    print()

In [None]:
def predict(text: str):
    """Classify a piece of text and explain the decision.

    The text is normalised, vectorised with the fitted TF-IDF step and passed
    to the fitted classifier. The explanation ranks the features present in
    the text by their contribution (TF-IDF value times the predicted class's
    weight for that feature) towards the predicted class.

    Args:
        text: Raw input text.

    Returns:
        A `(prediction, explanation)` tuple where `prediction` is the
        predicted class label and `explanation` is a list of at most 10
        `(feature_name, contribution)` pairs, largest contribution first.
        Features whose contribution is exactly zero are left out.
    """
    vectorizer = model.named_steps["tfidfvectorizer"]
    classifier = model.named_steps["sgdclassifier"]

    tfidf_row = vectorizer.transform([normalize(text)])
    prediction = classifier.predict(tfidf_row)[0]

    # Weight vector of the predicted class (assumes one `coef_` row per class,
    # i.e. a problem with more than two classes).
    class_index = list(classifier.classes_).index(prediction)
    class_weights = classifier.coef_[class_index]
    feature_names = vectorizer.get_feature_names_out()

    # Only the entries stored in the sparse row can contribute, so walk those
    # instead of densifying the row and scanning the whole vocabulary.
    contributions = [
        (class_weights[column] * value, feature_names[column])
        for column, value in zip(tfidf_row.indices, tfidf_row.data)
    ]
    ranked = sorted(
        (pair for pair in contributions if pair[0] != 0),
        reverse=True,
    )

    explanation = [(feature_name, weight) for weight, feature_name in ranked[:10]]

    return prediction, explanation

# Try the model on a longer, multi-paragraph piece of text; the cell displays the
# predicted label together with the features that contributed most to it.
predict('''
    The last 12 months for Tesla shares have been fairly but positively volatile. 
    The stock is up in the past year, as it was trading at just under $700 per share back in early August 2021. 
    The share price spent much of late 2021 and early 2022 over the $1,000 mark. 
    Prices dipped below $1,000—and stayed there—starting in late April.

    I have been bearish on Tesla lately, owing to its elevated share price and its growing competition in the electric vehicle field.
    The competition part isn't changing much. 
    That's really only gotten worse thanks to most of the major automakers looking to get in on the market.

    However, Tesla's move to make its shares more reasonably priced should catch some attention. Thus, I'm moving to neutral on Tesla stock.
''')

In [None]:
# TODO: export the trained model so it can be reused outside this notebook.