In [25]:
import pandas as pd

In [26]:
df = pd.read_csv("/content/Financial_dataset.csv",encoding='latin1',names=['label','text'])

In [27]:
df.head()

Unnamed: 0,label,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [28]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
neutral,2879
positive,1363
negative,604


In [29]:
from nltk.corpus import wordnet
import random
import pandas as pd
from sklearn.utils import resample
import nltk
nltk.download('wordnet')

def synonym_replacement(text, n=2):
    words = text.split()
    new_words = words.copy()
    for _ in range(n):
        word_idx = random.randint(0, len(words) - 1)
        synonyms = wordnet.synsets(words[word_idx])
        if synonyms:
            synonym = synonyms[0].lemmas()[0].name()
            new_words[word_idx] = synonym
    return ' '.join(new_words)

def augment_data(df, label, target_size):
    augmented_texts = []
    original_texts = df[df['label'] == label]['text'].tolist()

    while len(augmented_texts) + len(original_texts) < target_size:
        for text in original_texts:
            augmented_texts.append(synonym_replacement(text))
            if len(augmented_texts) + len(original_texts) >= target_size:
                break

    return pd.DataFrame({'text': augmented_texts, 'label': [label] * len(augmented_texts)})

df_positive_aug = augment_data(df, 'positive', target_size=2879)
df_negative_aug = augment_data(df, 'negative', target_size=2879)

df_balanced = pd.concat([df[df['label'] == 'neutral'], df[df['label'] == 'positive'], df_positive_aug, df[df['label'] == 'negative'], df_negative_aug])

df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

print(df_balanced['label'].value_counts())

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


label
neutral     2879
negative    2879
positive    2879
Name: count, dtype: int64


In [30]:
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [31]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [32]:
df_balanced['text'] = df_balanced['text'].apply(clean_text)

In [34]:
label_encoder = LabelEncoder()
df_balanced['label'] = label_encoder.fit_transform(df_balanced['label'])

In [36]:
X_train, X_test, y_train, y_test = train_test_split(df_balanced['text'], df_balanced['label'], test_size=0.2, random_state=42)

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

lr_model = LogisticRegression()
lr_model.fit(X_train_tfidf, y_train)

y_pred = lr_model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8645833333333334
              precision    recall  f1-score   support

           0       0.90      0.96      0.93       568
           1       0.84      0.83      0.84       592
           2       0.84      0.80      0.82       568

    accuracy                           0.86      1728
   macro avg       0.86      0.87      0.86      1728
weighted avg       0.86      0.86      0.86      1728



In [44]:
import pickle

with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf_vectorizer, f)

with open("logistic_regression.pkl", "wb") as f:
    pickle.dump(lr_model, f)

print("Models saved successfully!")

Models saved successfully!
