<a href="https://colab.research.google.com/github/shoaib247964/-Financial-PhraseBank-Sentiment-Analysis-for-Financial-News-/blob/main/Sentiment_Analysis_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Data Loading and Preprocessing
def load_data():
    # Load Financial PhraseBank dataset
    # Note: Actual dataset loading may vary based on file format
    df = pd.read_csv('/content/financial_phrasebank.csv', encoding='latin1')
    df.columns = ['text', 'label']

    # Map labels to numeric values
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    df['label'] = df['label'].map(label_map)

    return df

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize
    words = text.split()

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

# Traditional ML Model (Logistic Regression with TF-IDF)
def train_lr_model(df):
    # Preprocess all texts
    df['processed_text'] = df['text'].apply(preprocess_text)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        df['processed_text'], df['label'], test_size=0.2, random_state=42
    )

    # TF-IDF Vectorization
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Train Logistic Regression
    lr = LogisticRegression(max_iter=1000)
    lr.fit(X_train_tfidf, y_train)

    # Evaluate
    y_pred = lr.predict(X_test_tfidf)
    print(classification_report(y_test, y_pred))

    return lr, vectorizer

# LSTM Model
def train_lstm_model(df):
    # Preprocess all texts
    df['processed_text'] = df['text'].apply(preprocess_text)

    # Tokenization
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(df['processed_text'])

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        df['processed_text'], df['label'], test_size=0.2, random_state=42
    )

    # Convert texts to sequences
    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_test_seq = tokenizer.texts_to_sequences(X_test)

    # Pad sequences
    max_len = 100
    X_train_pad = tf.keras.preprocessing.sequence.pad_sequences(X_train_seq, maxlen=max_len)
    X_test_pad = tf.keras.preprocessing.sequence.pad_sequences(X_test_seq, maxlen=max_len)

    # Build LSTM model
    model = Sequential([
        Embedding(input_dim=5000, output_dim=128, input_length=max_len),
        LSTM(64, dropout=0.2, recurrent_dropout=0.2),
        Dense(3, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    # Train model
    model.fit(
        X_train_pad, y_train,
        batch_size=32,
        epochs=5,
        validation_data=(X_test_pad, y_test)
    )

    return model, tokenizer

    # Model selection
    model_option = st.selectbox(
        "Select Model",
        ["Logistic Regression", "LSTM", "FinBERT"]
    )

    model, tokenizer_or_vectorizer = models[model_option]

    # Text input
    user_input = st.text_area("Enter financial news headline:", "")

    if st.button("Analyze Sentiment"):
        if user_input:
            # Preprocess input
            processed_text = preprocess_text(user_input)

            # Predict based on selected model
            if model_option == "Logistic Regression":
                # Vectorize input
                input_vec = tokenizer_or_vectorizer.transform([processed_text])
                prediction = model.predict(input_vec)[0]
            elif model_option == "LSTM":
                # Tokenize and pad input
                input_seq = tokenizer_or_vectorizer.texts_to_sequences([processed_text])
                input_pad = tf.keras.preprocessing.sequence.pad_sequences(input_seq, maxlen=100)
                prediction = np.argmax(model.predict(input_pad)[0])
            else:  # FinBERT
                # FinBERT implementation would go here
                pass

            # Map prediction to label
            label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}
            sentiment = label_map.get(prediction, 'neutral')

            # Display result
            st.subheader("Sentiment Analysis Result")
            st.write(f"Headline: {user_input}")
            st.write(f"Predicted Sentiment: {sentiment}")

            # Visual indicator
            if sentiment == 'positive':
                st.success("✅ Positive sentiment detected")
            elif sentiment == 'negative':
                st.error("❌ Negative sentiment detected")
            else:
                st.info("🔵 Neutral sentiment detected")
        else:
            st.warning("Please enter a news headline to analyze")

def main():
    # Load and prepare data
    df = load_data()

    # Train models (in a real app, you'd load pre-trained models)
    st.write("Training models... (this may take a few minutes)")
    lr_model, tfidf_vectorizer = train_lr_model(df)
    lstm_model, tokenizer = train_lstm_model(df)

    # For FinBERT, we would load a pre-trained model:
    # finbert_model = TFBertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone')
    # finbert_tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

    models = {
        "Logistic Regression": (lr_model, tfidf_vectorizer),
        "LSTM": (lstm_model, tokenizer),
        # "FinBERT": (finbert_model, finbert_tokenizer)
    }



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [8]:
!pip install flask-ngrok flask nltk pandas scikit-learn tensorflow



In [11]:
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip

--2025-03-26 10:54:04--  https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
Resolving bin.equinox.io (bin.equinox.io)... 75.2.60.68, 99.83.220.108, 35.71.179.82, ...
Connecting to bin.equinox.io (bin.equinox.io)|75.2.60.68|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13921656 (13M) [application/octet-stream]
Saving to: ‘ngrok-stable-linux-amd64.zip’


2025-03-26 10:54:09 (21.8 MB/s) - ‘ngrok-stable-linux-amd64.zip’ saved [13921656/13921656]

Archive:  ngrok-stable-linux-amd64.zip
  inflating: ngrok                   


In [12]:
!./ngrok authtoken YOUR_AUTH_TOKEN_HERE

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


In [2]:
!fuser -k 5000/tcp



In [1]:
!pip install pandas scikit-learn nltk

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import random

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Sample data in case file loading fails
sample_data = {
    'text': [
        "Profit increased by 20% this quarter",
        "Company reports major losses",
        "Board announces new CEO",
        "Stocks reach all time high",
        "Market crashes amid economic crisis"
    ],
    'label': ['positive', 'negative', 'neutral', 'positive', 'negative']
}

# Try loading data or use sample data
try:
    from google.colab import files
    uploaded = files.upload()
    file_name = next(iter(uploaded))
    df = pd.read_csv(io.StringIO(uploaded[file_name].decode('utf-8')))
    if len(df.columns) >= 2:
        df = df.iloc[:, :2]
        df.columns = ['text', 'label']
    print("Loaded CSV file successfully!")
except:
    df = pd.DataFrame(sample_data)
    print("Using sample data instead")

# Preprocessing function
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = text.split()
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# Preprocess data
df['processed_text'] = df['text'].apply(preprocess_text)

# Train model (or use random if training fails)
try:
    vectorizer = TfidfVectorizer(max_features=5000)
    X = vectorizer.fit_transform(df['processed_text'])

    # Convert labels to numeric
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    y = df['label'].map(label_map)

    model = LogisticRegression(max_iter=1000)
    model.fit(X, y)
    print("Model trained successfully!")
except:
    print("Model training failed - using random predictions")
    model = None

# Prediction function
def predict_sentiment(text):
    if model:
        processed = preprocess_text(text)
        vec = vectorizer.transform([processed])
        pred = model.predict(vec)[0]
        return {0: 'negative', 1: 'neutral', 2: 'positive'}[pred]
    else:
        return random.choice(['positive', 'negative', 'neutral'])

# Test with some examples
test_headlines = [
    "Company profits soar to record high",
    "Stocks plummet amid banking crisis",
    "CEO announces quarterly results",
    "Market shows mixed signals today",
    "New product launch delayed indefinitely"
]

print("\nSample Predictions:")
for headline in test_headlines:
    print(f"'{headline}' -> {predict_sentiment(headline)}")





[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Saving financial_phrasebank.csv to financial_phrasebank (1).csv
Using sample data instead
Model trained successfully!

Sample Predictions:
'Company profits soar to record high' -> positive
'Stocks plummet amid banking crisis' -> negative
'CEO announces quarterly results' -> positive
'Market shows mixed signals today' -> negative
'New product launch delayed indefinitely' -> positive


In [2]:
!pip install transformers torch sentencepiece

import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
import pandas as pd
import numpy as np

# Load FinBERT model (financial domain-specific BERT)
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

# Create sentiment analysis pipeline
nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)

# Sample financial news headlines
headlines = [
    "Apple shares hit record high after strong earnings report",
    "Tesla stock plunges 20% following production delays",
    "Fed announces interest rates will remain unchanged",
    "Bank of America reports 15% drop in quarterly profits",
    "Amazon acquires robotics startup for $1.2 billion"
]

# Classify each headline
for headline in headlines:
    result = nlp(headline)
    print(f"Headline: {headline}")
    print(f"Sentiment: {result[0]['label']} (confidence: {result[0]['score']:.2f})")
    print("---")

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Device set to use cpu


Headline: Apple shares hit record high after strong earnings report
Sentiment: Positive (confidence: 1.00)
---
Headline: Tesla stock plunges 20% following production delays
Sentiment: Negative (confidence: 1.00)
---
Headline: Fed announces interest rates will remain unchanged
Sentiment: Neutral (confidence: 1.00)
---
Headline: Bank of America reports 15% drop in quarterly profits
Sentiment: Negative (confidence: 1.00)
---
Headline: Amazon acquires robotics startup for $1.2 billion
Sentiment: Neutral (confidence: 1.00)
---


In [3]:
!pip install transformers torch sentencepiece pandas scikit-learn

import torch
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, f1_score, accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

# Load FinBERT model
model_name = 'yiyanghkust/finbert-tone'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

# Create pipeline
nlp = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Sample dataset (replace with your FinancialPhraseBank)
data = {
    'text': [
        "Profit increased by 20% this quarter",
        "Company reports major losses this year",
        "Stable market conditions observed",
        "Dividend payments suspended indefinitely",
        "Revenue growth exceeds expectations",
        "Layoffs announced across all departments",
        "No significant changes in fiscal policy",
        "Merger deal falls through at last minute"
    ],
    'label': ['positive', 'negative', 'neutral', 'negative',
              'positive', 'negative', 'neutral', 'negative']
}
df = pd.DataFrame(data)

# Convert labels to FinBERT's format
label_map = {'positive': 'Positive', 'negative': 'Negative', 'neutral': 'Neutral'}
df['true_label'] = df['label'].map(label_map)

# Get predictions
def get_prediction(text):
    result = nlp(text)[0]
    return result['label']

df['predicted_label'] = df['text'].apply(get_prediction)

# Calculate metrics
print("\nClassification Report:")
print(classification_report(df['true_label'], df['predicted_label'],
                           target_names=['Negative', 'Neutral', 'Positive']))

f1 = f1_score(df['true_label'], df['predicted_label'],
              average='weighted')
accuracy = accuracy_score(df['true_label'], df['predicted_label'])

print(f"\nWeighted F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")

# Confusion Matrix
print("\nConfusion Matrix:")
print(pd.crosstab(df['true_label'], df['predicted_label'],
                 rownames=['Actual'], colnames=['Predicted']))



Device set to use cpu



Classification Report:
              precision    recall  f1-score   support

    Negative       1.00      0.50      0.67         4
     Neutral       0.33      0.50      0.40         2
    Positive       0.67      1.00      0.80         2

    accuracy                           0.62         8
   macro avg       0.67      0.67      0.62         8
weighted avg       0.75      0.62      0.63         8


Weighted F1 Score: 0.6333
Accuracy: 0.6250

Confusion Matrix:
Predicted  Negative  Neutral  Positive
Actual                                
Negative          2        2         0
Neutral           0        1         1
Positive          0        0         2


In [6]:
!pip install transformers torch sentencepiece pandas scikit-learn

import torch
import pandas as pd
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

# Load FinBERT with error handling
try:
    tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
    model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone')
    print("FinBERT loaded successfully!")
except Exception as e:
    print(f"Error loading model: {e}")
    raise

# Initialize pipeline with optimizations
nlp = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
    truncation=True,
    padding=True,
    max_length=512  # Handle longer financial texts
)

# Load your dataset (replace with actual data)
df = pd.read_csv('/content/financial_phrasebank.csv')
df.columns = ['text', 'label']  # Ensure correct column names

# Map labels to FinBERT's format, handling potential missing keys
label_map = {
    'negative': 'Negative',
    'neutral': 'Neutral',
    'positive': 'Positive'
}
df['true_label'] = df['label'].map(label_map).fillna('Neutral')  # Fill unmapped values with 'Neutral'


# Batch prediction for efficiency
def batch_predict(texts, batch_size=8):
    results = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i : i + batch_size]
        results.extend(nlp(batch))
    return [r['label'] for r in results]


df['predicted_label'] = batch_predict(df['text'].tolist())

# Evaluation
report = classification_report(
    df['true_label'],
    df['predicted_label'],
    target_names=['Negative', 'Neutral', 'Positive'],
)
print("Classification Report:")
print(report)

# Calculate accuracy
accuracy = (df['true_label'] == df['predicted_label']).mean()
print(f"\nAccuracy: {accuracy:.2%}")

# Filter low-confidence predictions (optional)
df['confidence'] = [r['score'] for r in nlp(df['text'].tolist())]
high_conf_df = df[df['confidence'] > 0.90]  # Only keep 90%+ confident predictions
high_conf_accuracy = (high_conf_df['true_label'] == high_conf_df['predicted_label']).mean()
print(f"High-Confidence Accuracy (threshold=0.90): {high_conf_accuracy:.2%}")



Device set to use cpu


FinBERT loaded successfully!


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report:
              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00         0
     Neutral       1.00      0.66      0.80      2264
    Positive       0.00      0.00      0.00         0

    accuracy                           0.66      2264
   macro avg       0.33      0.22      0.27      2264
weighted avg       1.00      0.66      0.80      2264


Accuracy: 66.48%
High-Confidence Accuracy (threshold=0.90): 67.51%


In [10]:
!pip install transformers torch sentencepiece pandas scikit-learn
!pip install transformers torch sentencepiece pandas scikit-learn

import pandas as pd
import numpy as np
from transformers import pipeline
from sklearn.metrics import classification_report

# Load FinBERT
nlp = pipeline(
    "text-classification",
    model="yiyanghkust/finbert-tone",
    return_all_scores=True
)

# Load and prepare data
df = pd.read_csv("/content/financial_phrasebank.csv")
df.columns = ["text", "label"]  # Ensure correct column names

# Convert labels to FinBERT's expected format
label_mapping = {
    "negative": "Negative",
    "neutral": "Neutral",
    "positive": "Positive"
}

# Clean and standardize labels
df["true_label"] = (df["label"].astype(str)
                    .str.lower()
                    .str.strip()
                    .map(label_mapping))

# Handle any remaining missing/unknown labels
df = df[df["true_label"].isin(["Negative", "Neutral", "Positive"])]

# Prediction function with confidence threshold
def predict_with_confidence(text, threshold=0.7):
    try:
        results = nlp(text, truncation=True, max_length=512)[0]
        top_pred = max(results, key=lambda x: x["score"])
        return top_pred["label"] if top_pred["score"] >= threshold else "Neutral"
    except:
        return "Neutral"  # Fallback for errors

# Get predictions
df["predicted"] = df["text"].apply(predict_with_confidence)

# Ensure we only compare aligned labels
valid_labels = ["Negative", "Neutral", "Positive"]
df = df[df["predicted"].isin(valid_labels) & df["true_label"].isin(valid_labels)]

# Evaluation
if not df.empty:
    print(classification_report(
        df["true_label"],
        df["predicted"],
        target_names=valid_labels,
        digits=4
    ))

    accuracy = (df["true_label"] == df["predicted"]).mean()
    print(f"Accuracy: {accuracy:.2%}")
else:
    print("No valid samples for evaluation!")



Device set to use cpu


No valid samples for evaluation!




In [11]:
print("\nTrue Label Distribution:")
print(df["true_label"].value_counts())

print("\nPredicted Label Distribution:")
print(df["predicted"].value_counts())


True Label Distribution:
Series([], Name: count, dtype: int64)

Predicted Label Distribution:
Series([], Name: count, dtype: int64)


In [12]:
print("Original label distribution:")
print(df["label"].value_counts())

Original label distribution:
Series([], Name: count, dtype: int64)


In [13]:
custom_mapping = {
    "bearish": "Negative",
    "bullish": "Positive",
    "neutral": "Neutral",
    # Add your specific labels here
}

In [14]:
def batch_predict(texts, batch_size=8):
    return [predict_with_confidence(text) for text in texts]

df["predicted"] = batch_predict(df["text"].tolist())

In [15]:
print(df["true_label"].value_counts())

Series([], Name: count, dtype: int64)


In [21]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [22]:
import pandas as pd
from transformers import InputExample, InputFeatures, Trainer, TrainingArguments
import torch

# Load Data
# Ensure df is loaded correctly before proceeding
if df.empty:
    print("Warning: DataFrame 'df' is empty! Using fallback sample data.")
    sample_data = {
        'text': [
            "Profit increased by 20% this quarter",
            "Company reports major losses this year",
            "Stock prices are soaring due to market trends"
        ],
        'label': [2, 0, 2]  # 0: negative, 1: neutral, 2: positive
    }
    df = pd.DataFrame(sample_data)

# Convert DataFrame to a list of InputExamples
train_examples = [
    InputExample(guid=index, text_a=row['text'], label=row['label'])
    for index, row in df.iterrows()
]

# Ensure tokenizer is defined
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Convert examples to InputFeatures
def convert_examples_to_features(examples, tokenizer, max_length=128):
    features = []
    for example in examples:
        inputs = tokenizer(example.text_a,
                           add_special_tokens=True,
                           max_length=max_length,
                           truncation=True,
                           padding='max_length')
        features.append(InputFeatures(input_ids=inputs['input_ids'],
                                      attention_mask=inputs['attention_mask'],
                                      label=example.label))
    return features

train_features = convert_examples_to_features(train_examples, tokenizer)

# Convert list of InputFeatures to PyTorch Dataset
class FinancialDataset(torch.utils.data.Dataset):
    def __init__(self, features):
        self.input_ids = torch.tensor([f.input_ids for f in features])
        self.attention_mask = torch.tensor([f.attention_mask for f in features])
        self.labels = torch.tensor([f.label for f in features])

    def __getitem__(self, idx):
        return {'input_ids': self.input_ids[idx],
                'attention_mask': self.attention_mask[idx],
                'labels': self.labels[idx]}

    def __len__(self):
        return len(self.labels)

train_dataset = FinancialDataset(train_features)

# Load Pretrained Model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

# Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

# Start Training
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


TrainOutput(global_step=3, training_loss=1.1032311121622722, metrics={'train_runtime': 53.8889, 'train_samples_per_second': 0.111, 'train_steps_per_second': 0.056, 'total_flos': 394670126592.0, 'train_loss': 1.1032311121622722, 'epoch': 3.0})

In [28]:
df = pd.read_csv("/content/financial_phrasebank.csv", encoding='ISO-8859-1')

print("🔍 First 5 rows of the dataset:")
print(df.head())  # Check if the data is loaded correctly
print("\n🔍 Dataset Info:")
print(df.info())  # Check if columns exist and their types


🔍 First 5 rows of the dataset:
                                               label  sentence
0  According to Gran , the company has no plans t...         1
1  For the last quarter of 2010 , Componenta 's n...         2
2  In the third quarter of 2010 , net sales incre...         2
3  Operating profit rose to EUR 13.1 mn from EUR ...         2
4  Operating profit totalled EUR 21.1 mn , up fro...         2

🔍 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 2264 entries, 0 to 2263
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   label     2264 non-null   object
 1   sentence  2264 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 53.1+ KB
None


In [33]:
!pip install transformers torch sentencepiece pandas scikit-learn wandb






In [2]:
# Install necessary libraries


import torch
import pandas as pd
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
import numpy as np

# 1. Load Data Properly
df = pd.read_csv("/content/financial_phrasebank.csv")
df.columns = ["text", "label"]  # Ensure correct column names

# Check if labels are numeric or text-based
print("Label Data Type:", df['label'].dtype)
print("Unique Labels:", df['label'].unique())

# Convert labels to consistent format if necessary
label_map = {
    'negative': 0,
    'neutral': 1,
    'positive': 2
}

# If labels are numeric, no need to map
if df['label'].dtype == 'int64':
    print("✅ Labels are already numeric, skipping mapping.")
else:
    df['label'] = df['label'].astype(str).map(label_map)

# Drop NaN values if any labels failed to map
df = df.dropna()

# 2. Train/Test Split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    stratify=df['label']
)

# 3. Tokenization
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

train_encodings = tokenizer(
    train_texts,
    truncation=True,
    padding=True,
    max_length=128
)

test_encodings = tokenizer(
    test_texts,
    truncation=True,
    padding=True,
    max_length=128
)

# 4. Dataset Class
class FinancialDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {
            key: torch.tensor(val[idx])
            for key, val in self.encodings.items()
        }
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = FinancialDataset(train_encodings, train_labels)
test_dataset = FinancialDataset(test_encodings, test_labels)

# 5. Model Setup
model = BertForSequenceClassification.from_pretrained(
    'yiyanghkust/finbert-tone',
    num_labels=3,
    ignore_mismatched_sizes=True  # Fixes classifier size mismatch
)

# 6. Training Configuration
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none"
)

# 7. Metrics
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {
        'accuracy': accuracy_score(p.label_ids, preds),
        'f1': f1_score(p.label_ids, preds, average='weighted')
    }

# 8. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# 9. Fine-Tuning
trainer.train()

# 10. Evaluation
results = trainer.evaluate()
print("\nFinal Evaluation Results:")
print(f"Accuracy: {results['eval_accuracy']:.2%}")
print(f"F1 Score: {results['eval_f1']:.2%}")

# 11. Classification Report
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=1)
print("\nClassification Report:")
print(classification_report(
    test_labels,
    preds,
    target_names=['Negative', 'Neutral', 'Positive']
))


Label Data Type: int64
Unique Labels: [1 2 0]
✅ Labels are already numeric, skipping mapping.




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1698,0.225169,0.94702,0.947777
2,0.0885,0.191086,0.951435,0.950911
3,0.0754,0.178964,0.977925,0.977811
4,0.0039,0.153277,0.977925,0.977843
5,0.0,0.1585,0.977925,0.977843



Final Evaluation Results:
Accuracy: 97.79%
F1 Score: 97.78%

Classification Report:
              precision    recall  f1-score   support

    Negative       0.95      0.95      0.95        61
     Neutral       0.99      1.00      0.99       278
    Positive       0.96      0.95      0.96       114

    accuracy                           0.98       453
   macro avg       0.97      0.96      0.97       453
weighted avg       0.98      0.98      0.98       453



In [3]:
import os

# Check if file exists in current directory
if "finbert_model.pth" in os.listdir():
    print("✅ Model file found!")
else:
    print("❌ Model file NOT found! Upload it first.")


❌ Model file NOT found! Upload it first.


In [4]:
import torch

# Save trained model
model_path = "finbert_model.pth"
torch.save(model.state_dict(), model_path)

print(f"✅ Model saved as {model_path}")


✅ Model saved as finbert_model.pth


In [5]:
from google.colab import files
files.upload()  # Manually select and upload `finbert_model.pth`


{}

In [6]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Model aur Tokenizer Load Karna
model_path = "finbert_model.pth"  # Yeh ensure karo ke yeh correct path ho
model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
model.eval()

tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

print("✅ Model Successfully Loaded!")


✅ Model Successfully Loaded!


In [8]:
!pip install gradio


Collecting gradio
  Downloading gradio-5.23.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 

In [9]:
import gradio as gr

# Prediction Function
def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        output = model(**inputs)
    prediction = torch.argmax(output.logits, dim=1).item()

    label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    return label_map[prediction]

# Gradio Interface
iface = gr.Interface(fn=predict_sentiment, inputs="text", outputs="text")
iface.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://4f6c560bcf518fea33.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [10]:
!apt install git


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.12).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.


In [11]:
!git config --global user.email "muhammadshoaib1194@gmail.com"
!git config --global user.name "Muhammad Shoaib"


In [16]:
!git clone https://github.com/username/repository.git


Cloning into 'repository'...
fatal: could not read Username for 'https://github.com': No such device or address


In [15]:
!mv myfile.ipynb repository/


mv: cannot stat 'myfile.ipynb': No such file or directory


In [14]:
!git push origin main


fatal: not a git repository (or any of the parent directories): .git
