In [None]:
# Mount Google Drive at /content/drive; the later cells read and write their
# CSV files under '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import Dataset

In [None]:
import pandas as pd
# Load the filtered dataset from Google Drive
# (Drive must already be mounted at /content/drive). No date parsing is
# requested here, so 'Post Date' is whatever dtype read_csv infers.
filtered_news_df = pd.read_csv('/content/drive/My Drive/filtered_news_df.csv')

# Check the loaded data
# (the preview shows the 'Trading Code', 'Post Date' and 'News' columns).
print(filtered_news_df.head())


  Trading Code   Post Date                                               News
0     BRACBANK  2009-11-26  As per un-audited quarterly accounts for the 3...
1   PUBALIBANK  2009-11-26  In response to a DSE query dated 25.11.09, the...
2     BXPHARMA  2009-11-26  As per un-audited quarterly accounts for the 3...
3    POWERGRID  2009-11-25  SEC has sought clarification to the Company on...
4           GP  2009-11-24  The company has informed that the company has ...


# FinBERT

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

# Load the FinBERT model and tokenizer
# (both are resolved from the same 'yiyanghkust/finbert-tone' identifier so
# the vocabulary matches the classifier weights).
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone')

# Create a sentiment analysis pipeline
# `nlp` is the module-level callable used by apply_sentiment_analysis below;
# each call returns a list of dicts with 'label' and 'score' keys.
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Function to apply sentiment analysis and adjust thresholds
def apply_sentiment_analysis(text, max_length=512):
    """Classify one news text with the module-level `nlp` pipeline.

    The pipeline's top prediction is used as the label. When that label is
    'Neutral' with a score of at least 0.6, a keyword check may override it:
    "profit", "increase" or "growth" gives 'Positive'; otherwise "loss",
    "decrease" or "decline" gives 'Negative'. Matching is case-insensitive
    substring matching, and the positive keywords take precedence.

    Args:
        text: The news text to classify. Non-string values (e.g. NaN or None
            from a missing cell) are not classified.
        max_length: Maximum number of tokens passed to the model; longer
            texts are truncated instead of raising inside the pipeline.

    Returns:
        The label string ('Positive', 'Neutral' or 'Negative'), or None when
        the input is not a string or the pipeline raises.
    """
    # Missing cells arrive as float NaN / None. Bail out before the model is
    # touched so the error handler below never has to slice a non-string
    # (which would raise and abort the surrounding DataFrame.apply).
    if not isinstance(text, str):
        return None

    try:
        # Truncate to the model's input limit; without this, over-long texts
        # raise inside the pipeline and end up recorded as None.
        result = nlp(text, truncation=True, max_length=max_length)[0]
        label = result['label']
        score = result['score']

        # Keyword override for confidently-neutral predictions.
        # NOTE(review): the override fires when the Neutral score is HIGH
        # (>= 0.6), not when it is low - confirm this is the intended direction.
        if label == 'Neutral' and score >= 0.6:
            lowered = text.lower()
            if any(word in lowered for word in ("profit", "increase", "growth")):
                label = 'Positive'
            elif any(word in lowered for word in ("loss", "decrease", "decline")):
                label = 'Negative'
        return label
    except Exception as e:
        # Best-effort: log and mark the row as unclassified rather than
        # aborting the whole batch.
        print(f"Error processing text: {text[:100]}...; Error: {e}")
        return None

# Label every article: run the classifier over the 'News' column and store the
# result in a new 'Sentiment' column (.loc[] avoids SettingWithCopyWarning)
filtered_news_df.loc[:, 'Sentiment'] = filtered_news_df['News'].apply(apply_sentiment_analysis)

# Numeric encoding for the three capitalised labels
sentiment_mapping = dict(Positive=1, Neutral=0, Negative=-1)

# Translate labels into scores; anything absent from the mapping becomes NaN
filtered_news_df.loc[:, 'Sentiment_Score'] = filtered_news_df['Sentiment'].map(sentiment_mapping)

# Surface the rows that ended up without a score so they can be inspected
nan_sentiments = filtered_news_df.loc[filtered_news_df['Sentiment_Score'].isna()]
if len(nan_sentiments) > 0:
    print("These rows have NaN sentiment scores and need to be inspected:")
    print(nan_sentiments.loc[:, ['Post Date', 'Trading Code', 'News', 'Sentiment']].head())

# Preview the labelled frame
print(filtered_news_df.loc[:, ['Post Date', 'Trading Code', 'Sentiment', 'Sentiment_Score']].head())

# Persist the labelled articles to Drive (row index omitted)
filtered_news_df.to_csv('/content/drive/My Drive/filtered_news_with_sentiment.csv', index=False)



    Post Date Trading Code Sentiment  Sentiment_Score
0  2009-11-26     BRACBANK  Positive                1
1  2009-11-26   PUBALIBANK   Neutral                0
2  2009-11-26     BXPHARMA  Positive                1
3  2009-11-25    POWERGRID   Neutral                0
4  2009-11-24           GP   Neutral                0


# Daily mean sentiment

In [None]:
# Average the per-article scores within each 'Post Date' and expose the result
# under a name that makes the aggregation explicit
daily_sentiment = (
    filtered_news_df
    .groupby('Post Date', as_index=False)['Sentiment_Score']
    .mean()
    .rename(columns={'Sentiment_Score': 'Daily_Sentiment_Score'})
)

# Preview the first few days
print(daily_sentiment.head())

# Write one row per day to Drive (row index omitted)
daily_sentiment.to_csv('/content/drive/My Drive/daily_sentiment_scores.csv', index=False)


    Post Date  Daily_Sentiment_Score
0  2007-01-25               0.333333
1  2007-01-28               1.000000
2  2007-01-29               0.000000
3  2007-01-31               0.500000
4  2007-02-01               0.000000


# Last known sentiment (forward fill)

In [None]:
import pandas as pd

# Load the per-day mean sentiment scores written by the daily-mean step
sentiment_df = pd.read_csv('/content/drive/My Drive/daily_sentiment_scores.csv')

# Ensure the 'Post Date' is in datetime format so it can be aligned with a
# calendar index below
sentiment_df['Post Date'] = pd.to_datetime(sentiment_df['Post Date'])

# Create a full daily date range from the minimum to maximum date in the data
full_date_range = pd.date_range(start=sentiment_df['Post Date'].min(),
                                end=sentiment_df['Post Date'].max())

# Reindex onto the full calendar; days without any news get a NaN score
sentiment_df = sentiment_df.set_index('Post Date').reindex(full_date_range).rename_axis('Post Date').reset_index()

# Carry the last known score forward across days without news.
# The result is assigned back to the column: calling
# fillna(method='ffill', inplace=True) on a column selection is deprecated and,
# under pandas Copy-on-Write, only fills a temporary copy, leaving
# sentiment_df unchanged.
sentiment_df['Daily_Sentiment_Score'] = sentiment_df['Daily_Sentiment_Score'].ffill()

# Display the updated sentiment DataFrame
print(sentiment_df.head())



   Post Date  Daily_Sentiment_Score
0 2007-01-25               0.333333
1 2007-01-26               0.333333
2 2007-01-27               0.333333
3 2007-01-28               1.000000
4 2007-01-29               0.000000


  sentiment_df['Daily_Sentiment_Score'].fillna(method='ffill', inplace=True)


In [None]:
# Persist the forward-filled daily sentiment series to Drive (row index omitted).
sentiment_df.to_csv('/content/drive/My Drive/updated_daily_sentiment_scores.csv', index=False)

In [2]:
# Shell escape: (over)write README.md in the current working directory with a
# single title heading line.
!echo "# Financial News Sentiment Analysis" > README.md

