In [1]:
import os
import glob
import pandas as pd
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Locate the inputs relative to the current working directory.
# os.path.join builds the paths instead of manual '/' concatenation.
Path_sentiment = os.path.join(os.getcwd(), 'Sentiment analysis')
Path_news_final_1 = os.getcwd()

# Load the semicolon-separated news table
news_final_1 = pd.read_csv(os.path.join(Path_news_final_1, 'news_API_final.csv'), sep = ';', encoding = 'utf-8')

# Collect the CSV files of the sentiment folder. os.listdir returns entries in
# arbitrary order, so the names are sorted to get the same row order on every run.
csv_files = sorted(file for file in os.listdir(Path_sentiment) if file.endswith('.csv'))

# Read every sentiment file into its own dataframe (this is a list of dataframes)
list_of_dfs = [pd.read_csv(os.path.join(Path_sentiment, csv_file)) for csv_file in csv_files]

# Stack the per-file dataframes into one; ignore_index gives the result a fresh
# 0..n-1 index instead of repeating each file's own row labels
sentiment_df = pd.concat(list_of_dfs, ignore_index=True)


In [4]:
# Malformed score strings that are treated as the neutral score 0
llm_score_typos = ['0 Ul', '0 Ve']

# Patch the typos and cast the column to 32-bit integers in one pass
sentiment_df['LLM_score'] = (
    sentiment_df['LLM_score']
    .replace(llm_score_typos, 0)
    .astype('int32')
)

# Show how often each score occurs
sentiment_df['LLM_score'].value_counts()

LLM_score
 0    92126
 1    55827
-1    32383
Name: count, dtype: int64

In [5]:
# Per (date, company): total, average and number of LLM scores, with the
# group keys turned back into ordinary columns
score_stats = ['sum', 'mean', 'count']
sentiment_agg = (
    sentiment_df
    .groupby(['date', 'company'])['LLM_score']
    .agg(score_stats)
    .reset_index()
)

# Inner join: keep only news rows that have aggregated sentiment for the same
# date and company
news_sent_merge = news_final_1.merge(sentiment_agg, how='inner', on=['date', 'company'])

In [11]:
# Load the ticker table and index it by company name
Path_tickers = os.getcwd()+'/TickerNames_2022.csv'
TickerNames_final = pd.read_csv(Path_tickers).set_index('Name')

# Lookup: company name -> ticker symbol
ticker_mapping = TickerNames_final['Symbol'].to_dict()

# Attach the ticker to every news row via the 'company' column
news_sent_merge['Ticker'] = news_sent_merge['company'].map(ticker_mapping)

# Parse the 'date' column into datetime values
news_sent_merge['date'] = pd.to_datetime(news_sent_merge['date'])

# Fixed offset of minus four hours that is added to every timestamp
gmt_minus_4 = timedelta(hours=-4)

def convert_to_gmt_minus_4(timestamp):
    """Return the calendar date of ``timestamp`` after adding ``gmt_minus_4``.

    Parameters
    ----------
    timestamp : datetime-like
        Value that supports ``+ timedelta`` and whose sum supports ``strftime``.

    Returns
    -------
    str
        The shifted timestamp formatted as ``YYYY-MM-DD``.
    """
    return (timestamp + gmt_minus_4).strftime("%Y-%m-%d")

# Derive the shifted calendar date for every news row
news_sent_merge['New_date'] = news_sent_merge['date'].apply(convert_to_gmt_minus_4)

# Keep the four output columns and give them their final names
# NOTE(review): nothing here collapses rows that share the same New_date and
# Ticker — confirm whether repeated (date, Ticker) rows are expected downstream.
selected_cols = ['New_date', 'Ticker', 'sum', 'mean']
colnames = ['date','Ticker','LLM_score_sum','LLM_score_mean']
news_sent_final = news_sent_merge[selected_cols].rename(columns=dict(zip(selected_cols, colnames)))


In [12]:
# Save news_sent_final as a semicolon-separated CSV file in the working directory.
# os.getcwd() has no trailing separator, so the target path is built with
# os.path.join rather than by appending the file name to the directory string.
news_sent_final.to_csv(os.path.join(os.getcwd(), 'News_sentiment_final.csv'), sep = ';', encoding = 'utf-8')

In [11]:
# Read the master dataframe
# NOTE(review): this folder is spelled 'Final Scripts' while the cell that saves
# Final_df writes to 'Final scripts' — confirm which spelling matches the
# directory on disk (matters on case-sensitive file systems).
Path_Master = os.getcwd()+'/Final Scripts/Data processing/'
Master_df = pd.read_csv(Path_Master+'Master_with_technicals_news.csv', encoding='utf-8')

# Keep one sentiment row per (date, Ticker) before joining, so repeated keys on
# the sentiment side cannot multiply the master rows. The first row per key wins.
merge_keys = ['date', 'Ticker']
news_sent_unique = news_sent_final.drop_duplicates(subset=merge_keys, keep='first')

# Left join: every master row is kept; rows without matching news get NaN scores
Final_df = pd.merge(Master_df, news_sent_unique, how='left', on=merge_keys)

# A missing score is set to 0 to indicate that there is no relevant news
score_cols = ['LLM_score_mean', 'LLM_score_sum']
Final_df[score_cols] = Final_df[score_cols].fillna(0)

# Remove any (date, Ticker) duplicates that remain in Final_df
Final_df = Final_df.drop_duplicates(subset=merge_keys, keep='first')

In [12]:
# Export Final_df as a semicolon-separated, UTF-8 encoded CSV (path is relative
# to the working directory)
# NOTE(review): the folder is spelled 'Final scripts' here but 'Final Scripts'
# where the master file is read — confirm the spelling on disk.
final_csv_path = 'Final scripts/Final_df.csv'
Final_df.to_csv(final_csv_path, sep=';', encoding='utf-8')