In [1]:
import pandas as pd
import pendulum
from sklearn.preprocessing import MultiLabelBinarizer

In [270]:
# Load the weekly stock prices and the per-article sentiment scores.
stock_df = pd.read_csv("./stock_price_F.csv")

# Keep only the sentiment columns used downstream and replace missing class
# scores with 0. Building the frame in a single chain (instead of assigning
# into a column subset of the loaded frame) avoids pandas'
# SettingWithCopyWarning and leaves `sentiment_df` as an independent frame.
sentiment_score_columns = ["neutral", "positive", "negative"]
sentiment_df = (
    pd.read_csv("./SintimentOP.csv")
    .loc[:, ["Sentiment", *sentiment_score_columns, "TimeUpDated"]]
    .fillna({column: 0 for column in sentiment_score_columns})
)

In [271]:
def _week_start(date_text, date_format):
    """Parse `date_text` with `date_format` and return the start of its week."""
    return pendulum.from_format(date_text, date_format).start_of("week")


# Bucket both frames by week so they can later be joined on a common key.
# Only the first ten characters of the stock date (the YYYY-MM-DD part) are parsed.
stock_df["Date"] = stock_df["Date"].apply(
    lambda raw_date: _week_start(raw_date[0:10], "YYYY-MM-DD")
)
sentiment_df["Time"] = sentiment_df["TimeUpDated"].apply(
    lambda raw_date: _week_start(raw_date, "DD-MMM-YY")
)

In [272]:
# Collapse the sentiment rows to one row per week; every remaining column
# becomes a Python list holding that week's values.
weekly_sentiment_lists = sentiment_df.groupby("Time").agg(list).reset_index()

# Keep the week key plus the sentiment columns, and rename the key to "Date"
# so it matches the stock frame's join column.
weekly_sentiment_columns = ["Time", "Sentiment", "neutral", "positive", "negative"]
date_grouped_df = weekly_sentiment_lists[weekly_sentiment_columns].rename(
    columns={"Time": "Date"}
)

In [273]:
def _weekly_peak(scores):
    """Return the largest value of a list; pass non-list cells through unchanged."""
    if isinstance(scores, list):
        return max(scores)
    return scores


# Outer-join so weeks present in only one of the two frames are kept.
result = stock_df.merge(date_grouped_df, how="outer", on=["Date"])

# Summarise each week's score list by its maximum. Cells that are not lists
# (e.g. the missing values the outer join leaves for unmatched weeks) stay as-is.
for score_column in ("positive", "negative", "neutral"):
    result[score_column.upper()] = result[score_column].apply(_weekly_peak)

In [274]:
# Show the sizes of the two merge inputs, then discard the final row of the
# merged frame.
# NOTE(review): the notebook does not state why the last row is dropped — confirm.
# Re-running this cell removes one more row each time.
print(stock_df.shape, date_grouped_df.shape)
result = result.drop(index=result.tail(1).index)

(54, 8) (52, 5)


In [275]:
# Display the last three rows of the merged stock + sentiment frame.
result.tail(3)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Sentiment,neutral,positive,negative,POSITIVE,NEGATIVE,NEUTRAL
50,2023-12-18 00:00:00+00:00,11.99,12.46,11.79,12.35,244271600,0.0,0,"[{'negative': 0.9121586084365845, 'neutral': 0...","[0.9478311538696288, 0.948502779006958, 0.9452...","[0.958598792552948, 0.0, 0.9579848647117616, 0...","[0.9121586084365844, 0.9753839373588562, 0.975...",0.958599,0.976122,0.948503
51,2023-12-25 00:00:00+00:00,12.39,12.5,12.16,12.19,133775900,0.0,0,"[{'negative': 0.9754379391670227, 'neutral': 0...","[0.9478311538696288, 0.947828471660614, 0.9567...","[0.9525155425071716, 0.9577768445014954, 0.949...","[0.9754379391670228, 0.959048330783844, 0.9702...",0.957777,0.976017,0.956757
52,2024-01-01 00:00:00+00:00,12.04,12.38,11.63,11.85,216944500,0.0,0,,,,,,,


In [276]:
# Persist the merged stock + sentiment frame to the working directory.
# The frame's index is written too, since `index=False` is not passed.
result.to_csv("Ford_with_sentiment_scores.csv")

In [277]:
# Load the per-headline emotion labels, keep the date plus the two
# label/score pairs, and replace missing scores with 0.
# Building the frame in a single chain (instead of assigning into a column
# subset of the loaded frame) avoids pandas' SettingWithCopyWarning.
emotion_score_columns = ["score_1", "score_2"]
emotion_df = (
    pd.read_csv("./nehal/latest_content/fordnewstitle_cleaned_emotion.csv")
    .loc[:, ["Time", "label_1", "score_1", "label_2", "score_2"]]
    .fillna({column: 0 for column in emotion_score_columns})
)

In [278]:
def _emotion_week_start(raw_date):
    """Parse a DD-MMM-YY date string and return the start of its week."""
    return pendulum.from_format(raw_date, "DD-MMM-YY").start_of("week")


# Replace each headline date with the start of its week, the same bucketing
# applied to the sentiment dates earlier in the notebook.
emotion_df["Time"] = emotion_df["Time"].apply(_emotion_week_start)

In [279]:
# One row per week; each remaining column becomes a list of that week's values.
weekly_emotion_lists = emotion_df.groupby("Time").agg(list).reset_index()

# Only the two label columns are carried forward (the scores are dropped here).
# The week key is renamed to "Date" to match the stock/sentiment frame.
emotion_grouped = weekly_emotion_lists[["Time", "label_1", "label_2"]].rename(
    columns={"Time": "Date", "label_1": "primary", "label_2": "secondary"}
)

In [280]:
# Reduce each week's label list to its distinct labels. The order of the
# resulting list is whatever `set` iteration yields.
for label_column in ("primary", "secondary"):
    emotion_grouped[label_column] = emotion_grouped[label_column].apply(
        lambda week_labels: list(set(week_labels))
    )

In [281]:
# Binarizer used to one-hot encode the weekly label lists. It is re-fitted
# (via fit_transform) separately for the primary and the secondary labels
# below; its output is fed to DataFrame.sparse.from_spmatrix.
mlb = MultiLabelBinarizer(sparse_output=True)

In [282]:
# One-hot encode the weekly primary-emotion label lists.
#
# The indicator columns are prefixed with "primary_" at creation, using
# whatever classes the binarizer discovered. This way no raw label name
# (e.g. "neutral") ever appears as a column, and a label that is missing from
# a hand-written rename mapping cannot stay unprefixed and later collide with
# the secondary-emotion columns when those are joined.
#
# `pop` removes the "primary" list column from `emotion_grouped` in place, so
# this cell cannot be re-run without rebuilding `emotion_grouped` first.
primary_matrix = mlb.fit_transform(emotion_grouped.pop("primary"))
primary_onehot = pd.DataFrame.sparse.from_spmatrix(
    primary_matrix,
    index=emotion_grouped.index,
    # Read classes_ only after fit_transform has run.
    columns=[f"primary_{label}" for label in mlb.classes_],
)
emotion_grouped = emotion_grouped.join(primary_onehot)

In [283]:
# Give the primary-emotion indicator columns a "primary_" prefix so they stay
# distinguishable from the secondary-emotion columns added afterwards.
# Names in this list that are not present as columns are simply ignored by rename.
primary_emotion_labels = ("anger", "fear", "joy", "neutral", "sadness", "surprise")
emotion_grouped = emotion_grouped.rename(
    columns={label: f"primary_{label}" for label in primary_emotion_labels}
)

In [284]:
# One-hot encode the weekly secondary-emotion label lists.
#
# The indicator columns are prefixed with "secondary_" at creation, using
# whatever classes the binarizer discovered, so the join never introduces a
# raw label name as a column and does not depend on a hand-written rename
# mapping covering every label.
#
# `pop` removes the "secondary" list column from `emotion_grouped` in place,
# so this cell cannot be re-run without rebuilding `emotion_grouped` first.
secondary_matrix = mlb.fit_transform(emotion_grouped.pop("secondary"))
secondary_onehot = pd.DataFrame.sparse.from_spmatrix(
    secondary_matrix,
    index=emotion_grouped.index,
    # Read classes_ only after fit_transform has run.
    columns=[f"secondary_{label}" for label in mlb.classes_],
)
emotion_grouped = emotion_grouped.join(secondary_onehot)

In [285]:
# Give the secondary-emotion indicator columns a "secondary_" prefix.
# Names in this list that are not present as columns are simply ignored by rename.
secondary_emotion_labels = ("anger", "fear", "joy", "neutral", "sadness", "surprise")
emotion_grouped = emotion_grouped.rename(
    columns={label: f"secondary_{label}" for label in secondary_emotion_labels}
)

In [286]:
# Attach the weekly emotion indicators to the stock + sentiment frame.
# The outer join keeps weeks that appear in only one of the two frames.
final_df = result.merge(emotion_grouped, on=["Date"], how="outer")

In [288]:
# Build the prediction target: each row gets the "Close" value of the row that
# follows it. The last row has no successor, so it is dropped.
# NOTE(review): this pairs rows purely by position, so it assumes the rows are
# in chronological order — confirm the merge output is sorted by Date.
# Re-running this cell drops one more row each time.
target_column = final_df["Close"].iloc[1:].tolist()
final_df = final_df.drop(index=final_df.index[-1:])
final_df["Target_closing_price"] = target_column

In [289]:
# Sanity check: (rows, columns) of the final modelling frame.
final_df.shape

(52, 28)

In [290]:
# Display the last three rows to eyeball that Target_closing_price equals the
# following row's Close.
final_df.tail(3)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Sentiment,neutral,...,primary_neutral,primary_sadness,primary_surprise,secondary_anger,secondary_fear,secondary_joy,secondary_neutral,secondary_sadness,secondary_surprise,Target_closing_price
49,2023-12-11 00:00:00+00:00,11.02,12.18,10.8,12.02,344023100,0.0,0,"[{'negative': 0.9780202507972717, 'neutral': 0...","[0.9478311538696288, 0.953544557094574, 0.9497...",...,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,12.35
50,2023-12-18 00:00:00+00:00,11.99,12.46,11.79,12.35,244271600,0.0,0,"[{'negative': 0.9121586084365845, 'neutral': 0...","[0.9478311538696288, 0.948502779006958, 0.9452...",...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,12.19
51,2023-12-25 00:00:00+00:00,12.39,12.5,12.16,12.19,133775900,0.0,0,"[{'negative': 0.9754379391670227, 'neutral': 0...","[0.9478311538696288, 0.947828471660614, 0.9567...",...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,11.85


In [291]:
# Persist the final modelling frame (stock, sentiment, emotion indicators and
# target) to the working directory. The frame's index is written too, since
# `index=False` is not passed.
final_df.to_csv("./final_ford_analysis_data.csv")

  final_df.to_csv("./final_ford_analysis_data.csv")
