In [20]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import csv
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
# Load the AAPL headline export; every later cell works on this frame.
aapl_datetime = pd.read_csv(filepath_or_buffer='aapl.csv')

In [11]:
# Keep the column labels in `columns` and print them to check which fields the CSV provides.
columns = aapl_datetime.columns
print(columns)

Index(['Unnamed: 0', 'time', 'headline', 'stock'], dtype='object')


In [12]:
# Preview the 'time' column to see how the timestamps are written before parsing them.
aapl_datetime.loc[:, 'time']

0         3/3/2025 6:30:36 PM
1         3/3/2025 6:02:56 PM
2         3/3/2025 5:44:25 PM
3         3/3/2025 5:11:24 PM
4         3/3/2025 4:06:51 PM
                 ...         
20190    6/11/2011 7:36:28 AM
20191     6/6/2011 9:33:12 PM
20192     6/5/2011 7:56:02 PM
20193    5/21/2011 5:33:49 AM
20194    5/19/2011 9:53:02 AM
Name: time, Length: 20195, dtype: object

In [15]:
# Convert the 'time' column to datetime so that the date and the clock time can be
# extracted separately in the next step.
# The raw values look like '3/3/2025 6:30:36 PM': month/day/year separated by slashes,
# followed by a 12-hour clock with an AM/PM marker. The format string has to mirror those
# separators exactly; a dash-separated format does not match the data and makes
# pd.to_datetime raise instead of parsing.
aapl_datetime['time'] = pd.to_datetime(aapl_datetime['time'], format='%m/%d/%Y %I:%M:%S %p')

In [16]:
# Split each parsed timestamp into a calendar-date column and a time-of-day column.
# Both right-hand values are computed from 'time' before either column is written.
aapl_datetime['date'], aapl_datetime['exact_time'] = (
    aapl_datetime['time'].dt.date,
    aapl_datetime['time'].dt.time,
)

In [17]:
# Check the extracted calendar dates.
aapl_datetime.loc[:, 'date']

0        2025-03-03
1        2025-03-03
2        2025-03-03
3        2025-03-03
4        2025-03-03
            ...    
20190    2011-06-11
20191    2011-06-06
20192    2011-06-05
20193    2011-05-21
20194    2011-05-19
Name: date, Length: 20195, dtype: object

In [18]:
# Check the extracted time-of-day values.
aapl_datetime.loc[:, 'exact_time']

0        18:30:36
1        18:02:56
2        17:44:25
3        17:11:24
4        16:06:51
           ...   
20190    07:36:28
20191    21:33:12
20192    19:56:02
20193    05:33:49
20194    09:53:02
Name: exact_time, Length: 20195, dtype: object

In [19]:
# Save the frame, now including the 'date' and 'exact_time' columns, to a new CSV file.
# NOTE(review): no `index` argument is passed, so the row index is written as an extra
# unnamed first column, whereas the final export in this notebook passes index=False.
# Confirm which layout downstream readers expect.
aapl_datetime.to_csv(path_or_buf='aapl_datetime.csv')

In [23]:
# Near-duplicate headline removal.
#
# Every headline is turned into a TF-IDF vector (English stop words removed). Rows are
# then visited in order: each row that is still kept is compared with every other kept row
# dated within +/- WINDOW_DAYS days of it, and any neighbour whose cosine similarity
# exceeds SIMILARITY_THRESHOLD is marked for removal. The row visited first is therefore
# the one that survives; its near-duplicates are dropped.
SIMILARITY_THRESHOLD = 0.7  # cosine similarity above which two headlines count as duplicates
WINDOW_DAYS = 3             # half-width of the comparison window, in days

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(aapl_datetime['headline'])

# Loop-invariant work hoisted out of the loop: a datetime64 copy of the 'date' column, so
# each window test is a single vectorised comparison rather than a Python-level comparison
# of every date object.
headline_days = pd.to_datetime(aapl_datetime['date'])
window = pd.Timedelta(days=WINDOW_DAYS)

# Row positions of the headlines to drop. Positions (not index labels) are used throughout
# so the rows of `aapl_datetime` and the rows of `tfidf_matrix` always line up.
to_remove = set()

for i in range(len(aapl_datetime)):
    if i in to_remove:
        continue
    current_day = headline_days.iloc[i]

    # Positions of rows dated inside the window (both ends inclusive), excluding the row
    # itself and rows that are already marked for removal.
    in_window = headline_days.between(current_day - window, current_day + window)
    candidates = [j for j in in_window.to_numpy().nonzero()[0].tolist()
                  if j != i and j not in to_remove]
    if not candidates:
        continue

    # One batched call scores row i against all candidates at once instead of one
    # cosine_similarity call per pair.
    scores = cosine_similarity(tfidf_matrix[i], tfidf_matrix[candidates])[0]
    to_remove.update(j for j, score in zip(candidates, scores) if score > SIMILARITY_THRESHOLD)

# Remove duplicates: keep every row whose position was not marked, then renumber the index
kept_positions = [p for p in range(len(aapl_datetime)) if p not in to_remove]
aapl_datetime_cleaned = aapl_datetime.iloc[kept_positions].reset_index(drop=True)

print(aapl_datetime_cleaned)

       Unnamed: 0                time  \
0               0 2025-03-03 18:30:36   
1               1 2025-03-03 18:02:56   
2               2 2025-03-03 17:44:25   
3               3 2025-03-03 17:11:24   
4               4 2025-03-03 16:06:51   
...           ...                 ...   
17680       20190 2011-06-11 07:36:28   
17681       20191 2011-06-06 21:33:12   
17682       20192 2011-06-05 19:56:02   
17683       20193 2011-05-21 05:33:49   
17684       20194 2011-05-19 09:53:02   

                                                headline stock        date  \
0      AI Daily: Analyst sees Apple, Alibaba partners...  aapl  2025-03-03   
1      Apple’s iPhone 16e Is Likely to Underwhelm, Sa...  aapl  2025-03-03   
2      Is a $100B TSMC U.S. Manufacturing Investment ...  aapl  2025-03-03   
3      Trump announces crypto reserve, Kroger CEO res...  aapl  2025-03-03   
4      Apple CEO teases ‘something in the Air’ this week  aapl  2025-03-03   
...                                    

In [25]:
# Write the de-duplicated headlines to disk without the row index.
aapl_datetime_cleaned.to_csv(path_or_buf='aapl_datetime_TF_IDF.csv', index=False)