# Relevant Imports

In [1]:
# Cell 1: Relevant Imports
import pandas as pd
import numpy as np
from datetime import datetime


# Preprocessing

In [2]:
# Cell 2: Read the AAPL news CSV into a DataFrame
# The path is relative, so the file is expected in the current working directory
input_filename = 'sp500_news_simple_2024_AAPL.csv'
df = pd.read_csv(input_filename)
# Preview the first rows as a quick sanity check on the load
df.head()


Unnamed: 0,time_published,title,summary
0,2024-01-06 14:12:42,Barron's Weekend Stock Picks: Abercrombie & Fi...,Benzinga reviews this weekend's top stories co...
1,2024-01-06 15:04:00,"What That Famous Investing Quote About Bulls, ...",Motley Fool co-founder David Gardner responds ...
2,2024-01-07 11:01:10,"Apple's Busy Week: Tech Standoff, Legal Battle...","It was an eventful week for Apple Inc. AAPL, w..."
3,2024-01-07 11:45:00,Stocks Decline And Fed Rate Cuts Imperiled To ...,"After stellar gains of 107% in 2023, the Magni..."
4,2024-01-07 12:45:00,"The Last Time Apple Spent This Much Money, It ...",Spending on research and development has nearl...


In [3]:
# Cell 3: Derive a date-only 'date' column from 'time_published'
# Parse the raw timestamp values into pandas datetimes and store them back on the frame
published_at = pd.to_datetime(df['time_published'])
df['time_published'] = published_at
# Render each timestamp as a DD/MM/YYYY string, dropping the time of day
df['date'] = published_at.dt.strftime('%d/%m/%Y')
# Preview the frame with the new column appended
df.head()


Unnamed: 0,time_published,title,summary,date
0,2024-01-06 14:12:42,Barron's Weekend Stock Picks: Abercrombie & Fi...,Benzinga reviews this weekend's top stories co...,06/01/2024
1,2024-01-06 15:04:00,"What That Famous Investing Quote About Bulls, ...",Motley Fool co-founder David Gardner responds ...,06/01/2024
2,2024-01-07 11:01:10,"Apple's Busy Week: Tech Standoff, Legal Battle...","It was an eventful week for Apple Inc. AAPL, w...",07/01/2024
3,2024-01-07 11:45:00,Stocks Decline And Fed Rate Cuts Imperiled To ...,"After stellar gains of 107% in 2023, the Magni...",07/01/2024
4,2024-01-07 12:45:00,"The Last Time Apple Spent This Much Money, It ...",Spending on research and development has nearl...,07/01/2024


In [4]:
# Cell 4: Keep only the required columns and expose the article text as 'content'
# Step 1: take the date, title and summary columns (in that order)
news_columns = df[['date', 'title', 'summary']]
# Step 2: rename 'summary' to 'content' (this notebook assumes the summary is the content)
df_cleaned = news_columns.rename(columns={'summary': 'content'})
# Preview the reduced frame
df_cleaned.head()


Unnamed: 0,date,title,content
0,06/01/2024,Barron's Weekend Stock Picks: Abercrombie & Fi...,Benzinga reviews this weekend's top stories co...
1,06/01/2024,"What That Famous Investing Quote About Bulls, ...",Motley Fool co-founder David Gardner responds ...
2,07/01/2024,"Apple's Busy Week: Tech Standoff, Legal Battle...","It was an eventful week for Apple Inc. AAPL, w..."
3,07/01/2024,Stocks Decline And Fed Rate Cuts Imperiled To ...,"After stellar gains of 107% in 2023, the Magni..."
4,07/01/2024,"The Last Time Apple Spent This Much Money, It ...",Spending on research and development has nearl...


In [5]:
# Cell 5: Optional clean-up - remove incomplete and repeated rows
# First drop every row that has a missing value, then drop duplicate rows.
# The surviving rows keep their original index labels, so the index may have gaps.
df_cleaned = df_cleaned.dropna().drop_duplicates()
# Summarize the cleaned frame (entry count, columns, non-null counts, dtypes)
df_cleaned.info()


<class 'pandas.core.frame.DataFrame'>
Index: 6977 entries, 0 to 7005
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   date     6977 non-null   object
 1   title    6977 non-null   object
 2   content  6977 non-null   object
dtypes: object(3)
memory usage: 218.0+ KB


In [6]:
# Cell 6: Write the cleaned DataFrame out as a new CSV file
output_filename = 'cleaned_sp500_news_2024_AAPL.csv'
# index=False keeps the DataFrame index out of the written file
df_cleaned.to_csv(path_or_buf=output_filename, index=False)
# Confirm where the file was written
print(f"Cleaned data saved to {output_filename}")


Cleaned data saved to cleaned_sp500_news_2024_AAPL.csv
