# Data Cleaning + Preprocessing (Stock + News)

## Stock Data Cleaning

In [3]:
import pandas as pd

# Forward slashes keep the relative path portable and avoid the invalid
# "\d" / "\S" escape sequences that the backslash form contained.
stock_data = pd.read_csv("../data/SYK_2019-04-01_to_2025-05-06.csv")

In [4]:
# Peek at the raw frame: the first two rows are leftover header rows
# ('Ticker' / 'Date' in the 'Price' column), not price data.
stock_data

Unnamed: 0,Price,Close,High,Low,Open,Volume
0,Ticker,SYK,SYK,SYK,SYK,SYK
1,Date,,,,,
2,2019-04-01,183.80801391601562,187.32295542918013,183.2081311365289,186.76055906614033,1625900
3,2019-04-02,182.936279296875,184.10792632903386,182.61759473668747,183.7142546425083,919700
4,2019-04-03,182.626953125,184.53908898986603,181.92396494280862,183.96732469625712,1154700
...,...,...,...,...,...,...
1530,2025-04-29,370.9200134277344,372.32000732421875,365.80999755859375,366.9800109863281,1108300
1531,2025-04-30,373.9200134277344,375.1600036621094,366.55999755859375,370.0,2251700
1532,2025-05-01,373.989990234375,376.17999267578125,368.5400085449219,372.4200134277344,1634300
1533,2025-05-02,378.2200012207031,379.3299865722656,363.0799865722656,373.8900146484375,3709100


In [19]:
def StockDataLoder(path):
    """
    StockDataLoder:

    Loads and preprocesses stock data exported from Yahoo Finance (via yfinance).

    Parameters:
    -----------
    path : str
        Path to the CSV file containing stock data. The first column is
        expected to be named 'Price' and to hold the dates.

    Returns:
    --------
    pd.DataFrame or int
        A cleaned DataFrame indexed by 'Date' (datetime) with every remaining
        column converted to float.
        Returns 0 if the file does not exist or an error occurs (the error is
        printed, not raised).

    Tasks Performed:
    ----------------
    - Check if file exists
    - Load CSV data
    - Drop leftover header rows ('Ticker' / 'Date' in the 'Price' column), if present
    - Rename 'Price' column to 'Date'
    - Convert 'Date' column to datetime format and use it as the index
    - Convert all other columns to float
    """
    import os

    import pandas as pd

    try:
        # Check if file exists
        if not os.path.exists(path):
            print("❌ File not found.")
            return 0

        # Load CSV
        raw = pd.read_csv(path)

        # Drop the leftover header rows by value rather than by index label:
        # label lookups such as raw['Price'][1] raise KeyError on a one-row
        # file and miss a 'Date' row that sits at label 0.
        is_metadata = raw['Price'].isin(['Ticker', 'Date'])

        stock_data = (
            raw.loc[~is_metadata]
               # The 'Price' column actually holds the dates
               .rename(columns={'Price': 'Date'})
               .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
               # Index by date, then cast every remaining column to float
               .set_index('Date')
               .astype(float)
        )

        return stock_data

    except Exception as e:
        print(f"⚠️ Error occurred while loading stock data: {e}")
        return 0

In [20]:
def StockDataCleaner(stock_data):
    """
    StockDataCleaner:

    Cleans and enriches raw stock price data.

    Parameters:
    -----------
    stock_data : pd.DataFrame
        Stock price data with a 'Close' column, e.g. the frame returned by
        StockDataLoder. The index is used to detect duplicate rows.
        The input frame is not modified.

    Returns:
    --------
    pd.DataFrame
        New DataFrame with:
        - Rows with a duplicated index value removed (first occurrence kept)
        - Missing values forward-filled
        - 'Daily_Return': percentage change of 'Close' (0 where undefined)
        - 'Log_Return': natural log of Close / previous Close
          (0 where the ratio is missing or not positive)
    """
    import numpy as np

    # Remove duplicate index entries (e.g., same date entries)
    deduplicated = stock_data[~stock_data.index.duplicated()]

    # Forward-fill gaps. DataFrame.ffill() replaces the deprecated
    # fillna(method='ffill') spelling and always returns a new frame.
    cleaned = deduplicated.ffill()

    close = cleaned['Close']

    # Add daily percentage return; the first row has no predecessor -> 0
    cleaned['Daily_Return'] = close.pct_change().fillna(0)

    # Add daily log return (more suitable for compounding returns).
    # Ratios that are NaN or <= 0 are masked to NaN before the log and then
    # reported as 0, matching the per-element rule "log(x) if x > 0 else 0".
    price_ratio = close / close.shift(1)
    cleaned['Log_Return'] = np.log(price_ratio.where(price_ratio > 0)).fillna(0)

    return cleaned

In [21]:
# Forward slashes keep the relative path portable and avoid the invalid
# "\d" / "\S" escape sequences that the backslash form contained.
syk_stock_data = StockDataCleaner(StockDataLoder('../data/SYK_2019-04-01_to_2025-05-06.csv'))
syk_stock_data

Unnamed: 0_level_0,Close,High,Low,Open,Volume,Daily_Return,Log_Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-04-01,183.808014,187.322955,183.208131,186.760559,1625900.0,0.000000,0.000000
2019-04-02,182.936279,184.107926,182.617595,183.714255,919700.0,-0.004743,-0.004754
2019-04-03,182.626953,184.539089,181.923965,183.967325,1154700.0,-0.001691,-0.001692
2019-04-04,181.567795,183.198718,181.202242,182.955021,729200.0,-0.005800,-0.005816
2019-04-05,183.001892,183.714248,181.633402,182.120811,640100.0,0.007898,0.007867
...,...,...,...,...,...,...,...
2025-04-29,370.920013,372.320007,365.809998,366.980011,1108300.0,0.012088,0.012015
2025-04-30,373.920013,375.160004,366.559998,370.000000,2251700.0,0.008088,0.008055
2025-05-01,373.989990,376.179993,368.540009,372.420013,1634300.0,0.000187,0.000187
2025-05-02,378.220001,379.329987,363.079987,373.890015,3709100.0,0.011310,0.011247


## News Data Cleaning

In [25]:
# Read the raw news export, parsing 'publishedAt' as datetimes on load.
news_df = pd.read_csv(
    "../data/news_data/STRYKER CORPORATION_2025-05-10_to_2019-05-15.csv",
    parse_dates=['publishedAt'],
)
news_df

Unnamed: 0,ticker,source,title,description,content,url,publishedAt
0,STRYKER CORPORATION,ETF Daily News,Optex Systems (NASDAQ:OPXS) & Odysight.Ai (NAS...,Odysight.Ai (NASDAQ:ODYS – Get Free Report) an...,Odysight.Ai (NASDAQ:ODYS – Get Free Report) an...,https://www.etfdailynews.com/2025/05/10/optex-...,2025-05-10 05:14:51+00:00
1,STRYKER CORPORATION,ETF Daily News,Stryker (NYSE:SYK) Given “Market Perform” Rati...,Stryker (NYSE:SYK – Get Free Report)‘s stock h...,Stryker (NYSE:SYK – Get Free Report)‘s stock h...,https://www.etfdailynews.com/2025/05/07/stryke...,2025-05-07 07:13:03+00:00
2,STRYKER CORPORATION,ETF Daily News,Truist Financial Increases Stryker (NYSE:SYK) ...,Stryker (NYSE:SYK – Free Report) had its price...,Stryker (NYSE:SYK – Free Report) had its price...,https://www.etfdailynews.com/2025/05/06/truist...,2025-05-06 07:46:52+00:00
3,STRYKER CORPORATION,ETF Daily News,Brokerages Set Stryker Co. (NYSE:SYK) Price Ta...,Shares of Stryker Co. (NYSE:SYK – Get Free Rep...,Shares of Stryker Co. (NYSE:SYK – Get Free Rep...,https://www.etfdailynews.com/2025/05/06/broker...,2025-05-06 06:18:58+00:00
4,STRYKER CORPORATION,ETF Daily News,Equities Analysts Issue Forecasts for Stryker ...,Stryker Co. (NYSE:SYK – Free Report) – Analyst...,Stryker Co. (NYSE:SYK – Free Report) – Analyst...,https://www.etfdailynews.com/2025/05/06/equiti...,2025-05-06 05:38:45+00:00
...,...,...,...,...,...,...,...
95,STRYKER CORPORATION,ETF Daily News,Insider Selling: Stryker Co. (NYSE:SYK) CEO Se...,Stryker Co. (NYSE:SYK – Get Free Report) CEO K...,Stryker Co. (NYSE:SYK – Get Free Report) CEO K...,https://www.etfdailynews.com/2024/11/10/inside...,2024-11-10 09:08:30+00:00
96,STRYKER CORPORATION,ETF Daily News,Stryker (NYSE:SYK) Stock Rating Lowered by Sto...,StockNews.com cut shares of Stryker (NYSE:SYK ...,StockNews.com cut shares of Stryker (NYSE:SYK ...,https://www.etfdailynews.com/2024/11/10/stryke...,2024-11-10 07:56:50+00:00
97,STRYKER CORPORATION,Investing.com,Stryker's SWOT analysis: medical device giant'...,Stryker's SWOT analysis: medical device giant'...,"Stryker Corporation (NYSE:SYK), a leading play...",https://www.investing.com/news/company-news/st...,2024-11-05 08:32:24+00:00
98,STRYKER CORPORATION,ETF Daily News,Needham & Company LLC Increases Stryker (NYSE:...,Stryker (NYSE:SYK – Get Free Report) had its p...,Stryker (NYSE:SYK – Get Free Report) had its p...,https://www.etfdailynews.com/2024/11/01/needha...,2024-11-01 14:14:47+00:00


In [36]:
def NewsDataCleaner(path):
    """
    NewsDataCleaner:

    Reads a news CSV and returns a tidied copy with a normalised text column.

    Parameters:
    -----------
    path : str
        Location of the news CSV file. It must provide the columns
        'title', 'description', 'url' and 'publishedAt'.

    Returns:
    --------
    pd.DataFrame or None
        Frame with a fresh 0..n-1 index in which:
        - rows lacking a title or a description are gone
        - only the first row per URL is kept
        - 'clean_text' holds "<title>. <description>" lowercased, with every
          character other than a-z, 0-9 and whitespace deleted
        None is returned (after printing the error) if anything fails.
    """
    import pandas as pd

    # Characters outside letters, digits and whitespace are deleted outright
    # (not replaced by a space), so "NYSE:SYK" becomes "nysesyk".
    strip_pattern = r'[^a-zA-Z0-9\s]'

    def _normalised_text(frame):
        # Title and description joined into one sentence-like string
        joined = frame['title'] + '. ' + frame['description']
        return joined.str.lower().str.replace(strip_pattern, '', regex=True)

    try:
        articles = (
            pd.read_csv(path, parse_dates=['publishedAt'])
              .dropna(subset=['title', 'description'])
              .drop_duplicates(subset=['url'])
              .assign(clean_text=_normalised_text)
              .reset_index(drop=True)
        )
        return articles

    except Exception as e:
        print(f"⚠️ Error processing news data: {e}")
        return None

In [37]:
# Run the news cleaning step on the Stryker export and show the result.
syk_news_data = NewsDataCleaner(
    path='../data/news_data/STRYKER CORPORATION_2025-05-10_to_2019-05-15.csv'
)
syk_news_data

Unnamed: 0,ticker,source,title,description,content,url,publishedAt,clean_text
0,STRYKER CORPORATION,ETF Daily News,Optex Systems (NASDAQ:OPXS) & Odysight.Ai (NAS...,Odysight.Ai (NASDAQ:ODYS – Get Free Report) an...,Odysight.Ai (NASDAQ:ODYS – Get Free Report) an...,https://www.etfdailynews.com/2025/05/10/optex-...,2025-05-10 05:14:51+00:00,optex systems nasdaqopxs odysightai nasdaqody...
1,STRYKER CORPORATION,ETF Daily News,Stryker (NYSE:SYK) Given “Market Perform” Rati...,Stryker (NYSE:SYK – Get Free Report)‘s stock h...,Stryker (NYSE:SYK – Get Free Report)‘s stock h...,https://www.etfdailynews.com/2025/05/07/stryke...,2025-05-07 07:13:03+00:00,stryker nysesyk given market perform rating at...
2,STRYKER CORPORATION,ETF Daily News,Truist Financial Increases Stryker (NYSE:SYK) ...,Stryker (NYSE:SYK – Free Report) had its price...,Stryker (NYSE:SYK – Free Report) had its price...,https://www.etfdailynews.com/2025/05/06/truist...,2025-05-06 07:46:52+00:00,truist financial increases stryker nysesyk pri...
3,STRYKER CORPORATION,ETF Daily News,Brokerages Set Stryker Co. (NYSE:SYK) Price Ta...,Shares of Stryker Co. (NYSE:SYK – Get Free Rep...,Shares of Stryker Co. (NYSE:SYK – Get Free Rep...,https://www.etfdailynews.com/2025/05/06/broker...,2025-05-06 06:18:58+00:00,brokerages set stryker co nysesyk price target...
4,STRYKER CORPORATION,ETF Daily News,Equities Analysts Issue Forecasts for Stryker ...,Stryker Co. (NYSE:SYK – Free Report) – Analyst...,Stryker Co. (NYSE:SYK – Free Report) – Analyst...,https://www.etfdailynews.com/2025/05/06/equiti...,2025-05-06 05:38:45+00:00,equities analysts issue forecasts for stryker ...
...,...,...,...,...,...,...,...,...
93,STRYKER CORPORATION,ETF Daily News,Insider Selling: Stryker Co. (NYSE:SYK) CEO Se...,Stryker Co. (NYSE:SYK – Get Free Report) CEO K...,Stryker Co. (NYSE:SYK – Get Free Report) CEO K...,https://www.etfdailynews.com/2024/11/10/inside...,2024-11-10 09:08:30+00:00,insider selling stryker co nysesyk ceo sells 5...
94,STRYKER CORPORATION,ETF Daily News,Stryker (NYSE:SYK) Stock Rating Lowered by Sto...,StockNews.com cut shares of Stryker (NYSE:SYK ...,StockNews.com cut shares of Stryker (NYSE:SYK ...,https://www.etfdailynews.com/2024/11/10/stryke...,2024-11-10 07:56:50+00:00,stryker nysesyk stock rating lowered by stockn...
95,STRYKER CORPORATION,Investing.com,Stryker's SWOT analysis: medical device giant'...,Stryker's SWOT analysis: medical device giant'...,"Stryker Corporation (NYSE:SYK), a leading play...",https://www.investing.com/news/company-news/st...,2024-11-05 08:32:24+00:00,strykers swot analysis medical device giants s...
96,STRYKER CORPORATION,ETF Daily News,Needham & Company LLC Increases Stryker (NYSE:...,Stryker (NYSE:SYK – Get Free Report) had its p...,Stryker (NYSE:SYK – Get Free Report) had its p...,https://www.etfdailynews.com/2024/11/01/needha...,2024-11-01 14:14:47+00:00,needham company llc increases stryker nysesyk...


## Join Stock Prices + News (Date Match)

In [60]:
def StockNewsDataMerger(stock_data, news_data, stock_name='Stock', save=True):
    """
    Merges cleaned stock and news data on calendar date for a given stock.

    Neither input frame is modified; the date helper columns are added to
    temporary copies only.

    Parameters:
    -----------
    stock_data : pd.DataFrame
        Cleaned stock price data with a datetime index and columns such as
        'Close' and 'Daily_Return'.

    news_data : pd.DataFrame
        Cleaned news data with a datetime 'publishedAt' column and
        'clean_text', 'title', 'description'.

    stock_name : str
        Name of the stock (used as the prefix of the saved file name).

    save : bool
        If True, saves the merged data as CSV in '../data/merged_data/' under
        '<stock_name>_merged_news_price_data_v<i>.csv', using the first
        version number <i> that does not exist yet.

    Returns:
    --------
    pd.DataFrame or None
        Merged stock-news dataset restricted to the columns 'ticker', 'date',
        'title', 'description', 'clean_text', 'Close', 'Daily_Return'
        (whichever of them exist). Articles whose date has no price row are
        dropped (inner join).
        None is returned (after printing the error) if anything fails.
    """
    import pandas as pd
    import os

    try:
        # Convert datetime to date to align both datasets. assign() works on
        # copies, so the caller's frames do not gain 'date' / 'Market_Date'.
        # NOTE(review): publishedAt appears to be UTC in the sample data, so
        # this is the UTC calendar date - confirm that matching it against the
        # exchange trading date is intended.
        news_by_date = news_data.assign(date=news_data['publishedAt'].dt.date)
        prices_by_date = stock_data.assign(Market_Date=stock_data.index.date).reset_index()

        # Merge stock and news data on date
        merged_data = pd.merge(
            news_by_date,
            prices_by_date,
            left_on='date',
            right_on='Market_Date',
            how='inner'
        )

        # Keep only relevant columns
        columns_to_keep = ['ticker', 'date', 'title', 'description', 'clean_text', 'Close', 'Daily_Return']
        merged_data = merged_data[[col for col in columns_to_keep if col in merged_data.columns]]

        # Save to CSV if required
        if save:
            base_path = "../data/merged_data/"
            os.makedirs(base_path, exist_ok=True)
            base_filename = f"{stock_name}_merged_news_price_data"

            # Never overwrite an earlier export: pick the first unused version
            version = 1
            output_path = base_path + f"{base_filename}_v{version}.csv"
            while os.path.exists(output_path):
                version += 1
                output_path = base_path + f"{base_filename}_v{version}.csv"

            merged_data.to_csv(output_path, index=False)
            print(f"✅ Merged dataset saved: {output_path}")

        return merged_data

    except Exception as e:
        print(f"⚠️ Error during merging stock and news data: {e}")
        return None

In [61]:
# Forward slashes keep the stock path portable and avoid the invalid
# "\d" / "\S" escape sequences that the backslash form contained.
# With save=True each run of this cell writes a new versioned CSV.
syk_merged_data = StockNewsDataMerger(
    stock_data=StockDataCleaner(StockDataLoder('../data/SYK_2019-04-01_to_2025-05-06.csv')),
    news_data=NewsDataCleaner('../data/news_data/STRYKER CORPORATION_2025-05-10_to_2019-05-15.csv'),
    stock_name='Stryker',
    save=True,
)

✅ Merged dataset saved: ../data/merged_data/Stryker_merged_news_price_data_v1.csv


In [62]:
# Show the merged news/price frame returned by StockNewsDataMerger.
syk_merged_data

Unnamed: 0,ticker,date,title,description,clean_text,Close,Daily_Return
0,STRYKER CORPORATION,2025-05-05,Evercore ISI Issues Pessimistic Forecast for S...,Stryker (NYSE:SYK – Free Report) had its price...,evercore isi issues pessimistic forecast for s...,381.359985,0.008302
1,STRYKER CORPORATION,2025-05-01,Reviewing Microbot Medical (NASDAQ:MBOT) & Mot...,Microbot Medical (NASDAQ:MBOT – Get Free Repor...,reviewing microbot medical nasdaqmbot motus g...,373.989990,0.000187
2,STRYKER CORPORATION,2025-04-29,Stryker (SYK) Projected to Post Earnings on Th...,Stryker (NYSE:SYK – Get Free Report) is expect...,stryker syk projected to post earnings on thur...,370.920013,0.012088
3,STRYKER CORPORATION,2025-04-25,Head-To-Head Comparison: Optex Systems (NASDAQ...,Optex Systems (NASDAQ:OPXS – Get Free Report) ...,headtohead comparison optex systems nasdaqopxs...,365.059998,0.011107
4,STRYKER CORPORATION,2025-04-22,Head-To-Head Contrast: Spectral Medical (OTCMK...,Spectral Medical (OTCMKTS:EDTXF – Get Free Rep...,headtohead contrast spectral medical otcmktsed...,348.119995,0.025119
...,...,...,...,...,...,...,...
60,STRYKER CORPORATION,2024-11-13,Stryker Co. (NYSE:SYK) Stock Position Lowered ...,Covestor Ltd trimmed its position in shares of...,stryker co nysesyk stock position lowered by c...,386.546631,0.003177
61,STRYKER CORPORATION,2024-11-11,Leavell Investment Management Inc. Boosts Stoc...,Leavell Investment Management Inc. lifted its ...,leavell investment management inc boosts stock...,377.110382,0.007713
62,STRYKER CORPORATION,2024-11-05,Stryker's SWOT analysis: medical device giant'...,Stryker's SWOT analysis: medical device giant'...,strykers swot analysis medical device giants s...,368.151978,0.008068
63,STRYKER CORPORATION,2024-11-01,Needham & Company LLC Increases Stryker (NYSE:...,Stryker (NYSE:SYK – Get Free Report) had its p...,needham company llc increases stryker nysesyk...,365.454498,0.030510
