# Exploratory data analysis, data preparation, and pre-processing

We will be using the **[NYT Articles: 2.1M+ (2000-Present) Daily Updated](https://www.kaggle.com/datasets/aryansingh0909/nyt-articles-21m-2000-present/)** dataset, available on Kaggle and maintained by Aryan Singh. We downloaded the dataset and stored it in `data/nyt-metadata.csv`. Since the original file is rather large (over 4GB), we are only going to analyze articles from a shorter time window, say from May 1st, 2023 to April 30th, 2024. We will store the corresponding data in a separate `.csv` file for later convenience.  
Note that dates are stored in the original `.csv` file as strings in the format `YYYY-MM-DD hh:mm:ss+hh:mm`. Since Python compares strings lexicographically, we can specify the start and end dates of our time interval as strings in a matching format (a `YYYY-MM-DD` prefix is enough).

In [27]:
import pandas as pd
from os import listdir

original_data_loc = 'data/nyt-metadata.csv'
data_loc = 'data/nyt-metadata_trimmed.csv'


def generate_short_csv(source:str=original_data_loc, dest:str=data_loc, start_date:str='2023-05-01', end_date: str='2024-05-01', chunksize=20000) -> None:
    """Write the rows of `source` whose `pub_date` falls in a date window to `dest`.

    The source CSV is read in chunks of `chunksize` rows so that it never has
    to be loaded into memory in full. A row is kept when
    `start_date <= pub_date < end_date`; the comparison is done on strings, so
    both bounds must use the same `YYYY-MM-DD...` layout as the `pub_date`
    column.

    If a file already exists at `dest`, the user is asked for confirmation and
    nothing is written unless they answer `y` or `yes` (case-insensitive).

    The DataFrame index is written as the first column (pandas' `to_csv`
    default). If no row matches, a CSV containing only the header is written.

    Args:
        source: Path of the full CSV file to read.
        dest: Path of the trimmed CSV file to write.
        start_date: Inclusive lower bound for `pub_date`.
        end_date: Exclusive upper bound for `pub_date`.
        chunksize: Number of rows read from `source` at a time.

    Raises:
        FileNotFoundError: If the directory that should contain `dest` does
            not exist.
    """
    from pathlib import Path

    dest_path = Path(dest)

    # Fail before the (potentially long) scan rather than at write time.
    if not dest_path.parent.is_dir():
        raise FileNotFoundError(f'Destination directory `{dest_path.parent}` does not exist.')

    # If a file already exists at the destination, check with the user whether
    # they really want to run the process all over again.
    if dest_path.exists():
        answer = input(f'A file already exists at `{dest}`. Do you want to continue?')
        if answer.lower() not in ['y', 'yes']:
            return

    # Collect the matching slices and concatenate once at the end; growing a
    # DataFrame with `pd.concat` inside the loop recopies it on every chunk.
    kept = []
    for chunk in pd.read_csv(source, chunksize=chunksize):
        in_window = (chunk.pub_date >= start_date) & (chunk.pub_date < end_date)
        selected = chunk.loc[in_window]
        if not selected.empty:
            kept.append(selected)

    if kept:
        df = pd.concat(kept)
    else:
        # Nothing matched: still emit a file carrying the source's columns.
        df = pd.DataFrame(columns=pd.read_csv(source, nrows=0).columns)

    df.to_csv(dest)



In [28]:
# Build the trimmed CSV using the function's default source, destination, and date window.
generate_short_csv()

In [36]:
# Reload the trimmed file and list the distinct calendar days it covers
# (the first 10 characters of each `pub_date` string, i.e. `YYYY-MM-DD`).
df = pd.read_csv(data_loc)
df["pub_date"].map(lambda stamp: stamp[:10]).unique()

array(['2023-05-01', '2023-05-02', '2023-05-03', '2023-05-04',
       '2023-05-05', '2023-05-06', '2023-05-07', '2023-05-08',
       '2023-05-09', '2023-05-10', '2023-05-11', '2023-05-12',
       '2023-05-13', '2023-05-14', '2023-05-15', '2023-05-16',
       '2023-05-17', '2023-05-18', '2023-05-19', '2023-05-20',
       '2023-06-01', '2023-07-01', '2023-08-01', '2023-09-01',
       '2023-10-01', '2023-11-01', '2023-12-01', '2023-12-02',
       '2023-12-03', '2023-12-04', '2023-12-05', '2023-12-06',
       '2023-12-07', '2023-12-08', '2023-12-09', '2023-12-10',
       '2023-12-11', '2023-12-12', '2023-12-13', '2023-12-14',
       '2023-12-15', '2023-12-16', '2023-12-17', '2023-12-18',
       '2023-12-19', '2023-12-20', '2023-12-21', '2023-12-22',
       '2023-12-23', '2023-12-24', '2023-12-25', '2023-12-26',
       '2023-12-27', '2023-12-28', '2023-12-29', '2023-12-30',
       '2023-12-31', '2024-01-01', '2024-02-01', '2024-03-01',
       '2024-04-01'], dtype=object)