In [16]:
# importing the required libraries
import pandas as pd
import numpy as np
import boto3
import seaborn as sns
import matplotlib.pyplot as plt
from functools import reduce
import tqdm
import re
import pickle

from spacy.lang.en.stop_words import STOP_WORDS as spacy_stop_words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords as nltk_stop_words
import spacy

import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='seaborn')
warnings.filterwarnings('ignore', category=UserWarning, module='matplotlib')


# building the combined stop-word set and loading the spaCy model used for sentence splitting and lemmatization
STOP_WORDS = set(spacy_stop_words) | set(nltk_stop_words.words("english"))
SPACY_TOKENIZER = spacy.load('en_core_web_sm')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prady\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# reading the raw data and dropping the unwanted 'article_id.1' column in one chain
df = pd.read_csv("../data/raw.csv").drop(columns=['article_id.1'])

# converting the publication date column to datetime dtype
df['published date'] = pd.to_datetime(df['published date'])

In [4]:
df.head()

Unnamed: 0,article_id,category,subcategory,title,published date,text,source
0,0,asia media,asia - business & finance,EY achieves highest growth in nearly two decad...,2022-09-21 07:00:00,"US$3.2b invested in audit quality, innovation,...","{'href': 'https://www.ey.com', 'title': 'Ernst..."
1,1,asia media,asia - business & finance,Illuminate Financial Announces Strategic Partn...,2022-11-29 08:00:00,"LONDON, Nov. 29, 2022 /PRNewswire/ -- Illumina...","{'href': 'https://finance.yahoo.com', 'title':..."
2,2,asia media,asia - business & finance,Philip Morris International Announces New Regi...,2022-11-25 08:00:00,"LAUSANNE, Switzerland, November 25, 2022--(BUS...","{'href': 'https://finance.yahoo.com', 'title':..."
3,3,asia media,asia - business & finance,18 Thailand Companies and Entrepreneurs Win Co...,2022-07-02 07:00:00,"SINGAPORE, July 2, 2022 /PRNewswire/ -- Mr. Wi...","{'href': 'https://finance.yahoo.com', 'title':..."
4,4,asia media,asia - business & finance,"Meihua International Medical Technologies Co.,...",2022-12-05 08:00:00,"YANGZHOU, China, Dec. 5, 2022 /PRNewswire/ -- ...","{'href': 'https://finance.yahoo.com', 'title':..."


### Preprocessing

In [5]:
# defining a function which splits a given article into paragraphs
def text_to_paragraphs(text:str)->list:
    """
    Break an article into paragraphs, treating every newline as a paragraph boundary.

    text : article body; any value that is not a str yields an empty list
    returns the non-empty newline-separated chunks, in their original order
    """
    # nothing to split when the input is not a string
    if not isinstance(text, str):
        return []
    # keep every chunk between newlines except the empty ones
    return [chunk for chunk in text.split("\n") if chunk != ""]

In [6]:
def text_to_sentences(text:str)->list:
    """
    Segment the given text into sentences using the spaCy pipeline.

    text : text to segment; any value that is not a str yields an empty list
    returns a list holding one str per sentence found by the pipeline
    """
    # nothing to segment when the input is not a string
    if not isinstance(text, str):
        return []
    # run the pipeline once, then render every sentence span as a plain string
    doc = SPACY_TOKENIZER(text)
    return [str(span) for span in doc.sents]

In [7]:
# defining a function to separate capitalized words
def separate_capitalilzed_words(text:str)->str:
    """
    Insert a space at each word boundary inside glued-together capitalized words.

    "ThisIsAWord" -> "This Is A Word"
    "PRNewswire"  -> "PR Newswire"
    "CEOs"        -> "CEOs"   (an acronym followed by a plural "s" is left intact)

    text : string to process; must be a str
    returns the text with a single space inserted at every detected boundary
    """
    assert isinstance(text, str)
    boundary = (
        # lowercase letter followed by an uppercase letter: "This|Is"
        r"(?<=[a-z])(?=[A-Z])"
        # end of an uppercase run where a new capitalized word starts: "A|Word", "PR|Newswire";
        # the negative lookahead keeps plural acronyms such as "CEOs" in one piece
        r"|(?<=[A-Z])(?=[A-Z][a-z])(?![A-Z]s\b)"
    )
    # the pattern is zero-width, so only spaces are added and no character is consumed
    return re.sub(boundary, " ", text)

In [8]:
# defining a function to preprocess the text
def preprocess_text(text:str)->str:
    """
    Normalize a piece of text into a space-separated string of lemmas.

    Steps, in order: split glued capitalized words, lowercase, delete every
    character that is not an ASCII letter, '%', space or newline (so digits and
    punctuation are removed), drop stop words, lemmatize with the spaCy pipeline.

    text : value to preprocess; non-str values are cast with str()
    returns the preprocessed text; None and NaN give "" (an empty string)
    """
    # missing values: str(float("nan")) would otherwise leak the literal token "nan"
    # (x != x is only true for NaN)
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # typecasting text to str
    text = str(text)
    # separating the capitalized words (must happen before lowercasing)
    text = separate_capitalilzed_words(text)
    # lowercasing all letters in the text
    text = text.lower()
    # removing everything except letters, '%', spaces and newlines; digits go in the same pass.
    # raw string: the previous non-raw pattern contained the invalid escape "\ "
    text = re.sub(r'[^a-zA-Z% \n]+', '', text)
    # removing stopwords; split() also collapses every run of whitespace, newlines included
    words = [word for word in text.split() if word not in STOP_WORDS]
    if not words:
        return ""
    # tokenization + lemmatization in a single pipeline pass instead of two
    return " ".join(token.lemma_ for token in SPACY_TOKENIZER(" ".join(words)))


  pattern = '[^a-zA-Z0-9%\ \n]+'


In [9]:
# defining a function to pre-process an article (the article is split into sentences or paragraphs; each split is called a section)
def preprocess_article(article_id:int,title:str,text:str,section_by:str="paragraph",input_type=['title','text'])->list:
    """
    Split one article into preprocessed sections and pair each section with the article id.

    article_id    : unique ID of the article
    title         : title of the article
    text          : text/body of the article
    section_by    : how the body is split: "paragraph" (default), "sentence", or None to keep it whole;
                    not used when input_type is ['title']
    input_type    : which features to use: ['title'], ['text'] or ['title','text'] (default);
                    the equivalent tuples are accepted as well

    returns a list of (article_id, section) tuples. Every section goes through preprocess_text.
    Whenever the body is used, sections of two words or fewer are dropped
    (in ['title','text'] mode this filter also applies to the title).

    raises ValueError for an unsupported input_type or section_by
    """
    # copy into a list so tuples compare equal to the allowed values (the default list is never mutated)
    features = list(input_type) if isinstance(input_type, (list, tuple)) else input_type
    if features not in (['title'], ['text'], ['title', 'text']):
        raise ValueError("Invalid input_type value. Allowed values are ['title'], ['text'] and ['title', 'text']")

    # if we are going to use only 'title' for generating embeddings
    if features == ['title']:
        sections = [preprocess_text(title)]

    # if the 'text' feature is involved (alone or together with the title)
    else:
        if section_by == "paragraph":    # splitting the text into paragraphs
            chunks = text_to_paragraphs(text)
        elif section_by == "sentence":   # splitting the text into sentences
            chunks = text_to_sentences(text)
        elif section_by is None:         # keeping the text as one section
            chunks = [text]
        else:
            raise ValueError("Invalid section_by value, it must be 'paragraph', 'sentence' or None.")

        # preprocessing the sections; this now also happens in ['text'] mode,
        # which previously returned the raw, unprocessed splits
        sections = [preprocess_text(chunk) for chunk in chunks]
        # the preprocessed title goes first when it is requested
        if 'title' in features:
            sections = [preprocess_text(title)] + sections
        # keeping only the sections which have more than 2 words
        sections = [section for section in sections if len(section.split()) > 2]

    # every section carries the same article_id: [(article_id, section_1), (article_id, section_2), ...]
    return [(article_id, section) for section in sections]


In [17]:
# defining a function which preprocesses the data
def preprocess_data(raw_file_path,section_by,input_type,nrows=5):

    """
    Read the raw articles, preprocess them into sections and pickle the id mappings.

    raw_file_path   : path of the raw data (CSV with 'article_id', 'title' and 'text' columns)
    section_by      : based on what we are going to split our article: 'sentence', 'paragraph' or None (no split)
    input_type      : which features are we going to consider eg: ['title'],['text'],['title','text']
    nrows           : number of rows read from the file; the default of 5 keeps the previously
                      hard-coded sample size, pass None to process the whole file

    returns (data, article_section_mapping_path, section_article_mapping_path):
    the preprocessed DataFrame with a 'section_id' column, and the paths of the two pickled mappings.
    The text column is named 'title' when section_by is None and input_type is ['title'], otherwise 'text'.

    raises ValueError for an unsupported section_by or input_type (checked before the file is read)
    """
    import os  # local import: only needed to create the artifacts directory

    # validating the arguments up front, so a typo fails fast instead of after reading the file
    if section_by not in ("paragraph", "sentence", None):
        raise ValueError("Invalid section_by value. Allowed values are 'paragraph' and 'sentence' and None")
    # copy into a list so tuples are accepted and compare equal to the allowed values
    input_type = list(input_type) if isinstance(input_type, (list, tuple)) else input_type
    if input_type not in (['title'], ['text'], ['title', 'text']):
        raise ValueError("Invalid input_type value. Allowed values are ['title'], ['text'] and ['title', 'text']")

    # reading the data from the raw_file_path
    data = pd.read_csv(raw_file_path,nrows=nrows)

    # if we don't want to split our article
    if section_by is None:

        if input_type==['title']:   # if we are going to use only title feature for embedding
            # .copy() so the assignment below does not write into a slice of the original frame
            data = data[['article_id','title']].copy()
            data['title'] = data['title'].apply(preprocess_text)
        elif input_type==['text']:  # if we are going to use only text feature for embedding
            data = data[['article_id','text']].copy()
            data['text'] = data['text'].apply(preprocess_text)
        else:  # if we want to use both title and text for embeddings
            # two rows per article: the preprocessed title, then the preprocessed text
            records = [
                (article_id, preprocess_text(field))
                for article_id, title, text in tqdm.tqdm(data[["article_id", "title", "text"]].values, desc="Processing data", leave=True)
                for field in (title, text)
            ]
            data = pd.DataFrame(records,columns=["article_id", "text"])

    # if we want to split the text either paragraph-wise or sentence-wise
    else:
        # flat list of (article_id, section) pairs; a comprehension is linear and,
        # unlike reduce() without an initial value, also works when no rows were read
        records = [
            pair
            for article_id, title, text in tqdm.tqdm(data[["article_id", "title", "text"]].values, desc="Processing data", leave=True)
            for pair in preprocess_article(article_id, title, text, section_by, input_type)
        ]
        data = pd.DataFrame(records,columns=['article_id','text'])

    # generating an id for each split
    data['section_id']= range(data.shape[0])

    # mapping from article_id to the list of its section_ids
    article_section_mapping = data.groupby('article_id')['section_id'].apply(list).to_dict()

    # mapping from section_id to the article_id it belongs to
    section_article_mapping = data.set_index('section_id')['article_id'].to_dict()

    # Pickling the dictionaries
    article_section_mapping_path = '../artifacts/article_section_mapping.pkl'
    section_article_mapping_path = '../artifacts/section_article_mapping.pkl'
    # creating the target directory if it is missing, so the results are not lost on a fresh checkout
    os.makedirs(os.path.dirname(article_section_mapping_path), exist_ok=True)

    with open(article_section_mapping_path, 'wb') as f:
        pickle.dump(article_section_mapping, f)

    with open(section_article_mapping_path, 'wb') as f:
        pickle.dump(section_article_mapping, f)

    return data,article_section_mapping_path,section_article_mapping_path

### Testing

In [18]:
# testing the function: sentence-level sections built from both the title and the text
preprocessed_df1,article_section_mapping_path,section_article_mapping_path = preprocess_data(
    raw_file_path="../data/raw.csv",
    section_by="sentence",
    input_type=['title','text'],
)
preprocessed_df1.head()

Processing data: 100%|██████████| 5/5 [00:04<00:00,  1.14it/s]


Unnamed: 0,article_id,text,section_id
0,0,ey achieve high growth nearly decade report re...,0
1,0,usb invest audit quality innovation technology...,1
2,0,mark successful year history organization high...,2
3,0,ey today publish value realize report expand a...,3
4,0,carmine di sibio ey global chairman ceo say ey...,4


In [None]:
# import pandas as pd
# BUCKET_NAME="search-relevancy"
# FILE_NAME="preprocessed.csv"
# pradyu=pd.read_csv(f's3://{BUCKET_NAME}/{FILE_NAME}',index_col=0)
# pradyu.article_id.value_counts()