## Setup

In [1]:
import os
import re
import pandas as pd
from string import printable

In [2]:
# Base directory of the project and the data folder beneath it.
ROOT_DIR = '/notebooks/horoscope'
DATA_DIR = os.path.join(ROOT_DIR, 'data')

## Pre-process

### EDA-based cleaning:

- Remove duplicate texts from all categories. ✅ 

- Replace special font characters with regular font characters. ✅ 

- Replace sign names in texts with a generic word like *friend*. ✅ 

In [3]:
import os
import re
import pandas as pd
from string import printable
from typing import List

# Global paths: the project base directory and the data folder beneath it.
ROOT_DIR = '/notebooks/horoscope'
DATA_DIR = os.path.join(ROOT_DIR, 'data')

def remove_duplicates(data_frame:pd.DataFrame) -> pd.DataFrame:
    """Return a new frame without rows whose ``text`` value repeats.

    The first occurrence of each distinct ``text`` value is kept and later
    occurrences are removed. Surviving rows keep their original order and
    index labels; the input frame is not modified.

    Rows are removed by position rather than by index label, so a frame whose
    index contains repeated labels only loses the actual duplicate rows.

    Args:
        data_frame: Frame containing a ``text`` column.

    Returns:
        A new DataFrame with duplicate ``text`` rows removed.

    Raises:
        KeyError: If ``data_frame`` has no ``text`` column.
    """
    return data_frame.drop_duplicates(subset='text', keep='first')

def replace_special_chars(text:str):
    """Swap typographic punctuation in ``text`` for plain keyboard characters.

    Curly apostrophes and curly double quotes become straight ones, the en
    dash becomes a hyphen, and the single-character ellipsis becomes one
    period. Returns the converted string.
    """
    # (pattern, replacement) pairs, applied one after another.
    substitutions = (
        ("’", "'"),
        ("–", "-"),
        ('“', '"'),
        ('”', '"'),
        ("…", "."),
    )
    cleaned = text
    for fancy, plain in substitutions:
        cleaned = re.sub(fancy, plain, cleaned)
    return cleaned
    
def replace_signs(all_signs:List, text:str):
    """Return ``text`` with every listed sign name replaced by 'friend'.

    Matching is a case-sensitive substring search, so each name in
    ``all_signs`` must be spelled exactly as it appears in the text.
    """
    result = text
    for sign_name in all_signs:
        # Nothing to do for names that do not occur in the text.
        if sign_name not in result:
            continue
        result = result.replace(sign_name, 'friend')
    return result

if __name__ == "__main__":
    print("-"*50)
    # Column names are assigned here; with `names` given, pandas reads the
    # first line of the file as data rather than as a header.
    df = pd.read_csv(os.path.join(DATA_DIR, 'horoscope_final.csv'),
                     names=['sign', 'category', 'date', 'text'])
    print(f"Original Date Shape: {df.shape}")
    print("Removing duplicates...")
    df = remove_duplicates(df)
    # Sign labels are title-cased because replace_signs matches case-sensitively.
    all_signs = [s.title() for s in df.sign.value_counts().index]
    print(f"New Data Shape {df.shape}")
    # Normalise typographic punctuation, then swap sign names for 'friend'.
    # The punctuation step was defined but previously never applied.
    df['text'] = df.text.apply(replace_special_chars)
    df['text'] = df.text.apply(lambda x: replace_signs(all_signs, x))

    # Only the category and the cleaned text are written, without a header
    # row and without the index column.
    df = df[['category', 'text']]
    df.to_csv(os.path.join(DATA_DIR, 'horoscope_cleaned.csv'),
              header=False,
              index=False)
    print("File Saved.")
    print("Success!")
    print("-"*50)

--------------------------------------------------
Original Date Shape: (21960, 4)
Removing duplicates...
New Data Shape (12051, 4)
File Saved.
Success!
--------------------------------------------------


In [63]:
# Sample text containing the sign name "Aries"; used below to spot-check sign replacement.
test = "Minor tensions could arise today, Aries. People close to you, perhaps your mate or parents, seem to be trying to force you to adopt a behavior that you aren't at all willing to follow. Will you negotiate your independence gently or tear yourself free from their domination? The second possibility seems more likely."

In [64]:
# Spot-check the cleaning helper on the sample sentence; `test` is rebound to
# the cleaned text so it can be inspected afterwards.
test = replace_signs(all_signs, test)
print(test)

Minor tensions could arise today, friend. People close to you, perhaps your mate or parents, seem to be trying to force you to adopt a behavior that you aren't at all willing to follow. Will you negotiate your independence gently or tear yourself free from their domination? The second possibility seems more likely.
