In [1]:
# imports

import csv
import re
import unicodedata
import emoji
import contractions
from textblob import TextBlob
import tqdm

In [2]:
# cleaning

def remove_html(str):
    """Strip HTML-style tags from the text.

    Every ``<...>`` span (shortest possible match, never crossing a newline)
    is deleted; whatever sits between tags is kept as-is.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', str)

def remove_email(str):
    """Delete e-mail-like tokens from the text.

    A token is any run of word characters, dots or hyphens on both sides of
    an ``@`` sign; each such token is replaced by the empty string.
    """
    email_pattern = r'[\w\.-]+@[\w\.-]+'
    without_emails = re.sub(email_pattern, '', str)
    return without_emails

def remove_url(str):
    """Drop every token that starts with ``http`` from the text.

    The match begins at the literal ``http`` and runs up to the next
    whitespace character, so both ``http://`` and ``https://`` links are
    removed.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', str)

def replace_accented(str):
    """Reduce the text to plain ASCII.

    The text is NFKD-normalized so accented letters split into a base letter
    plus combining marks, then every character that cannot be encoded as
    ASCII is discarded.
    """
    decomposed = unicodedata.normalize('NFKD', str)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def remove_wording(str):
    """Remove every literal ``Title:`` marker from the text.

    The match is case-sensitive; surrounding whitespace is left untouched.
    """
    pieces = str.split('Title:')
    return ''.join(pieces)

def remove_emoji(str):
    """Remove emoji (and literal ``:shortcode:`` tokens) from the text.

    Emoji are first rewritten by ``emoji.demojize`` into ``:shortcode:``
    form, then every such token is deleted.

    The token pattern deliberately forbids whitespace and colons between the
    delimiters. The previous ``:.*?:`` pattern matched *anything* between two
    colons, so ordinary prose such as ``"Pros: cheap. Cons: slow"`` lost the
    whole ``": cheap. Cons:"`` span.
    """
    # demojize shortcodes join words with underscores (per the emoji package's
    # naming), so a whitespace-free match still covers them while leaving
    # normal colon-punctuated sentences intact.
    shortcode_pattern = r':[^\s:]+:'
    return re.sub(shortcode_pattern, '', emoji.demojize(str))

def remove_symbols(str):
    """Apply a fixed series of literal symbol substitutions to the text.

    In order: backslashes are deleted, every space-followed-by-dot pair is
    deleted, zero-width spaces (U+200B) become a regular space, and double
    quotes are deleted. The order matters because an earlier substitution
    can expose a match for a later one.
    """
    substitutions = (
        ('\\', ''),
        (' .', ''),
        ('\u200b', ' '),
        ('"', ''),
    )
    for target, replacement in substitutions:
        str = str.replace(target, replacement)
    return str

def remove_spaces(str):
    """Collapse all whitespace runs to single spaces and trim both ends."""
    words = str.split()
    return ' '.join(words)

def remove_contractions(str):
    """Return the text after passing it through ``contractions.fix``.

    NOTE(review): the library is expected to expand contractions (e.g.
    "don't" to "do not"); confirm against the installed version.
    """
    expanded = contractions.fix(str)
    return expanded

def fix_grammar(str):
    """Return the text with TextBlob's spelling correction applied.

    ``TextBlob.correct()`` produces a new ``TextBlob`` object rather than a
    string. It is converted to plain text here so this step returns the same
    type as the other cleaning helpers and callers receive a real string.
    """
    corrected = TextBlob(str).correct()
    # The parameter shadows the ``str`` builtin, so ``str(corrected)`` is not
    # callable here; formatting the blob yields the same text.
    return '{}'.format(corrected)

def clean(str):
    """Run the text through the full cleaning pipeline.

    Each step receives the output of the previous one; the result of the
    last step is returned. The order is significant (for example, whitespace
    is collapsed only after the removal steps have run).
    """
    pipeline = (
        remove_html,
        remove_email,
        remove_url,
        replace_accented,
        remove_wording,
        remove_emoji,
        remove_symbols,
        remove_spaces,
        remove_contractions,
        fix_grammar,
    )
    for step in pipeline:
        str = step(str)
    return str

# Load every raw review up front so the input file is closed before the slow
# cleaning pass starts, instead of staying open for the whole run.
# NOTE(review): both files are opened with the platform's default text
# encoding; confirm the raw CSV matches it (or pass encoding= explicitly).
with open('data/reviews_simple_raw.csv', 'r', newline='') as raw_file:
    reader = csv.DictReader(raw_file)
    raw = list(reader)

# Clean the two free-text columns in place; the rating columns pass through
# unchanged.
for row in tqdm.tqdm(raw):
    row['title'] = clean(row['title'])
    row['body'] = clean(row['body'])

# Columns written to the cleaned file, in output order.
headers = [
    'title',
    'body',
    'rating_item',
    'rating_delivery',
    'rating_seller',
    'rating_marketplace',
    'rating_total',
]

with open('data/reviews_simple_cleaned.csv', 'w', newline='') as cleaned_file:
    writer = csv.DictWriter(
        cleaned_file,
        fieldnames=headers,
        delimiter=',',
        quoting=csv.QUOTE_NONNUMERIC,
    )
    writer.writeheader()
    for row in tqdm.tqdm(raw):
        # Index each column explicitly: any extra input columns are dropped,
        # and a missing one raises KeyError instead of being written blank.
        writer.writerow({column: row[column] for column in headers})

100%|██████████| 1200/1200 [26:16<00:00,  1.31s/it]
100%|██████████| 1200/1200 [00:00<00:00, 14571.78it/s]
