In [1]:
import emoji
import fasttext
from happytransformer import HappyTextToText, TTSettings
import json
import numpy as np
import os
import pandas as pd
import re
import spacy
from spellchecker import SpellChecker
import unicodedata

# Lift pandas' display truncation so printed DataFrames show every column and row.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def standardize_case(dataset):
    """Lower-case every entry of the ``content`` column of ``dataset`` in place.

    Nothing is returned; the column is overwritten on the DataFrame passed in.
    """
    lowered_content = dataset["content"].str.lower()
    dataset["content"] = lowered_content

In [3]:
def unicode_normalization(dataset):
    """Fold the ``content`` column of ``dataset`` down to plain ASCII, in place.

    Each string is first decomposed with NFKD, so accented letters and
    ligatures split into a base character plus combining marks. Only then
    are the characters that still have no ASCII equivalent dropped.
    Encoding to ASCII *before* normalizing would delete the accented letter
    outright (turning "café" into "caf" instead of "cafe").
    """
    def _to_ascii(text):
        # Decompose first; encode(..., "ignore") then drops only the
        # combining marks and other characters with no ASCII equivalent.
        decomposed = unicodedata.normalize("NFKD", text)
        return decomposed.encode("ascii", "ignore").decode("ascii")

    dataset["content"] = dataset["content"].apply(_to_ascii)

In [4]:
def remove_whitespace(dataset):
    """Flatten newlines and trim the ends of each ``content`` entry, in place.

    Every newline character is replaced by a single space, then leading and
    trailing whitespace is stripped.
    """
    single_line = dataset["content"].str.replace("\n", " ", regex = False)
    dataset["content"] = single_line.str.strip()

In [5]:
def correct_quotations(dataset):
    """Return a copy of the string Series with all quote variants unified.

    Curly single quotes, straight and curly double quotes, and backticks are
    each replaced by a plain apostrophe (').

    dataset: a pandas Series of strings (despite the name, not a DataFrame).
    """
    quote_variants = ("’", "‘", "\"", "”", "“", "`")

    normalized = dataset
    for variant in quote_variants:
        normalized = normalized.str.replace(variant, "'", regex = False)

    return normalized

In [6]:
def expand_contractions(dataset):
    """Replace English contractions in ``dataset["content"]`` with full forms, in place.

    The contraction table is downloaded from Wikipedia on every call (second
    HTML table on the page), cleaned up, patched with a handful of manual
    overrides, and then used as a whitespace-token lookup: each token that
    exactly equals a known contraction is swapped for its expansion.

    NOTE(review): the manual overrides address rows by label, so they assume
    the current row order of the live Wikipedia table -- confirm they still
    point at the intended rows if the page changes.
    """
    dataset["content"] = correct_quotations(dataset["content"])

    url = "https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions"
    contractions_df = pd.read_html(url)[1]

    # Drop the register annotations, then trim what is left.
    for register_tag in ("(informal)", "(formal)"):
        contractions_df["Contraction"] = contractions_df["Contraction"].str.replace(
            register_tag, "", regex = False)
    contractions_df["Contraction"] = contractions_df["Contraction"].str.strip()

    # Keep only the first "/"-separated alternative, cut at the first
    # "(", "," or "[" when one is present.
    cut_pattern = re.compile(r"(.*?)[\(,\[]")
    full_forms = []
    for alternatives in contractions_df["Full Form"].str.split("/"):
        first_alternative = alternatives[0]
        found = cut_pattern.search(first_alternative)
        if found:
            # group() ends with the delimiter itself; [:-1] removes it.
            full_forms.append(found.group()[:-1].strip())
        else:
            full_forms.append(first_alternative.strip())
    contractions_df["Full Form"] = full_forms

    # Hand-written corrections, applied by row label.
    manual_overrides = (
        (3, "Contraction", "n"),
        (4, "Full Form", "are not you"),
        (22, "Full Form", "do not know"),
        (62, "Full Form", "is not it"),
        (63, "Full Form", "i do not"),
        (68, "Full Form", "it is"),
        (69, "Full Form", "i do not know"),
        (132, "Full Form", "we all"),
        (181, "Full Form", "yes madam"),
    )
    for row_label, column_name, replacement in manual_overrides:
        contractions_df.at[row_label, column_name] = replacement

    for column_name in ("Contraction", "Full Form"):
        contractions_df[column_name] = correct_quotations(contractions_df[column_name])

    short_forms = contractions_df["Contraction"].str.lower()
    long_forms = contractions_df["Full Form"].str.lower()
    contractions = dict(zip(short_forms, long_forms))

    def _expand_tokens(review):
        return [contractions.get(token, token) for token in review.split()]

    dataset["content"] = dataset["content"].apply(_expand_tokens).str.join(" ")

In [7]:
def convert_emojis_to_text(dataset):
    """Replace emojis in ``dataset["content"]`` with their text names, in place.

    A space is inserted in front of every single-character emoji so that the
    name produced by ``emoji.demojize`` does not fuse with the preceding
    word. The padded string is stripped before it is demojized.

    The transformation is applied per value with ``Series.apply``, so it
    does not depend on the DataFrame having a default 0..n-1 index. Writing
    back with ``dataset.at[position, ...]`` would add or misplace rows on
    any other index.
    """
    def _pad_emojis(review):
        # Built with join rather than repeated string concatenation.
        padded = "".join(" " + character if emoji.is_emoji(character) else character
                         for character in review)
        return padded.strip()

    dataset["content"] = dataset["content"].apply(
        lambda review: emoji.demojize(_pad_emojis(review)))

In [8]:
# Special characters across all datasets
# {']', '*', '@', '%', '!', '/', ';', '&', "'", '$', '#', '~', '^', '}', '`', 
#  '\\', '.', '[', '(', '|', '<', ')', '+', '>', '=', ':', '{', '?', '-', ','}

def filter_out_special_characters(dataset):
    """Report and then neutralise special characters in ``dataset["content"]``.

    First prints, for every distinct character that is neither a word
    character nor whitespace, how many times it occurs across all reviews.
    Then, in place: the characters ``* # ~ ^ \\ | < > =`` become a space and
    ``& + @ / % $`` are spelled out as words padded with spaces.
    """
    special_characters = set()
    for review in dataset["content"]:
        found_in_review = re.findall(r"[^\w\s]", review)
        special_characters.update(found_in_review)

    for character in special_characters:
        per_review_counts = dataset["content"].apply(lambda s : s.count(character))
        print(character + " has {} occurrences".format(per_review_counts.sum()))

    dataset["content"] = dataset["content"].str.replace(r"[*#~^\\|<>=]", " ", regex = True)

    # Symbol -> spelled-out replacement, applied in this order.
    spelled_out = (
        ("&", " and "),
        ("+", " plus "),
        ("@", " at "),
        ("/", " or "),
        ("%", " percent "),
        ("$", " dollar "),
    )
    for symbol, wording in spelled_out:
        dataset["content"] = dataset["content"].str.replace(symbol, wording, regex = False)

In [9]:
def keep_english_reviews(dataset, dataset_name):
    """Drop reviews the language model does not label English, in place.

    Loads the fastText language-identification model from
    ``models/lid.176.bin``. A review is removed only if it has at least five
    whitespace-separated tokens and the model's top label is not
    ``__label__en``; shorter reviews are always kept. The positions of the
    removed reviews are printed, the index is reset to 0..n-1, and the
    result is written to ``current_datasets/english/<dataset_name>``.

    dataset: DataFrame with a string ``content`` column; mutated in place.
    dataset_name: file name used for the CSV written at the end.
    """
    language_model_path = "models/lid.176.bin"
    language_model = fasttext.load_model(language_model_path)

    non_english_reviews = [
        position
        for position, review in enumerate(dataset["content"])
        if len(review.split()) >= 5
        and language_model.predict(review, k = 1)[0][0] != "__label__en"
    ]

    print("List of non-english reviews: " + str(non_english_reviews) + "\n")

    # enumerate() yields positions while drop() expects index labels, so
    # translate through the index. The two only coincide for a default
    # RangeIndex.
    dataset.drop(dataset.index[non_english_reviews], axis = 0, inplace = True)
    dataset.reset_index(drop = True, inplace = True)

    dataset.to_csv("current_datasets/english/" + dataset_name)

In [10]:
def merge_title(dataset):
    """Prefix each review's ``content`` with its ``title``, in place.

    If the title already ends with ".", "?" or "!", title and content are
    joined with a single space; otherwise ". " is inserted so the title
    reads as its own sentence. An empty title leaves the content unchanged
    (indexing its last character would raise ``IndexError``).

    dataset: DataFrame with string ``title`` and ``content`` columns.
    """
    def _merge(title, content):
        if not title:
            return content
        separator = " " if title.endswith((".", "?", "!")) else ". "
        return title + separator + content

    # Rebuild the column positionally so the result does not depend on the
    # index labels being unique or 0..n-1.
    dataset["content"] = [_merge(title, content)
                          for title, content in zip(dataset["title"], dataset["content"])]

In [11]:
_ABBREVIATIONS_PATH = "current_datasets/spell/abbreviations.txt"
# Last parsed abbreviation table plus the (mtime, size) of the file it came from.
_abbreviations_cache = {"signature": None, "table": {}}


def _load_abbreviations():
    """Return the abbreviation table, re-parsing the JSON file only when it changed."""
    file_stats = os.stat(_ABBREVIATIONS_PATH)
    signature = (file_stats.st_mtime_ns, file_stats.st_size)
    if _abbreviations_cache["signature"] != signature:
        with open(_ABBREVIATIONS_PATH, "r") as file:
            _abbreviations_cache["table"] = json.load(file)
        _abbreviations_cache["signature"] = signature
    return _abbreviations_cache["table"]


def expand_abbreviation(word):
    """Look ``word`` up in the abbreviation table.

    The table is a JSON object stored in
    ``current_datasets/spell/abbreviations.txt``. It is cached between calls
    and reloaded only when the file's modification time or size changes,
    because this function is called once per word while spell-checking.

    Returns ``(True, expansion)`` when ``word`` is a key of the table and
    ``(False, word)`` otherwise. Raises ``FileNotFoundError`` if the file is
    missing.
    """
    abbreviations = _load_abbreviations()

    return (True, abbreviations[word]) if word in abbreviations else (False, word)

In [12]:
def spell_check_reviews(dataset, name):
    """Spell-correct the ``content`` column of ``dataset`` in place.

    Steps, in order:
      1. Write word-frequency / unknown-word reports for the uncorrected text
         ("before").
      2. Load extra known words from ``current_datasets/spell/words.txt``
         into the spell checker.
      3. For every review, split it into alphanumeric fragments and single
         non-alphanumeric characters, replace each eligible fragment by its
         abbreviation expansion or spelling correction, and re-join with
         spaces.
      4. Remove the spaces that step 3 introduced around punctuation.
      5. Write the unknown-word report again ("after").

    dataset: DataFrame with a string ``content`` column; mutated in place.
    name: file name used for every CSV report written under
          ``current_datasets/spell/``.
    """
    spell = SpellChecker(distance = 2)
    
    def collect_word_information(time):
        """Write word statistics for the current ``content`` column.

        ``time`` is used as the sub-directory name of the unknown-word
        report; the value "before" additionally triggers the full
        word-frequency ("composite") report.
        """
        # Everything that is not an ASCII letter, digit or whitespace becomes a space.
        cleaned_reviews = dataset["content"].str.replace(r"[^a-zA-Z0-9\s]", " ", regex = True)
        # One row per distinct whitespace-separated token, with its count.
        overall_word_frequency = cleaned_reviews.str.split(expand = True).stack().value_counts()
        overall_word_frequency = pd.DataFrame({"Word" : list(overall_word_frequency.index.values),
                                               "Posed Correction" : [spell.correction(w) for w 
                                                                     in overall_word_frequency.index],
                                               "Frequency" : list(overall_word_frequency.values)})
        if time == "before":
            overall_word_frequency.to_csv("current_datasets/spell/composite/" + name)
        
        # Keep multi-character tokens that start and end with a letter and
        # that the spell checker reports as unknown.
        unknown_words = []
        for _, row in overall_word_frequency.iterrows():
            if (len(row["Word"]) != 1 and row["Word"][0].isalpha()
                   and row["Word"][-1].isalpha() and spell.unknown([row["Word"]]) != set()):
                unknown_words.append({"Word" : row["Word"],
                                      "Posed Correction" : row["Posed Correction"],
                                      "Frequency" : row["Frequency"]})
        
        unknown_words = pd.DataFrame.from_records(unknown_words)
        unknown_words.to_csv("current_datasets/spell/" + time + "/" + name)
        
    # The "before" report runs before words.txt is loaded below, so it uses
    # only the checker's built-in vocabulary.
    collect_word_information("before")
    
    spell.word_frequency.load_text_file('current_datasets/spell/words.txt')
    
    # NOTE(review): df_index is a position from enumerate() but is used as a
    # label with dataset.at below -- this assumes a default 0..n-1 index.
    for df_index, review in enumerate(dataset["content"]):
        # Split every whitespace token into runs of alphanumeric characters
        # and individual non-alphanumeric characters, preserving order.
        parsed_word_collection = []
        for word in review.split():
            word_fragment = ""
            for character in word:
                if character.isalnum():
                    word_fragment += character
                else:
                    # Flush the fragment collected so far, then keep the
                    # punctuation character as its own element.
                    if word_fragment != "":
                        parsed_word_collection.append(word_fragment)
                        word_fragment = ""
                    parsed_word_collection.append(character)
                    
            if word_fragment != "":
                parsed_word_collection.append(word_fragment)
        
        for list_index, word in enumerate(parsed_word_collection):
            # Only multi-character alphanumeric fragments that begin and end
            # with a letter are candidates for correction.
            if (len(word) != 1 and word[0].isalpha() 
                    and word[-1].isalpha() and word.isalnum()):
                # expand_abbreviation returns (found, value); a found
                # abbreviation takes precedence over spelling correction.
                expansion = expand_abbreviation(parsed_word_collection[list_index])
                proposed_correction = (spell.correction(parsed_word_collection[list_index]) 
                                           if not expansion[0] else expansion[1])
                # A falsy result leaves the fragment unchanged (presumably
                # spell.correction can return None when it has no candidate
                # -- TODO confirm).
                if proposed_correction:
                    parsed_word_collection[list_index] = proposed_correction                

        dataset.at[df_index, "content"] = " ".join(parsed_word_collection)
        
    # The join above put a space on both sides of every separated
    # punctuation character; undo that for the common cases.
    dataset["content"] = dataset["content"].str.replace(" .", ".", regex = False)
    dataset["content"] = dataset["content"].str.replace(" ,", ",", regex = False)
    dataset["content"] = dataset["content"].str.replace(" ?", "?", regex = False)
    dataset["content"] = dataset["content"].str.replace(" )", ")", regex = False)
    dataset["content"] = dataset["content"].str.replace("( ", "(", regex = False)
    dataset["content"] = dataset["content"].str.replace(" !", "!", regex = False)
    dataset["content"] = dataset["content"].str.replace(" ;", ";", regex = False)
    dataset["content"] = dataset["content"].str.replace(" :", ":", regex = False)
    # Apostrophes lose the space on both sides.
    dataset["content"] = dataset["content"].str.replace(" '", "'", regex = False)
    dataset["content"] = dataset["content"].str.replace("' ", "'", regex = False)
    
    collect_word_information("after")

In [13]:
def grammar_check_reviews(dataset):
    """Grammar-correct the ``content`` column of ``dataset`` in place.

    Each review is sent to the ``vennify/t5-base-grammar-correction`` T5
    model with the ``"grammar: "`` task prefix, and the generated text
    replaces the original review. Generating a correction without storing
    it would leave the dataset untouched.

    dataset: DataFrame with a string ``content`` column; mutated in place.
    """
    happy_tt = HappyTextToText("T5", "vennify/t5-base-grammar-correction")
    args = TTSettings(num_beams = 5, min_length = 1)

    def _correct(review):
        # generate_text returns a result object; the corrected sentence is
        # exposed on its ``text`` attribute.
        return happy_tt.generate_text("grammar: " + review, args = args).text

    dataset["content"] = dataset["content"].apply(_correct)

In [14]:
def tokenize_dataset(cleaned_dataset, raw_dataset, name):
    """Split every cleaned review into sentences and write one row per sentence.

    Sentences are detected with spaCy's ``en_core_web_sm`` pipeline on
    ``cleaned_dataset["content"]``. Each output row carries the columns of
    the review at the same position in ``raw_dataset`` plus:

      * ``reviewNumber``   -- 1-based position of the review,
      * ``sentenceNumber`` -- 1-based position of the sentence in the review,
      * ``sentence``       -- the sentence text.

    The result is written to ``current_datasets/tokenized/<name>``; nothing
    is returned.

    cleaned_dataset: DataFrame whose ``content`` column is segmented and
                     whose columns define the output column order.
    raw_dataset: DataFrame aligned by position with ``cleaned_dataset``.
    name: file name of the CSV to write.
    """
    nlp = spacy.load("en_core_web_sm")

    tokenized_columns = list(cleaned_dataset.columns)
    tokenized_columns.extend(["sentenceNumber", "sentence", "reviewNumber"])

    # Collect plain records and build the frame once; appending through
    # ``.loc[len(df)]`` is quadratic in the number of sentences.
    sentence_records = []
    for review_number, review in enumerate(cleaned_dataset["content"]):
        for sentence_number, sentence in enumerate(nlp(review).sents):
            # Skip spans with no tokens; numbering still counts them.
            if len(sentence) == 0:
                continue
            review_metadata = raw_dataset.iloc[review_number].to_dict()
            review_metadata["reviewNumber"] = review_number + 1
            review_metadata["sentenceNumber"] = sentence_number + 1
            # Store the text, not the spaCy Span object.
            review_metadata["sentence"] = sentence.text
            sentence_records.append(review_metadata)

    tokenized_dataset = pd.DataFrame(sentence_records, columns = tokenized_columns)
    tokenized_dataset.to_csv("current_datasets/tokenized/" + name)

In [2]:
def driver(apply_filter = None, sample = False, sample_size = 400):
    """Run the full cleaning pipeline over the raw review datasets.

    apply_filter: optional list of file names under ``current_datasets/raw``
                  to process; when empty or None every file in that directory
                  is processed.
    sample: when True, additionally write a random sample of each tokenized
            dataset to ``current_datasets/sampled/``.
    sample_size: number of distinct reviews to sample (all sentences of a
                 sampled review are kept). Capped at the number of reviews
                 available.

    For each dataset: read the CSV as strings, (for files whose name contains
    "apple") rename columns and merge the title into the content, lower-case,
    flatten whitespace, keep English reviews, then expand contractions,
    spell-check, convert emojis, normalise Unicode, filter special characters
    and finally tokenize into sentences.
    """
    files = apply_filter if apply_filter else os.listdir("current_datasets/raw")
    
    for dataset_name in files:
        dataset = pd.read_csv("current_datasets/raw/" + dataset_name)
        dataset = dataset.astype(str)
        
        if "apple" in dataset_name:
            dataset.rename(columns = {"review" : "content",
                                      "body" : "developerBody",
                                      "id": "developerID",
                                      "modified" : "developerModified"},
                           inplace = True)
            merge_title(dataset)
        
        print("\nDataset name: " + dataset_name + "\n")
        
        standardize_case(dataset)
        remove_whitespace(dataset)
        keep_english_reviews(dataset, dataset_name)
        
        # Snapshot taken before the text-altering steps; tokenize_dataset
        # uses it for the per-sentence metadata columns.
        english_only_dataset = dataset.copy(deep = True)
        expand_contractions(dataset)
        spell_check_reviews(dataset, dataset_name)
        convert_emojis_to_text(dataset)
        unicode_normalization(dataset)
        filter_out_special_characters(dataset)
        tokenize_dataset(dataset, english_only_dataset, dataset_name)    
    
    if sample:
        for dataset_name in files:
            tokenized_dataset = pd.read_csv("current_datasets/tokenized/" + dataset_name)
            tokenized_dataset = tokenized_dataset.astype(str)
            
            # Fixed seed so the same reviews are drawn on every run.
            np.random.seed(0)
            review_numbers = tokenized_dataset["reviewNumber"].unique()
            # Sample without replacement: drawing with replacement repeats
            # review numbers, so fewer than sample_size reviews come back.
            sampled_review_numbers = np.random.choice(review_numbers,
                                                      min(sample_size, len(review_numbers)),
                                                      replace = False)
            is_sampled = tokenized_dataset["reviewNumber"].isin(sampled_review_numbers)
            # reset_index() returns a new frame, avoiding an in-place
            # modification of a filtered slice.
            sampled_reviews = tokenized_dataset[is_sampled].reset_index()
            sampled_reviews.to_csv("current_datasets/sampled/" + dataset_name)

In [3]:
# Process only the Mayo Clinic Google reviews and draw a sample of 800 reviews.
driver(apply_filter = ["MayoClinic_google.csv"], sample = True, sample_size = 800)