In [None]:
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS

plt.style.use('fivethirtyeight')

In [None]:
# globals
# Dataset location: <parent directory>/input/urbandictionary
DATA_DIR = os.path.join(os.pardir, 'input', 'urbandictionary')

# Column names handed to `pd.read_csv(names=...)` when the CSV files are loaded.
COL_NAMES = [
    'character',
    'browsing_page_url',
    'word_url',
    'word',
    'definition',
    'sentence',
]

## Reading Data

In [None]:
# Collect every `urban_data*.csv` below DATA_DIR/Urban.
# os.walk yields directories and files in a filesystem-dependent order, so the
# result is sorted to keep the row order of the concatenated frame reproducible.
file_paths = sorted(
    os.path.join(root, fname)
    for root, _dirs, files in os.walk(os.path.join(DATA_DIR, 'Urban'))
    for fname in files
    if fname.startswith('urban_data') and fname.endswith('.csv')
)

In [None]:
# The files are read with explicit column names (`names=`), then stacked row-wise.
df_urban = pd.concat(
    [pd.read_csv(csv_path, names=COL_NAMES) for csv_path in file_paths]
)

In [None]:
# (rows, columns) of the concatenated frame, before any rows are dropped.
df_urban.shape

In [None]:
# Each file contributes its own row labels, so replace them with one fresh
# 0..n-1 index (drop=True discards the old labels instead of keeping them as a column).
df_urban = df_urban.reset_index(drop=True)
df_urban.head()

In [None]:
# all unique character are present check
# Nulls have not been dropped yet at this point; a NaN among the strings would
# make sorted() raise TypeError (float vs. str comparison), so exclude it first.
print(sorted(df_urban['character'].dropna().unique()))

In [None]:
# null values check
# `isnull` is an alias of `isna`, so a single mask covers both None and NaN.
df_nulls = df_urban[df_urban.isna().any(axis=1)]
df_nulls.shape

In [None]:
# random samples check (some may contain harsh language)
df_sample = df_urban[['word', 'definition', 'sentence']].sample(1)

# Iterate over plain tuples instead of writing into rows of `.values`: that array
# can be a view of the frame (so the write would alter df_sample) and is read-only
# when pandas Copy-on-Write is enabled, which makes the item assignment fail.
for word, meaning, sentence in df_sample.itertuples(index=False, name=None):
    print("Word: ", word)
    print("Meaning: ", re.sub('\r', ' ', meaning))
    print("Sentence: ", re.sub('\r', ' ', sentence))
    print("---"*20)

In [None]:
# drop nulls
# dropna() removes the same rows as dropping `df_nulls.index`, but it can be
# re-run safely: drop() raises KeyError once those labels are already gone.
df_urban = df_urban.dropna()
df_urban.shape

## Very Simple EDA

- Number of words per character

- Length of words, meaning, sentence

- Number of characters in word, meaning, sentence

- Frequent special characters used

In [None]:
# histogram simple helper function
def plot_hist(vals, bins, title, xlabel, ylabel):
    """Draw a count histogram of `vals` on a new 12x8 figure and show it.

    Parameters
    ----------
    vals : array-like of numbers to bin (e.g. a DataFrame column).
    bins : bin specification forwarded to seaborn (here: the number of bins).
    title, xlabel, ylabel : text for the figure title and the axis labels.
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    # `distplot` is deprecated in seaborn; `histplot` is its replacement for
    # plain histograms. Fall back only when running an older seaborn without it.
    if hasattr(sns, 'histplot'):
        sns.histplot(vals, bins=bins, kde=False, ax=ax)
    else:
        sns.distplot(vals, kde=False, bins=bins, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    plt.show()

#### By Word/Phrase/Slang analysis

In [None]:
# Per-row size of the `word` text: character count and whitespace-separated token count.
df_urban['word_chars_num'] = df_urban['word'].map(len)
df_urban['word_words_num'] = df_urban['word'].map(lambda text: len(text.split()))

In [None]:
plot_hist(
    vals=df_urban['word_chars_num'],
    bins=70,
    title='Characters length in Word/Phrase/Slang',
    xlabel='Length',
    ylabel='Count',
)

In [None]:
# Quantiles (10th to 99.9th percentile) of the character count of `word`.
df_urban['word_chars_num'].quantile([0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 0.999])

In [None]:
plot_hist(
    vals=df_urban['word_words_num'],
    bins=50,
    title='Length of Words in Word/Phrase/Slangs',
    xlabel='Length',
    ylabel='Count',
)

In [None]:
# Quantiles (10th to 99.9th percentile) of the whitespace-separated token count of `word`.
df_urban['word_words_num'].quantile([0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 0.999])

In [None]:
# curiosity: what are some slangs with more than 3 words?
# Select the `word` column of the matching rows, then draw 5 of them at random.
df_urban.loc[df_urban['word_words_num'] > 3, 'word'].sample(5).values

#### By Definition Analysis

In [None]:
# Per-row size of the `definition` text: character count and whitespace-separated token count.
df_urban['defn_chars_num'] = df_urban['definition'].map(len)
df_urban['defn_words_num'] = df_urban['definition'].map(lambda text: len(text.split()))

In [None]:
plot_hist(
    vals=df_urban['defn_chars_num'],
    bins=80,
    title='Characters length in Definition',
    xlabel='Length',
    ylabel='Count',
)

In [None]:
# Quantiles (10th to 99.9th percentile) of the character count of `definition`.
df_urban['defn_chars_num'].quantile([0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 0.999])

In [None]:
plot_hist(
    vals=df_urban['defn_words_num'],
    bins=80,
    title='Length of Words in Definition',
    xlabel='Length',
    ylabel='Count',
)

In [None]:
# Quantiles (10th to 99.9th percentile) of the whitespace-separated token count of `definition`.
df_urban['defn_words_num'].quantile([0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 0.999])

#### By Sentence Analysis

In [None]:
# Per-row size of the `sentence` text: character count and whitespace-separated token count.
df_urban['sent_chars_num'] = df_urban['sentence'].map(len)
df_urban['sent_words_num'] = df_urban['sentence'].map(lambda text: len(text.split()))

In [None]:
plot_hist(
    vals=df_urban['sent_chars_num'],
    bins=80,
    title='Characters length in Sentence',
    xlabel='Length',
    ylabel='Count',
)

In [None]:
# Quantiles (10th to 99.9th percentile) of the character count of `sentence`.
df_urban['sent_chars_num'].quantile([0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 0.999])

In [None]:
# Shape of the subset whose `sentence` is longer than 1000 characters
# (the first element is the number of such rows).
df_urban[df_urban.sent_chars_num > 1000].shape

In [None]:
plot_hist(
    vals=df_urban['sent_words_num'],
    bins=200,
    title='Length of Words in Sentence',
    xlabel='Length',
    ylabel='Count',
)

In [None]:
# Quantiles (10th to 99.9th percentile) of the whitespace-separated token count of `sentence`.
df_urban['sent_words_num'].quantile([0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 0.999])

You can choose to remove all the words, definitions and sentences with extreme lengths.

## Cleaning

These are the cleaning steps I believe need to be done, based on the data:

- Remove any row with nulls or nans. ✅

- Replace \r with single white space in a string. ✅

- Remove trailing white spaces at the end. ✅

- Remove word / meaning / sentence that are empty strings. (None of them have empty strings) ✅

- Remove spaces that appear before a period or other punctuation mark (e.g. `word .` → `word.`). ✅

- Replace double or more spaces with single space. ✅

- Remove emojis. ✅

- Apply transformations based off EDA. ✅

*I had it as a script so some steps are repetitive.*

In [None]:
import os
import re
import pandas as pd
from tqdm.auto import tqdm

# set globals
# Dataset location: <parent directory>/input/urbandictionary
DATA_DIR = os.path.join(os.pardir, 'input', 'urbandictionary')
# Column names handed to `pd.read_csv(names=...)` when the CSV files are loaded.
COL_NAMES = [
    'character',
    'browsing_page_url',
    'word_url',
    'word',
    'definition',
    'sentence',
]

def replace_special(string):
    r"""Return `string` with every carriage return (``\r``) replaced by one space."""
    return re.sub('\r', ' ', string)

def replace_space_before_punct(string):
    """Tidy the whitespace around apostrophes, quotes, commas and end punctuation.

    The substitutions below are applied in order, each one to the result of the
    previous one, and the final text is returned.
    """
    substitutions = (
        # word + spaces + curly apostrophe + word  ->  straight apostrophe, no gap
        (r"\b\s+’\b", r"'"),
        # double quote + one whitespace + word     ->  quote directly before the word
        (r"\"\s\b", r'"'),
        # word + spaces + comma (+ spaces) + word  ->  ", "
        (r"\b\s+,\s*\b", r', '),
        # whitespace before ? . ! " that is followed by whitespace or end of text
        (r'\s([?.!"](?:\s|$))', r'\1'),
    )
    new_str = string
    for pattern, replacement in substitutions:
        new_str = re.sub(pattern, replacement, new_str)
    return new_str

def replace_double_spaces(string):
    """Collapse every run of whitespace into a single space.

    Because the text is split on whitespace and re-joined, leading and
    trailing whitespace is removed as well.
    """
    tokens = string.split()
    return ' '.join(tokens)

# Compiled once at definition time.
# The original character class contained the single range U+24C2-U+1F251, which
# spans most of the Basic Multilingual Plane above U+24C2 (CJK ideographs, kana,
# Hangul, full-width forms, ...), so ordinary non-Latin text was deleted too,
# while newer emoji blocks (U+1F900 and up) were not covered at all.
# It is replaced here by the emoji-related blocks it was meant to cover.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F251"  # tiles/cards, enclosed alphanumerics & ideographs
    "\u2600-\u26FF"          # miscellaneous symbols
    "\u2702-\u27B0"          # dingbats
    "\u2B00-\u2BFF"          # miscellaneous symbols and arrows (e.g. star)
    "\u24C2"                 # circled M
    "\uFE0F"                 # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(string):
    """Return `string` with emoji and pictographic symbols removed.

    Runs of matching characters are deleted outright (no replacement character
    is inserted), so surrounding whitespace is left exactly as it was.
    Adapted from: https://stackoverflow.com/a/49146722/330558
    """
    return _EMOJI_PATTERN.sub('', string)

def eda_based_cleaning(df, max_word_chars=17, max_word_words=3,
                       max_defn_chars=190, max_defn_words=38,
                       max_sent_chars=155, max_sent_words=25):
    """Filter out rows whose word, definition or sentence is too long.

    Six length columns (`<prefix>_chars_num` / `<prefix>_words_num` for the
    prefixes `word`, `defn`, `sent`) are added to `df` itself, i.e. the frame
    passed in is modified in place. Rows are then filtered in three stages and
    the remaining row count is printed after each stage.

    Parameters
    ----------
    df : DataFrame with string columns `word`, `definition` and `sentence`
        (no nulls, since `len` is applied to every value).
    max_*_chars, max_*_words : inclusive upper limits on the number of
        characters / whitespace-separated tokens. The defaults are the
        thresholds the notebook settled on after the EDA.

    Returns
    -------
    DataFrame
        An independent copy of the rows within all limits, so callers can
        assign to its columns without a SettingWithCopyWarning.
    """
    def _n_words(text):
        return len(text.split())

    for prefix, column in (('word', 'word'), ('defn', 'definition'), ('sent', 'sentence')):
        df[prefix + '_chars_num'] = df[column].apply(len)
        df[prefix + '_words_num'] = df[column].apply(_n_words)

    df = df[(df.word_chars_num <= max_word_chars) & (df.word_words_num <= max_word_words)]
    print("After filtering based on word length: ", df.shape[0])
    df = df[(df.defn_chars_num <= max_defn_chars) & (df.defn_words_num <= max_defn_words)]
    print("After filtering based on definition length: ", df.shape[0])
    df = df[(df.sent_chars_num <= max_sent_chars) & (df.sent_words_num <= max_sent_words)]
    print("After filtering based on sentence length: ", df.shape[0])

    print("New dataframe shape: ", df.shape)

    # Boolean filtering returns a frame pandas still tracks as derived from the
    # input; hand back a real copy so later column assignments are unambiguous.
    return df.copy()

def final_clean(text):
    """Apply all text-cleaning steps to one string and return the result.

    Order matters:
    1. carriage returns become spaces;
    2. emoji are removed *before* whitespace is collapsed, otherwise deleting
       an emoji that sat between two spaces leaves a double space behind;
    3. whitespace runs are collapsed (and the ends trimmed) *before* the
       punctuation fix-up, whose patterns only remove a single space in
       front of closing punctuation.
    """
    cleaned_text = replace_special(text)
    cleaned_text = remove_emoji(cleaned_text)
    cleaned_text = replace_double_spaces(cleaned_text)
    cleaned_text = replace_space_before_punct(cleaned_text)

    return cleaned_text

if __name__ == "__main__":
    # Script-style pipeline: load -> drop nulls -> length filter -> text cleaning.
    print("-"*50)
    print("Loading data...")
    file_paths = []
    for root, dirs, files in os.walk(os.path.join(DATA_DIR, 'Urban')):
        for f in files:
            if f.endswith('.csv') and f.startswith('urban_data'):
                file_paths.append(os.path.join(root, f))
    # os.walk order is filesystem-dependent; sort so the row order is reproducible.
    file_paths.sort()

    df_urban = pd.concat([pd.read_csv(f, names=COL_NAMES) for f in file_paths])
    print("Data loaded.")
    print("Data shape: ", df_urban.shape)
    # drop=True: the per-file row labels carry no information. Without it the old
    # labels were kept as an extra `index` column (and a second reset further
    # down added `level_0` on top), polluting the data columns.
    df_urban = df_urban.reset_index(drop=True)
    print("-"*50)

    # `isnull` is an alias of `isna`; one mask is enough.
    df_nulls = df_urban[df_urban.isna().any(axis=1)]
    print("Records with at least one column null: ", df_nulls.shape[0])
    print("Dropping nulls.")
    df_urban = df_urban.drop(df_nulls.index).reset_index(drop=True)
    print("New data shape: ", df_urban.shape)
    print("-"*50)

    print("Applying transformation based off EDA.")
    df_urban_new = eda_based_cleaning(df_urban)
    print("-"*50)

    print("Cleaning texts...")
    # assign() builds a new frame, so nothing is written into a filtered slice
    # of df_urban (which would trigger SettingWithCopyWarning).
    df_urban_new = df_urban_new.assign(**{
        col: df_urban_new[col].apply(final_clean)
        for col in ('word', 'definition', 'sentence')
    })
    print("Data shape: ", df_urban_new.shape)
    print("-"*50)

    print("Success!")

#### New pre-processed data samples

In [None]:
df_sample = df_urban_new[['word', 'definition', 'sentence']].sample(1)

# Iterate over plain tuples instead of writing into rows of `.values`: that array
# can be a view of the frame (so the write would alter df_sample) and is read-only
# when pandas Copy-on-Write is enabled, which makes the item assignment fail.
for word, meaning, sentence in df_sample.itertuples(index=False, name=None):
    print("Word: ", word)
    print("Meaning: ", re.sub('\r', ' ', meaning))
    print("Sentence: ", re.sub('\r', ' ', sentence))
    print("---"*20)