# Import German language data

In [None]:
import csv
import pandas as pd

# Show full cell contents instead of truncating long strings. `None` is the
# supported way to lift the width limit; the former `-1` sentinel has been
# deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)
# Allow up to 1000 rows to be rendered before pandas truncates the output.
pd.set_option('display.max_rows', 1000)

# Use (almost) the full width of the browser window for the notebook.
# `display` and `HTML` come from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# Load the clips table (tab-separated, UTF-8).
#  - quoting=csv.QUOTE_NONE: quote characters are treated as ordinary text
#    and get no special handling while splitting fields.
#  - on_bad_lines="warn": malformed rows are dropped with a warning instead
#    of aborting the import. This replaces `error_bad_lines=False`, which was
#    deprecated in pandas 1.3 and removed in pandas 2.0 (its default
#    companion `warn_bad_lines=True` produced the same warn-and-skip result).
df = pd.read_csv(
    '../corpora/de/clips.tsv',
    sep="\t",
    parse_dates=False,
    engine="python",
    encoding="utf-8",
    on_bad_lines="warn",
    quotechar='"',
    quoting=csv.QUOTE_NONE,
)

# Keep only the rows recorded for the German locale.
df_de = df[df['locale'] == "de"]

# "Valid data": rows with at least two votes in total and more up- than
# down-votes. The explicit copy makes `valid` an independent frame, so later
# in-place column assignments on it do not raise pandas'
# SettingWithCopyWarning or depend on view-vs-copy semantics.
valid = df_de.loc[
    (df_de.up_votes + df_de.down_votes > 1) & (df_de.up_votes > df_de.down_votes),
    :,
].copy()

# Apply current 'common' preprocessors

In [None]:
import unicodedata

from urllib.parse import unquote
from html.parser import HTMLParser


class _HTMLStripper(HTMLParser):
    """Parser that discards markup and collects only the text content.

    Character references (e.g. ``&amp;``) are converted to the characters
    they stand for before the text is collected.

    Examples:
        >>> stripper = _HTMLStripper()
        >>> stripper.feed("<b>Hallo</b> Welt")
        >>> stripper.close()
        >>> stripper.get_data()
        'Hallo Welt'
    """

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no second reset is
        # needed here. The old `strict` attribute is not set any more: the
        # corresponding HTMLParser option was removed in Python 3.5.
        super().__init__(convert_charrefs=True)
        # Text chunks in the order the parser reported them.
        self.fed = []

    def handle_data(self, d):
        """Collect one chunk of text found outside of tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far as one string."""
        return "".join(self.fed)


def _strip_tags(html):
    """Removes HTML tags from passed text.

    Args:
      html (str): String containing HTML

    Returns:
      (str): String with HTML removed
    """
    s = _HTMLStripper()
    s.feed(html)
    # Signal end of input so the parser flushes text it is still buffering.
    # Without this, trailing text that the parser considers possibly
    # incomplete (e.g. the "&T" at the end of "AT&T") stays in its internal
    # buffer and is silently missing from the result.
    s.close()
    return s.get_data()


def _strip_string(sentence):
    """Drops every character whose unicode category is not whitelisted.

    Letters, numbers, marks, punctuation, symbols and space separators are
    kept; characters of any other category are removed.

    The full list of categories is available here:
    http://www.fileformat.info/info/unicode/category/index.htm
    """
    whitelist = {
        'LC', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',        # letters
        'Nd', 'Nl', 'No',                          # numbers
        'Mc', 'Me', 'Mn',                          # marks
        'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',  # punctuation
        'Sc', 'Sk', 'Sm', 'So',                    # symbols
        'Zs',                                      # space separators
    }

    return ''.join(
        char for char in sentence
        if unicodedata.category(char) in whitelist
    )


def common(sentence):
    """Language independent clean-up applied to a single sentence.

    The sentence is URL-decoded, then stripped of HTML tags and finally
    reduced to characters of the whitelisted unicode categories.

    Args:
      sentence (str): Sentence to be cleaned up.

    Returns:
      (str): Cleaned up sentence. Returning None or a `str` of whitespace flags the sentence as invalid.
    """
    url_decoded = unquote(sentence)
    without_html = _strip_tags(url_decoded)
    # TODO: Clean up data in a language independent manner
    return _strip_string(without_html)


# Run the language independent clean-up over every sentence of the valid rows.
valid.loc[:, 'sentence'] = valid['sentence'].apply(common)

# Apply current 'de' preprocessors

In [None]:
def _preprocessor_wrapper(client_id, sentence, up_votes, down_votes):
    """Runs the `de` preprocessor on one row and down-votes rejected sentences.

    Args:
      client_id (str): Client ID of sentence's speaker
      sentence (str): Sentence to be cleaned up.
      up_votes: Number of up-votes of the row.
      down_votes: Number of down-votes of the row.

    Returns:
      (pd.Series): Cleaned sentence, up-votes and down-votes, in that order.
      If the preprocessor rejected the sentence, the votes are replaced by
      0 up-votes and 2 down-votes.
    """
    sentence = de(client_id, sentence)
    # `de` flags an invalid sentence by returning None or a whitespace-only
    # string; such rows get more down- than up-votes.
    if sentence is None or not sentence.strip():
        up_votes = 0
        down_votes = 2
    return pd.Series([sentence, up_votes, down_votes])

import re
# Whole-sentence quotation patterns. Every pattern is anchored at both ends so
# that it only applies when the complete sentence has the expected shape.
QUOTE_PATTERN = re.compile(r'^\"{3}(.*)\"{2}(.*)\"{1}$')
QUOTE_PATTERN_2 = re.compile(r'^\"{1}(.*)\"{2}(.*)\"{2}(.*)\"{1}$')
# The trailing `$` matters: without it a sentence such as `"Ja", sagte er.`
# matches on its quoted prefix only and everything after the closing quote
# would be thrown away by `_remove_multi_quotes`.
QUOTE_PATTERN_3 = re.compile(r'^\"{1}(.*)\"{1}$')


def _remove_multi_quotes(sentence):
    """Removes the quotes from sentences that consist entirely of a pattern like
    \"\"\"content\"\"content\" or
    \"content\"\"content\"\"content\" or
    \"content\"

    The patterns are tried in the order listed above and the first one that
    matches the whole sentence wins. The result is the concatenation of the
    captured content parts.

    Args:
      sentence (str): Sentence to be cleaned up.

    Returns:
      (str): Cleaned up sentence. Returns the sentence 'as-is', if none of
      the patterns matches the complete sentence.
    """
    for pattern in (QUOTE_PATTERN, QUOTE_PATTERN_2, QUOTE_PATTERN_3):
        found = pattern.match(sentence)
        if found is not None:
            # All groups of these patterns always take part in a match, so
            # joining them yields the content without the quotes.
            return "".join(found.groups())

    return sentence
   
                             
def de(client_id, sentence):
    """German-specific clean-up of a single sentence.

    Args:
      client_id (str): Client ID of sentence's speaker
      sentence (str): Sentence to be cleaned up.

    Returns:
      (str): Cleaned up sentence. Returning None or a `str` of whitespace flags the sentence as invalid.
    """
    # TODO: Clean up de data
    # So far the only step is stripping the known quotation mark patterns.
    return _remove_multi_quotes(sentence)

# Work on an independent (deep) copy so `valid` keeps its current state.
test = valid.copy()

# Feed the four input columns of each row to the wrapper and write the three
# returned values back into their columns.
test[["sentence", "up_votes", "down_votes"]] = (
    test[["client_id", "sentence", "up_votes", "down_votes"]]
    .apply(lambda row: _preprocessor_wrapper(*row), axis=1)
)


In [None]:

# Show the first 1000 distinct sentences in ascending order.
test['sentence'].drop_duplicates().sort_values().head(1000)