In [30]:
import csv
import pandas as pd

# Load the clip metadata: tab-separated, UTF-8, with double quotes treated as
# ordinary characters (csv.QUOTE_NONE) rather than as field delimiters.
df = pd.read_csv(
    '../corpora/de/clips.tsv',
    sep="\t",
    parse_dates=False,
    engine="python",
    encoding="utf-8",
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
    # "warn" keeps the previous behaviour: malformed rows are dropped and a
    # warning is emitted for each of them.
    on_bad_lines="warn",
    quotechar='"',
    quoting=csv.QUOTE_NONE,
)

# Keep only the rows whose locale is "de".
df_de = df[df['locale'] == "de"]
df_de

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent,locale,bucket
517802,d3ff5946221a64195b7266d7685ca5170623d64bd383b7...,8d31fe1fe37219527930900426bf0614eb87342f3443ab...,Sonja errötete vor Scham.,2,1,twenties,male,germany,de,train
517803,d3ff5946221a64195b7266d7685ca5170623d64bd383b7...,ce16825fb7cd3584a1769cab3d09996bf2cfd29589c8af...,Herbert öffnet den Spülkasten%2C um nachzuscha...,0,2,twenties,male,germany,de,train
517804,d3ff5946221a64195b7266d7685ca5170623d64bd383b7...,32165e67e56d18907ae8b00b04d32f3e0b6a4222462f23...,Hast du was von dem Stoff dabei%3F,2,0,twenties,male,germany,de,train
517808,d3ff5946221a64195b7266d7685ca5170623d64bd383b7...,85c260787785fd84ed0941dcc723735882c72cbc47f3bd...,Bitte teilen Sie sich einigermaßen gleichmäßig...,2,0,twenties,male,germany,de,train
517809,d3ff5946221a64195b7266d7685ca5170623d64bd383b7...,14ff3601059281b2ab8ffe4640ac56d2984ee70f9755c8...,Um sich zu merken%2C ob Steuerbord oder Backbo...,2,0,twenties,male,germany,de,train
517810,d3ff5946221a64195b7266d7685ca5170623d64bd383b7...,6983ce21323d25a96f13d9f1a840e1bccced91d31369f4...,Irgendetwas raschelt da im Gebüsch.,2,0,twenties,male,germany,de,train
517811,d3ff5946221a64195b7266d7685ca5170623d64bd383b7...,887494277b7dd22dcbe4cedbbc826e7e360005a9fc1a4a...,Ihr könnt nach Hause fahren!,2,0,twenties,male,germany,de,train
517812,d3ff5946221a64195b7266d7685ca5170623d64bd383b7...,586c390eee91025d62f6c5776ba071f0c994708a33233a...,Den lassen wir nicht vom Haken.,2,0,twenties,male,germany,de,train
517813,d3ff5946221a64195b7266d7685ca5170623d64bd383b7...,47cb7c02068bac08d95dc2d4d200ca8793b9653dc7e2bf...,Ob das wirklich Elfmeter war%2C erscheint dem ...,2,0,twenties,male,germany,de,train
517814,d3ff5946221a64195b7266d7685ca5170623d64bd383b7...,d8de3f8fb5a9e8e03ce53391a9087c89fee44bd5e6c5f5...,Von mir aus auch das.,2,0,twenties,male,germany,de,train


In [26]:
# Number of non-null values in each column of the "de" subset.
df_de.count()

client_id     133442
path          133442
sentence      133442
up_votes      133442
down_votes    133442
age           107810
gender        108317
accent        102070
locale        133442
bucket        133439
dtype: int64

In [27]:
# How many "de" clips fall into each bucket (rows with a missing bucket are not counted).
df_de['bucket'].value_counts()

test     52584
dev      46981
train    33874
Name: bucket, dtype: int64

In [37]:
# A clip is kept when it has more up-votes than down-votes and at least two
# votes in total.
valid = df_de.loc[
    lambda clips: (clips["up_votes"] > clips["down_votes"])
    & (clips["up_votes"] + clips["down_votes"] > 1),
    :,
]

In [38]:
# Number of non-null values in each column after the vote filter.
valid.count()

client_id     127997
path          127997
sentence      127997
up_votes      127997
down_votes    127997
age           103921
gender        104377
accent         98475
locale        127997
bucket        127994
dtype: int64

In [101]:
# Summary statistics of how often each distinct sentence occurs among the valid clips.
valid['sentence'].value_counts().to_frame().reset_index().describe()

Unnamed: 0,sentence
count,7412.0
mean,17.268888
std,10.792076
min,1.0
25%,16.0
50%,18.0
75%,19.0
max,80.0


In [70]:
# Count the valid clips contributed by each speaker (client_id).
speaker_counts = (
    valid["client_id"]
    .value_counts()
    .to_frame()
    .reset_index()
)
speaker_counts.columns = ["client_id", "user_sentence_count"]

# Attach each speaker's clip count to their rows, put the speakers with the
# most clips first, and group the rows by sentence.
# NOTE: `valid_joined` ends up as a GroupBy object, not a DataFrame.
valid_joined = (
    valid.join(speaker_counts.set_index("client_id"), on="client_id")
    .sort_values(["user_sentence_count", "client_id"], ascending=False)
    .groupby("sentence")
)

In [89]:
# Take the first two rows of every sentence group, then count how many rows each sentence kept.
valid_joined.head(2)["sentence"].value_counts().to_frame().reset_index()

Unnamed: 0,index,sentence
0,"Das Produkt ist nach Gewicht abgepackt, nicht ...",2
1,Wie weit ist es von Neumünster bis nach Hamburg?,2
2,Noch genießt er sein Vertrauen.,2
3,"Falls nicht, dann sei Ihnen ein Ausflug zum Ba...",2
4,Haben Sie es schon mit Nachhilfe probiert?,2
5,Charakteristisch für die Passstraße sind die v...,2
6,Die Autokorrektur ist keine große Hilfe.,2
7,Mach dich nicht verrückt.,2
8,Mein Waffeleisen produziert Waffeln in Herzche...,2
9,"Was diesen Aspekt betrifft, solltest du deinen...",2


In [102]:
def sample_size(population_size, margin_of_error=0.01, fraction_picking=0.50, z_score=2.58):
    """Calculates the sample size.

    Calculates the sample size required to draw from a population of size
    `population_size`. With the default arguments this corresponds to a
    confidence level of 99% and a margin of error of 1%.

    Args:
      population_size (int): The population size to draw from. Must be positive.
      margin_of_error (float): Accepted margin of error as a fraction
        (0.01 means 1%). Must be positive.
      fraction_picking (float): Assumed proportion used in the variance term
        `p * (1 - p)`; 0.5 gives the largest sample size.
      z_score (float): z-score of the desired confidence level
        (2.58 corresponds to a confidence level of 99%).

    Returns:
      float: The required sample size. The value is not rounded; callers
      decide how to convert it to a whole number.

    Raises:
      ValueError: If `population_size` or `margin_of_error` is not positive.
    """
    if population_size <= 0:
        raise ValueError("population_size must be positive, got %r" % (population_size,))
    if margin_of_error <= 0:
        raise ValueError("margin_of_error must be positive, got %r" % (margin_of_error,))

    # Shared term of numerator and denominator; evaluated once, in the same
    # order as before, so the default results stay bit-identical.
    spread = z_score**2 * fraction_picking * (1 - fraction_picking)
    error_squared = margin_of_error**2

    numerator = spread / error_squared
    denominator = 1 + spread / (error_squared * population_size)
    return numerator / denominator

def calculate_data_set_sizes(total_size):
    """Splits `total_size` items into train, dev and test set sizes.

    Finds the largest training set size for which two additional sets of
    `int(sample_size(train_size))` items each (dev and test) still fit into
    `total_size`.

    Args:
      total_size (int): Total number of items available.

    Returns:
      (int, int, int): `(train_size, dev_size, test_size)`; dev and test
      always have the same size. `(0, 0, 0)` when `total_size` is 0.

    Raises:
      ValueError: If `total_size` is negative.
    """
    if total_size < 0:
        raise ValueError("total_size must not be negative, got %r" % (total_size,))

    # Walk down from the largest candidate; the first one that leaves room for
    # both held-out sets is the maximum training size.
    for train_size in range(total_size, 0, -1):
        held_out_size = int(sample_size(train_size))
        if train_size + 2 * held_out_size <= total_size:
            return train_size, held_out_size, held_out_size

    # Only reached when the loop body never ran, i.e. total_size == 0. The
    # original fell through to `return` with unbound locals in this case.
    return 0, 0, 0

In [103]:
# len() of the GroupBy object is its number of groups, i.e. the number of distinct sentences.
calculate_data_set_sizes(len(valid_joined))

(2726, 2342, 2342)

In [118]:
from urllib.parse import unquote
from html.parser import HTMLParser

class HTMLStripper(HTMLParser):
    """Class that strips HTML from strings.

    Collects every piece of text passed to `handle_data` and discards the
    markup around it.

    Examples:
        >>> stripper = HTMLStripper()
        >>> stripper.feed(html)
        >>> stripper.close()
        >>> nohtml = stripper.get_data()
    """
    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no extra reset() is
        # needed. With convert_charrefs=True, character references such as
        # "&amp;" are converted before the text reaches handle_data.
        super().__init__(convert_charrefs=True)
        self.fed = []  # text chunks in the order they were parsed

    def handle_data(self, d):
        """Stores one chunk of text found outside of tags."""
        self.fed.append(d)

    def get_data(self):
        """Returns all text collected so far as a single string."""
        return ''.join(self.fed)

    
def strip_tags(html):
    """Removes HTML tags from passed text.
    Args:
      html (str): String containing HTML
    Returns:
      (str): String with HTML removed
    """
    s = HTMLStripper()
    s.feed(html)
    # Flush the parser: with convert_charrefs enabled, trailing text holding an
    # unterminated "&" (e.g. "AT&T") stays buffered until close() is called and
    # would otherwise be missing from the result.
    s.close()
    return s.get_data()

def cleanup(sentence):
    """Decodes percent-escapes in a sentence, then removes HTML markup.
    Args:
      sentence (str): Raw sentence text
    Returns:
      (str): Unquoted sentence with HTML removed
    """
    return strip_tags(unquote(sentence))

In [137]:
# Clean every valid sentence, keep each distinct result once, and return a
# one-column frame ("sentence") with a fresh index, sorted alphabetically.
cleanedup = (
    valid["sentence"]
    .apply(cleanup)
    .drop_duplicates()
    .to_frame()
    .reset_index(drop=True)
    .sort_values("sentence", ascending=True)
)

In [140]:
# Write the cleaned sentences as a tab-separated file without the index column.
# NOTE(review): to_csv uses its default quoting here, so sentences containing
# double quotes are wrapped and their quotes doubled, whereas the input was read
# with csv.QUOTE_NONE — confirm downstream readers expect that.
filename = "de_sentences.tsv"
cleanedup.to_csv(filename, sep='\t', index=False)

Unnamed: 0,sentence
2378,"""""""Ah!"""", stöhnte er und schlug auf das lärmen..."
3763,"""""""Allerdings"""", erklärte er mit bedeutungsvol..."
3576,"""""""Beim Wrestling ist alles nur Show"""", behaup..."
2309,"""""""Bemerkenswert"""", staunte Kurt."""
3606,"""""""Benimm dich anständig!"""", mahnte Christiane..."
2051,"""""""Birnen habe ich leider gerade nicht da"""", b..."
711,"""""""Brauche ich heute einen Regenschirm?"""", fra..."
2768,"""""""Brauchen Sie Starthilfe?"""", bot Fritz hilfs..."
3709,"""""""Christine hat ein viel größeres Planschbeck..."
3473,"""""""Da muss ich mich vertan haben"""", grübelte K..."
