In [61]:
import os

import numpy as np
import pandas as pd

In [62]:
# Dataset 1: read the normalized refugee-tweets CSV.
tweets_csv = r"data/normalized_tweets_refugees.csv"
set_1 = pd.read_csv(tweets_csv)

In [63]:
# Dataset 2: read the normalized Facebook-comments CSV.
# The builtin `bool` is used for the dtype: `np.bool` was only an alias for it,
# deprecated in NumPy 1.20 and removed in NumPy 1.24 (AttributeError there).
# NOTE(review): the column contains a missing value (see the NaN row inspected
# further down); depending on the pandas version a strict bool dtype on a column
# with missing values may not be applied or may raise -- confirm on upgrade.
set_2 = pd.read_csv(r"data/normalized_facebook_comments.csv", dtype={'binary_label': bool})

In [64]:
# Dataset 3: read data/polly_hatespeech.csv.
polly_csv = r"data/polly_hatespeech.csv"
set_3 = pd.read_csv(polly_csv)

In [65]:
# Print column dtypes and non-null counts of set_1.
set_1.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 469 entries, 0 to 468
Data columns (total 3 columns):
raw_text        469 non-null object
severity        469 non-null float64
binary_label    469 non-null bool
dtypes: bool(1), float64(1), object(1)
memory usage: 7.9+ KB


In [66]:
# Print column dtypes and non-null counts of set_2 (checks whether the label column is a real bool).
set_2.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5837 entries, 0 to 5836
Data columns (total 3 columns):
raw_text        5837 non-null object
severity        5836 non-null float64
binary_label    5836 non-null object
dtypes: float64(1), object(2)
memory usage: 136.9+ KB


In [67]:
# Print column dtypes and non-null counts of set_3.
set_3.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18769 entries, 0 to 18768
Data columns (total 3 columns):
raw_text        18769 non-null object
severity        18769 non-null int64
binary_label    18769 non-null bool
dtypes: bool(1), int64(1), object(1)
memory usage: 311.7+ KB


In [68]:
# List the distinct label values to see why the column is not a plain bool.
set_2['binary_label'].unique()

array([True, False, nan], dtype=object)

In [69]:
# Show the rows of set_2 whose label is missing.
label_missing = set_2['binary_label'].isna()
set_2.loc[label_missing]

Unnamed: 0,raw_text,severity,binary_label
2596,Das macht Angst....,,


In [70]:
# Drop the rows without a label, then cast the label column to a real boolean
# dtype so it matches the other two datasets.
# The builtin `bool` replaces `np.bool`, which was deprecated in NumPy 1.20 and
# removed in NumPy 1.24.
set_2 = set_2[set_2['binary_label'].notna()]
set_2 = set_2.astype({'binary_label': bool})
set_2.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5836 entries, 0 to 5836
Data columns (total 3 columns):
raw_text        5836 non-null object
severity        5836 non-null float64
binary_label    5836 non-null bool
dtypes: bool(1), float64(1), object(1)
memory usage: 142.5+ KB


# Combining datasets
Since the datasets were normalized to the same columns beforehand, we can simply concatenate them.

In [71]:
# Stack the three normalized datasets on top of each other (original row labels are kept).
frames = [set_1, set_2, set_3]
df = pd.concat(frames)
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25074 entries, 0 to 18768
Data columns (total 3 columns):
raw_text        25074 non-null object
severity        25074 non-null float64
binary_label    25074 non-null bool
dtypes: bool(1), float64(1), object(1)
memory usage: 612.2+ KB


In [72]:
# Report the overall size and the per-class row counts of the combined frame.
total_rows = len(df)
hate_rows = len(df[df['binary_label'] == True])
other_rows = len(df[df['binary_label'] == False])
print("There are {} rows, {} are hate speech, {} not.".format(total_rows, hate_rows, other_rows))

There are 25074 rows, 19597 are hate speech, 5477 not.


seems about right

In [73]:
# Preview up to 50 rows labelled as hate speech.
labelled_hate = df['binary_label'] == True
df.loc[labelled_hate].head(50)

Unnamed: 0,raw_text,severity,binary_label
0,#Rosenmontag ist abgesagt. #Rapefugees also wi...,0.5,True
3,Den verfluchten #Rapefugees den Krieg erklären...,0.5,True
4,War das Wochenende im Ruhrpott unterwegs. Über...,1.0,True
9,"Offenes, freies WLAN im #Sophienhof #Kiel. Mit...",0.5,True
11,Regeln und Gesetze der Einheimischen sind ein ...,0.5,True
12,"#Blutrache, #Zwangsbekehrung, #Scharia, #Kinde...",0.5,True
14,Claudia #Roth: Wer Sex-Attacken von Asylanten ...,0.5,True
16,Die Unterbringung im Gefängnis nur weil man Ve...,0.5,True
24,"#illner. erst hieß es, es kämen nur top Arbeit...",1.0,True
30,Pöbelnde #Asylanten zeigen Kopf-ab-Gesten und ...,0.5,True


# Downsample the hate samples in order to have a better balanced training set

In [74]:
# Split the combined frame into one frame per class and print both sizes.
label_true = df['binary_label'] == True
label_false = df['binary_label'] == False
hate_df = df[label_true]
no_hate_df = df[label_false]
print("hate: {} / no hate: {}".format(len(hate_df), len(no_hate_df)))

hate: 19597 / no hate: 5477


In [76]:
# Downsample the hate class to the size of the no-hate class.
# - Rows are drawn WITHOUT replacement: the previous np.random.choice call used
#   its default replace=True, so the same row could be picked several times and
#   the "balanced" set contained duplicates.
# - A fixed random_state makes the exported training set reproducible.
# - min() keeps the cell from failing if the hate class is not the larger one
#   (in that case nothing is cut off).
n_samples = min(len(hate_df), len(no_hate_df))
hate_df = hate_df.sample(n=n_samples, replace=False, random_state=42)
print(hate_df.count())
hate_df.head()

raw_text        5477
severity        5477
binary_label    5477
dtype: int64


Unnamed: 0,raw_text,severity,binary_label
6460,Das ist das Zeichen\r\ntränke dich tief\r\nhtt...,1.0,True
11246,": ""In seiner Verzweiflung dachte E. sogar dar...",1.0,True
9418,": ""Joana Cotar: ein flammendes Plädoyer gegen...",1.0,True
15069,: #München: Sozialarbeiter dürfen nicht mehr ...,1.0,True
13034,Die vergisst mal wieder Ross und Reiter zu ne...,1.0,True


In [77]:
# Build the balanced training frame: all no-hate rows followed by the sampled hate rows.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; with default arguments both keep the original row labels.
df = pd.concat([no_hate_df, hate_df])
print(df.count())
df.head()

raw_text        10954
severity        10954
binary_label    10954
dtype: int64


Unnamed: 0,raw_text,severity,binary_label
1,bitte nicht die #Türkei zum #EU-Mitglied mache...,0.0,False
2,Wieso bekommen #rapefugees mehr als unsere Har...,0.0,False
5,#Asylanteninvasion Wenn es auf unseren Straßen...,0.0,False
6,745 Millionen Menschen leben in #Europa. Ca. 4...,0.0,False
7,Tja die #SPD will unsere Steuergelder für Flüc...,0.0,False


# Export Training Set

In [78]:
# Write the balanced training set as CSV and as Excel (row labels are not exported).
# Create the target directory first so the cell also works on a fresh checkout.
os.makedirs('export', exist_ok=True)
df.to_csv('export/combined_iwg_fb-tw_polly.csv', index=False)
# NOTE(review): this Excel export emitted "Ignoring URL ... > 255 characters"
# warnings (see the cell output), i.e. texts that start with a URL are treated as
# hyperlinks by the Excel writer and may be missing from the .xlsx. Treat the CSV
# as the authoritative export, or disable URL conversion in the writer
# (xlsxwriter option `strings_to_urls`) -- TODO confirm for the installed versions.
df.to_excel('export/combined_iwg_fb-tw_polly.xlsx', index=False)

%20Jeder%20kann%20erahnen,%20was%20das%20in%20Kürze%20werden%20wird!
%20Und%20wir%20haben%20tausend%20mit%20%22dicken%20Eiern%22%20hier.%20Es%20wird%20hoch%20an%20der%20Zeit,%20daß%20die%20%22Refutschies%20Wellkam%22-Genderweiber%20nicht%20nur%20Matratzen%20und%20Feldbetten%20spenden,%20sondern%20sich%20gleich%20drauflegen...' with link or location/anchor > 255 characters since it exceeds Excel's limit for URLS
  force_unicode(url))
Sauerborn%20verlässt%20nach%2036%20Jahren%20die%20Grünen%20/%20Als%20Gründe%20nennt%20er%20„Kritik%20nicht%20nur%20an%20der%20Flüchtlingspolitik,%20die%20Grüne%20moralische%20Überheblichkeit%20und%20unerträgliche%20Intoleranz“' with link or location/anchor > 255 characters since it exceeds Excel's limit for URLS
  force_unicode(url))
  force_unicode(url))
  force_unicode(url))
;)
Die%20Diktatur%20in%20Berlin%20hat%20jetzt%20auch%20noch%20den%20Rest%20ihres%20Verstandes%20verloren%20und%20will%20die%20Vernichtung%20Deutschlands%20jetzt%20im%20schnellverfahre