In [1]:
# Importiere Bibliotheken
import os
import re
import numpy as np
import pandas as pd
from langdetect import detect

In [2]:
# Read the raw tweet export, drop the 'X3' column and preview the result.
# The preview must be the last expression of the cell, otherwise nothing is rendered.
data_raw = pd.read_csv("data_input/allTweets.csv", index_col=0).drop('X3', axis=1)
data_raw.head(5)

In [3]:
# Summarise column dtypes and non-null counts
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 107056 entries, 1 to 107056
Data columns (total 3 columns):
X1    107055 non-null object
X2    107055 non-null object
X4    107055 non-null object
dtypes: object(3)
memory usage: 3.3+ MB


In [4]:
# Name the columns
# The names are assigned positionally to the columns that remain after 'X3' was dropped.
data_raw.columns = ['User', 'Tweet', 'Datetime']
data_raw.head(5)

Unnamed: 0,User,Tweet,Datetime
1,@KueddeR,RT @Nebelspalter: Das gab es noch nie: Bundesk...,"May 23, 2015 at 06:05PM"
2,@silver_train,@diuuk Sicher nicht. Das #RTVG ist die falsche...,"May 23, 2015 at 06:12PM"
3,@peter_schibli,RT @rtvgja: Keine Billag-Schnüffler mehr? Ja b...,"May 23, 2015 at 06:25PM"
4,@peter_schibli,RT @rtvgja: Herr @F_Leutenegger : Distanzieren...,"May 23, 2015 at 06:25PM"
5,@bkuonen,"Jetzt wo ich weiss, dass CL, #superleague und ...","May 23, 2015 at 06:32PM"


In [5]:
# Inspect the data: show the ten rows at positions 500-509
data_raw.iloc[500:510]

Unnamed: 0,User,Tweet,Datetime
501,@valabg,„Warum sollte ich 1 Bahnfahrkarte kaufen? Der ...,"October 31, 2017 at 03:17PM"
502,@MikeBould3r,@NatalieRickli Arbeitet für eine TV-Werbeagent...,"October 31, 2017 at 03:17PM"
503,@GyzTuufel,RT @peterhettich: Kollege Prof. Mark Schelker ...,"October 31, 2017 at 03:17PM"
504,@PeerStonebridge,RT @NZZ: #SRG-Mitarbeiter werfen ihren Chefs V...,"October 31, 2017 at 03:21PM"
505,@karstenpater,RT @srgwatch: Die (Un-)Logik der Fernseh-Steue...,"October 31, 2017 at 03:27PM"
506,@LahorJakrlin,@ZeitRauber @bglaettli @gebizzle ... oder sie ...,"October 31, 2017 at 03:27PM"
507,@ThomasLaeubli,@ZeitRauber @VinzenzWyss @FelixSchneuwly @thia...,"October 31, 2017 at 03:27PM"
508,@bdrtschr,RT @nachdenkend: Ein nicht unbekannter Werber ...,"October 31, 2017 at 03:35PM"
509,@attila_gaspar,"RT @MarcBuergi: Liebe Freunde, Kolleginnen und...","October 31, 2017 at 03:38PM"
510,@Kiamara91,RT @srgwatch: Die (Un-)Logik der Fernseh-Steue...,"October 31, 2017 at 03:39PM"


In [6]:
# Determine the top users: the 20 most frequent values of 'User' (missing values are counted too)
data_raw.User.value_counts(dropna=False).head(20)

@sms2sms            1548
@MiniSchwiz         1487
@KarlMller13        1266
@RolfMll35367465    1185
@_macmike           1093
@morvjn              903
@HeinzLindenmann     894
@DailyTalk           807
@ProBillag           747
@tevau42             733
@person_tw           730
@LahorJakrlin        699
@scentedrebel        680
@bikejourno          633
@byebyebillag        619
@SwissLibArmy        617
@kindlimann          575
@Voegizug            558
@wahlch15            554
@NoSendeschluss      545
Name: User, dtype: int64

In [7]:
# Turn the data set into a time series and keep only tweets from 2017 and 2018
data_raw['Datetime'] = pd.to_datetime(data_raw['Datetime'])
data_raw = data_raw.set_index('Datetime')

# Select the period with an explicit boolean mask rather than the partial-string
# slice data_raw['2017':'2018']: the mask does not require a sorted index, and
# rows whose timestamp is NaT compare False and are dropped.
period_start = pd.Timestamp('2017-01-01')
period_end = pd.Timestamp('2019-01-01')  # exclusive upper bound -> all of 2018 is kept
in_period = (data_raw.index >= period_start) & (data_raw.index < period_end)

# .copy() detaches the selection from the full frame so that the following cells
# can add columns without triggering SettingWithCopyWarning.
data_raw = data_raw.loc[in_period].copy()

In [8]:
data_raw.head(5)

Unnamed: 0_level_0,User,Tweet
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-10-31 09:59:00,@kindlimann,Das #RTVG-Feedback sowas von ernstgenommen. #H...
2017-10-31 09:59:00,@urshafner1,RT @rolnam: ich will die nationalistische #SVP...
2017-10-31 09:59:00,@kindlimann,Das #RTVG-Feedback sowas von ernstgenommen. #H...
2017-10-31 10:04:00,@HansruediWidmer,@basilanderau Ah. Sorry. #nobillag-Befürworter...
2017-10-31 10:04:00,@sms2sms,@rico_caviezel @mikehabegger1 @SRF @srg_d (dum...


In [9]:
# A retweet starts with the marker "RT" followed by the quoted account's handle.
# Matching the full "RT @handle" prefix (instead of only the first two characters)
# keeps ordinary tweets that merely begin with those letters, e.g. "RTVG ...",
# from being classified as retweets.
_RETWEET_PREFIX = re.compile(r'RT\s+(@\w+)')


def check_for_retweets(x):
    """Return True when the tweet text starts with 'RT @handle'."""
    return _RETWEET_PREFIX.match(str(x)) is not None


def find_tweet_orig(x):
    """Return the retweeted handle (including the '@'), or NaN if x is not a retweet.

    The handle is taken from the regex capture group, so it is correct whether or
    not a colon follows it, and a tweet without a second word cannot raise IndexError.
    """
    prefix = _RETWEET_PREFIX.match(str(x))
    return prefix.group(1) if prefix else np.nan


def check_orig_tweet(x):
    """Return True for tweets that are neither retweets nor start with '@'."""
    return not check_for_retweets(x) and not str(x).startswith('@')


# Flag retweets
data_raw['Retweet'] = data_raw['Tweet'].apply(check_for_retweets)

# Determine the author of the original tweet
data_raw['RT_from'] = data_raw['Tweet'].apply(find_tweet_orig)

# Flag tweets that are neither replies nor retweets
data_raw['Orig_Tweet'] = data_raw['Tweet'].apply(check_orig_tweet)

# Determine all participants of the discussion (except the author of a retweeted tweet)
def find_tweet_participants(tweet):
    """Return the '@handle' mentions found in a tweet, in order of appearance.

    For a retweet (text starting with 'RT @handle') the first handle is the
    author of the retweeted tweet and is left out. The retweet test requires
    the full 'RT @handle' prefix, so a tweet that merely begins with the
    letters "RT" (e.g. "RTVG ...") keeps all of its mentions.

    Parameters
    ----------
    tweet : object
        Tweet text; any value is accepted and converted with ``str``.

    Returns
    -------
    list of str
        Handles including the leading '@'; empty when there are none.
    """
    text = str(tweet)
    handle_list = re.findall(r'@\w+', text)
    if handle_list and re.match(r'RT\s+@\w+', text):
        del handle_list[0]
    return handle_list
data_raw['Tweet_Participants'] = data_raw['Tweet'].apply(find_tweet_participants)

# Flag probable replies: tweets whose text begins with '@'
check_reply = lambda x: str(x).startswith('@')
data_raw['Reply'] = data_raw['Tweet'].apply(check_reply)

# Extract links from the tweets, then remove them from the tweet text.
# NOTE: the 'Tweet' column is overwritten here, so running this cell a second time
# finds no links any more and resets 'Link' to empty lists.
_LINK_PATTERN = re.compile(r"http\S+")


def find_links(x):
    """Return all links (tokens starting with 'http') in x; non-string values yield []."""
    return _LINK_PATTERN.findall(x) if isinstance(x, str) else []


def _strip_links(x):
    """Return x without its links; non-string values (e.g. NaN) are passed through."""
    return _LINK_PATTERN.sub('', x) if isinstance(x, str) else x


data_raw['Link'] = data_raw['Tweet'].apply(find_links)
data_raw['Tweet'] = data_raw['Tweet'].apply(_strip_links)

# Extract the plain text of a tweet
def clean_tweet(x):
    """Return the tweet with the word 'RT' and every token starting with '@' removed.

    The remaining whitespace-separated words are joined with single spaces.
    ``str.startswith`` replaces the former ``re.match('\\@', word)``, whose
    pattern was an invalid escape sequence in a non-raw string literal.
    """
    kept_words = [
        word for word in str(x).split()
        if word != 'RT' and not word.startswith('@')
    ]
    return ' '.join(kept_words)


data_raw['Text'] = data_raw['Tweet'].apply(clean_tweet)

# Extract the hashtags of a tweet
def find_tweet_tags(x):
    """Return every hashtag in x as a list of strings without the leading '#'."""
    return [hit.group(0)[1:] for hit in re.finditer(r'\#\w+', str(x))]


data_raw['Tags'] = data_raw['Tweet'].apply(find_tweet_tags)

In [None]:
# Detect the language of the tweets - caution: this takes a long time
def detect_language(text, tags):
    """Return what langdetect's ``detect`` reports for a tweet, or False on failure.

    Parameters
    ----------
    text : str
        Cleaned tweet text.
    tags : iterable of str
        Hashtags of the tweet (without '#'); they are appended to the text so
        that they contribute to the detection.

    Returns
    -------
    The value returned by ``detect``, or ``False`` when detection raised an error.

    NOTE(review): the langdetect README describes detection as non-deterministic
    unless ``DetectorFactory.seed`` is set - consider seeding it in the import cell.
    """
    try:
        # Join with a space: plain concatenation would fuse the last word of the
        # text with the first tag into one token.
        return detect(' '.join([text] + list(tags)))
    except Exception:
        # 'Exception' rather than a bare 'except' so that KeyboardInterrupt still
        # stops this long-running cell instead of being reported as a failure.
        print("Language cannot be recognised.")
        return False
# Run the detection row by row, pairing each tweet's cleaned text with its tags
data_raw['Lang'] = [
    detect_language(text, tags)
    for text, tags in zip(data_raw['Text'], data_raw['Tags'])
]

In [None]:
# Determine the most retweeted accounts: the 10 most frequent values of 'RT_from'
# (missing values, i.e. tweets that are not retweets, are counted too)
data_raw.RT_from.value_counts(dropna=False).head(10)

In [None]:
# Count the tweets per detected language (False marks tweets whose detection failed)
data_raw.Lang.value_counts(dropna=False)

In [None]:
# Cache the processed data on disk
# Create the target directory first so the cell also works on a fresh checkout.
os.makedirs('data_modified', exist_ok=True)
data_raw.to_pickle('data_modified/tweets.pkl')