In [1]:
# Turn off Jedi-based tab completion in the IPython completer.
%config Completer.use_jedi = False

In [2]:
import os
import pandas as pd
import sys

# Put the current working directory first on the module search path;
# presumably so the project-local modules imported in the next cell
# (`utils`, `twitter_requests`) resolve — confirm.
sys.path.insert(0, ".")

In [3]:
from utils import load_json, save_json, BASE_DATA_DIR
from twitter_requests import TwitterApi

In [4]:
# Seed constant.
# NOTE(review): no cell visible in this notebook reads it — confirm it is still needed.
RANDOM_SEED = 42

# Load data

In [5]:
# API client shared by the later cells (build_user_dataset,
# query_user_data_by_name, and as an argument to normalize_mentions).
# NOTE(review): presumably reads its request parameters from
# timeline_params.json, resolved relative to the working directory — confirm
# in twitter_requests.
twitter_api = TwitterApi(timeline_params_path="timeline_params.json")

In [6]:
# Candidate table; later cells read its `full_name` and `twitter_name` columns.
candidates_df = pd.read_csv(
    os.path.join(BASE_DATA_DIR, "twitter_data/candidates_20210721.csv")
)

In [7]:
# One-off maintenance snippet, intentionally left commented out: it appends a
# single `candidate` record to the candidates table and writes the table back
# to the same CSV file it was loaded from.
# NOTE(review): `candidate` is not defined in any visible cell — it has to be
# created by hand before these lines are uncommented.
# candidates_df = pd.concat([candidates_df, pd.DataFrame([candidate])], axis=0).reset_index(drop=True)
# candidates_df.to_csv(os.path.join(BASE_DATA_DIR, "twitter_data/candidates_20210721.csv"), index=False)

In [8]:
# Folder for the raw tweet JSON files: passed to build_user_dataset as
# `data_dir` and later globbed for *.json when the dataset is assembled.
tweet_folder_path = os.path.join(BASE_DATA_DIR, "twitter_data/raw_tweets")

In [9]:
# Query and store the tweets of every candidate. The candidate's name is
# printed first so the status line that follows (presumably emitted by
# build_user_dataset) can be attributed; the empty print separates candidates.
for full_name, twitter_name in zip(candidates_df.full_name, candidates_df.twitter_name):
    print(full_name)
    twitter_api.build_user_dataset(twitter_name, data_dir=tweet_folder_path)
    print()

Alice Weidel
0 tweets queried and stored.

Tino Chrupalla
0 tweets queried and stored.

Armin Laschet
0 tweets queried and stored.

Markus Söder
0 tweets queried and stored.

Christian Lindner
0 tweets queried and stored.

Nicola Beer
0 tweets queried and stored.

Katja Suding
0 tweets queried and stored.

Michael Kellner
5 tweets queried and stored.

Ricarda Lang
13 tweets queried and stored.

Annalena Baerbock
6 tweets queried and stored.

Janine Wissler
1 tweets queried and stored.

Susanne Hennig-Wellsow
1 tweets queried and stored.

Amira Mohamed Ali
2 tweets queried and stored.

Saskia Esken
37 tweets queried and stored.

Norbert Walter-Borjans
9 tweets queried and stored.

Olaf Scholz
5 tweets queried and stored.

Julia Klöckner
5 tweets queried and stored.

Friedrich Merz
6 tweets queried and stored.



# Create dataset

In [10]:
from glob import glob

In [11]:
# Collect every raw tweet file. glob() returns matches in arbitrary,
# filesystem-dependent order, so the list is sorted to keep the row order of
# the assembled dataset reproducible across runs and machines.
twitter_files = sorted(glob(f"{tweet_folder_path}/*.json"))

In [12]:
# Concatenate the contents of every JSON file into one flat list
# (each file is expected to hold a sequence of tweet records).
twitter_data = [
    tweet
    for filepath in twitter_files
    for tweet in load_json(filepath)
]

In [13]:
# Tabular view of the loaded tweet records; later cells rely on at least the
# `author_id` and `text` columns being present.
twitter_df = pd.DataFrame(twitter_data)

In [14]:
# Cast author ids to strings so they compare equal to the string ids that are
# attached to candidates_df before the two frames are merged.
twitter_df["author_id"] = twitter_df["author_id"].map(str)

In [15]:
# Resolve each candidate's Twitter handle to its user id through the API (one
# lookup per row) and keep it as a string, matching twitter_df.author_id.
candidates_df["author_id"] = candidates_df["twitter_name"].map(
    lambda handle: str(twitter_api.query_user_data_by_name(handle)["id"])
)

In [16]:
# Attach the candidate columns to every tweet. The inner join keeps only
# tweets whose author_id also appears in candidates_df.
twitter_df = pd.merge(twitter_df, candidates_df, on="author_id", how="inner")

# Clean tweets

In [17]:
from tweet_preprocessing import normalize_mentions, normalize_tokens 

## Normalize user names

In [18]:
# JSON file with the user-name mapping; it is loaded in a later cell and its
# path is also handed to normalize_mentions.
user_name_mapping_path = os.path.join(BASE_DATA_DIR, "twitter_data/user_name_mapping.json")

In [19]:
import json

In [20]:
# Load the stored user-name mapping from disk.
# NOTE(review): `mapping` is not read by any later visible cell
# (normalize_mentions receives the file path instead) — confirm this cell is
# still needed.
with open(user_name_mapping_path, "r") as f:
    mapping = json.load(f)

In [21]:
# Normalize the user mentions in the tweets.
# NOTE(review): the return value is discarded, so this presumably modifies
# twitter_df in place and creates the `cleaned_text` column that the token
# normalization cell reads — confirm in tweet_preprocessing.
# NOTE(review): it presumably uses twitter_api to look up unknown mentions and
# the mapping file as a cache — confirm.
normalize_mentions(twitter_df, twitter_api, user_name_mapping_path)

Error with mention 'dna4287'.


## Normalize tokens

In [22]:
# Case-insensitive URL pattern, assembled from three consecutive pieces:
#   1. the start of a URL: "http://" / "https://", "www" (optionally followed
#      by up to three digits) plus a dot, or a bare domain with a 2-4 letter
#      TLD followed by "/";
#   2. the body, which may contain (nested) balanced parentheses;
#   3. a final character that is not whitespace or trailing punctuation, or a
#      closing balanced-parentheses group.
URL_PATTERN = (
    r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)"
    r"(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+"
    r"(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
)

# Replacement rules for the cleaned tweet text (pattern -> replacement).
# NOTE(review): presumably applied by normalize_tokens as regex substitutions
# in insertion order — confirm in tweet_preprocessing.
token_mapping = {
    "#": "",                       # drop the hashtag sign, keep the word
    "BuReg": "Bundesregierung",    # expand the abbreviation
    "🇩🇪": "Deutschland",            # spell out the flag emoji
    "&amp;": "und",                # HTML-escaped ampersand -> German "und"
    URL_PATTERN: "",               # strip URLs entirely
}

In [23]:
# Apply the token replacement rules to every entry of the cleaned text column.
twitter_df["cleaned_text"] = twitter_df["cleaned_text"].map(
    lambda tweet_text: normalize_tokens(tweet_text, token_mapping)
)

In [24]:
# HTML entities to decode in the original tweet text. Tweet text delivered by
# the Twitter API HTML-escapes "&", "<" and ">", so all three are handled
# (previously only "&amp;" was decoded and "&lt;"/"&gt;" stayed in the data).
# "&amp;" is listed last so that a literal "&amp;lt;" decodes to "&lt;" and is
# not decoded twice — this assumes normalize_tokens applies the rules in
# insertion order (confirm in tweet_preprocessing).
html_artefact_mapping = {
    "&lt;": "<",
    "&gt;": ">",
    "&amp;": "&",
}

In [25]:
# Decode HTML artefacts in the original tweet text using html_artefact_mapping.
twitter_df["text"] = twitter_df["text"].map(
    lambda tweet_text: normalize_tokens(tweet_text, html_artefact_mapping)
)

# Normalize party

In [26]:
# Merge the labels "CDU" and "CSU" into the joint label "CDU/CSU";
# every other party value is passed through unchanged.
twitter_df["party"] = twitter_df["party"].map(
    lambda party: "CDU/CSU" if party in ("CDU", "CSU") else party
)

# Store data

In [27]:
from datetime import datetime

In [28]:
# Date-stamp the output file (YYYYMMDD, local time) so that runs on different
# days write to different files.
current_date = f"{datetime.now():%Y%m%d}"
file_name = "twitter_df_" + current_date + ".pkl"

In [29]:
# Persist the processed dataset as a pickle. The target folder is created
# first so that a fresh checkout (where processed_datasets/ does not exist
# yet) does not fail at the very last step, after all tweets have already
# been fetched and cleaned.
processed_dir = os.path.join(BASE_DATA_DIR, "twitter_data/processed_datasets")
os.makedirs(processed_dir, exist_ok=True)
twitter_df.to_pickle(os.path.join(processed_dir, file_name))

In [None]:
# Convenience snippet, intentionally left commented out: reloads the pickle
# stored by the previous cell (relies on `file_name` from the current session).
# NOTE: only unpickle files you created yourself — loading a pickle can
# execute arbitrary code.
# twitter_df = pd.read_pickle(os.path.join(BASE_DATA_DIR, f"twitter_data/processed_datasets/{file_name}"))