In [1]:
# Turn off the Jedi-based completer for IPython tab completion in this kernel.
%config Completer.use_jedi = False

In [2]:
import os
import pandas as pd
import sys

# Put the current working directory first on the module search path
# (presumably so the project-local modules imported below resolve; confirm).
sys.path.insert(0, ".")

In [3]:
from utils import load_json, save_json, BASE_DATA_DIR
from twitter_requests import TwitterApi

In [4]:
# Seed constant; no cell visible in this part of the notebook references it.
RANDOM_SEED = 42

# Load data

In [None]:
# Twitter client from twitter_requests, given the path of the timeline parameter file.
# Later cells use it for build_user_dataset, query_user_data_by_name and normalize_mentions.
twitter_api = TwitterApi(timeline_params_path="timeline_params.json")

In [None]:
# Candidate table read from CSV under BASE_DATA_DIR; later cells read its
# full_name and twitter_name columns.
candidates_df = pd.read_csv(os.path.join(BASE_DATA_DIR, "twitter_data/candidates_20210604.csv"))

In [None]:
# Folder for the raw tweet files: passed as data_dir to build_user_dataset
# and later globbed for *.json files.
tweet_folder_path = os.path.join(BASE_DATA_DIR, "twitter_data/raw_tweets")

In [None]:
# Call build_user_dataset once per candidate row, printing the candidate's
# full name before the call and a blank separator line after it.
for candidate in candidates_df.itertuples(index=False):
    print(candidate.full_name)
    twitter_api.build_user_dataset(candidate.twitter_name, data_dir=tweet_folder_path)
    print()

# Create dataset

In [None]:
from glob import glob

In [None]:
# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the files are loaded in the same order on every run.
twitter_files = sorted(glob(f"{tweet_folder_path}/*.json"))

In [None]:
# Flatten the contents of every JSON file into one list, keeping file order.
twitter_data = [
    record
    for json_path in twitter_files
    for record in load_json(json_path)
]

In [None]:
# Build the tweet table from the flattened list
# (assumes each loaded item is a dict-like record; TODO confirm).
twitter_df = pd.DataFrame(twitter_data)

In [None]:
# Store author ids as strings so they compare equal to the string ids
# attached to candidates_df before the merge.
twitter_df["author_id"] = twitter_df["author_id"].map(str)

In [None]:
# Look up every candidate's user data by Twitter name (one
# query_user_data_by_name call per row) and keep its "id" as a string.
candidates_df["author_id"] = [
    str(twitter_api.query_user_data_by_name(handle)["id"])
    for handle in candidates_df.twitter_name
]

In [None]:
# Inner join: keep only tweets whose author_id matches a candidate row,
# and attach that candidate's columns to each tweet.
twitter_df = twitter_df.merge(candidates_df, how="inner", on="author_id")

# Clean tweets

In [None]:
from tweet_preprocessing import normalize_mentions, normalize_tokens 

## Normalize user names

In [None]:
# JSON file path handed to normalize_mentions as its third argument.
user_name_mapping_path = os.path.join(BASE_DATA_DIR, "twitter_data/user_name_mapping.json")

In [None]:
# NOTE(review): the return value is discarded, so this presumably mutates
# twitter_df in place and creates the `cleaned_text` column that the token
# normalisation cell reads; confirm against tweet_preprocessing.
normalize_mentions(twitter_df, twitter_api, user_name_mapping_path)

## Normalize tokens

In [None]:
# Token -> replacement table passed to normalize_tokens.
# NOTE(review): the last key is written as a regular expression, so
# normalize_tokens presumably applies the keys as patterns; confirm.
token_mapping = {
    # Abbreviation expanded to the full word.
    "BuReg": "Bundesregierung",
    # Flag emoji replaced by the country name.
    "🇩🇪": "Deutschland",
    # HTML-escaped ampersand replaced by "und".
    "&amp;": "und",
    # Hash signs are removed (replacement is the empty string).
    "#": "",
    # Case-insensitive ((?i)) pattern containing http(s)://, www. and domain/ alternatives,
    # i.e. it looks like a URL matcher; matches are replaced by the empty string.
    r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))": ""
}

In [None]:
# Run normalize_tokens with token_mapping over every cleaned_text value
# and write the result back to the same column.
twitter_df["cleaned_text"] = twitter_df["cleaned_text"].map(
    lambda text: normalize_tokens(text, token_mapping)
)

# Store data

In [None]:
# Persist the processed frame as a pickle under BASE_DATA_DIR; the next cell reads the same path.
twitter_df.to_pickle(os.path.join(BASE_DATA_DIR, "twitter_data/processed_datasets/twitter_df_20210604.pkl"))

In [5]:
# Reload the processed frame from the pickle written above, replacing twitter_df.
# Unpickling can execute arbitrary code, so only load files produced by this project.
# NOTE(review): the recorded execution count shows this cell was run on its own,
# without the preceding cells; the notebook has not been run top to bottom.
twitter_df = pd.read_pickle(os.path.join(BASE_DATA_DIR, "twitter_data/processed_datasets/twitter_df_20210604.pkl"))

# Upload to Elastic Cloud

In [None]:
# The original statement named no symbol after `import` (a SyntaxError).
# ElasticsearchDocumentStoreFixed is the only name the notebook uses without
# importing it, so it is presumably what this module provides; confirm.
from haystack_wrappers.haystack_elasticsearch_fix import ElasticsearchDocumentStoreFixed

In [5]:
import csv
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from elasticsearch import Elasticsearch

06/04/2021 12:23:01 - INFO - faiss.loader -   Loading faiss with AVX2 support.
06/04/2021 12:23:01 - INFO - faiss.loader -   Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'")
06/04/2021 12:23:01 - INFO - faiss.loader -   Loading faiss.
06/04/2021 12:23:01 - INFO - faiss.loader -   Successfully loaded faiss.
06/04/2021 12:23:08 - INFO - farm.modeling.prediction_head -   Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [6]:
# Elastic Cloud connection settings. Each value can be overridden through an
# environment variable (ES_HOST, ES_PORT, ES_CREDENTIALS_PATH) so the notebook
# is not tied to one cluster or one credentials export; the defaults are the
# values that were previously hard-coded here.
host = os.environ.get("ES_HOST", 'https://a5ca101c4ed9428faea3b97e4e56bf91.us-west1.gcp.cloud.es.io')
port = int(os.environ.get("ES_PORT", 9243))  # env values are strings; keep port an int
credentials_path = os.environ.get("ES_CREDENTIALS_PATH", "credentials-65d862-2021-May-07--21_21_57.csv")

In [7]:
# Read the Elasticsearch username/password from the CSV at `credentials_path`.
# Header names and values are whitespace-stripped, so the lookup works whether
# the file's header is "password", " password" or "password ", and the password
# is cleaned the same way as the username. If the file has several rows, the
# last one wins (unchanged from the original behaviour).
es_username = None
es_password = None

# newline="" is the csv module's recommended way to open CSV files.
with open(credentials_path, "r", newline="") as f:
    reader = csv.DictReader(f)
    for row in reader:
        # Skip DictReader's None key (surplus columns) and None values (short rows).
        cleaned_row = {
            key.strip(): value.strip()
            for key, value in row.items()
            if key is not None and value is not None
        }
        es_username = cleaned_row["username"]
        es_password = cleaned_row["password"]

# Fail here with a clear message instead of a NameError in a later cell.
if es_username is None or es_password is None:
    raise ValueError(f"No credential rows found in {credentials_path!r}")

In [10]:
# Low-level Elasticsearch client for the Elastic Cloud cluster.
# The scheme is "https" to agree with the https:// host URL, the TLS port and
# the document store created below; the original passed "http", which
# contradicted all three while sending basic-auth credentials.
client = Elasticsearch(
    hosts=[host],
    port=port,
    http_auth=(es_username, es_password),
    scheme="https",
)

In [None]:
# Fixed typo: `ElsticSearch` is not defined anywhere; the imported class is `Elasticsearch`.
# NOTE(review): this only aliases the class and nothing visible uses `es`;
# the cell looks like an unfinished draft and may be safe to delete.
es = Elasticsearch

In [None]:
# Document store pointing at the same cluster and credentials as `client`.
# NOTE(review): `host` already carries an "https://" prefix while `scheme` is
# passed separately; confirm the wrapper accepts a full URL as host.
document_store = ElasticsearchDocumentStoreFixed(
    host=host,
    port=port,
    username=es_username,
    password=es_password,
    scheme="https",
)