In [1]:
import requests
from bs4 import BeautifulSoup
import logging
import random
import multiprocessing
import math
import pandas as pd
import os
import sys
from time import perf_counter
from logging import info, error, warning
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import wait
import time

logging.basicConfig(stream=sys.stdout, level=logging.INFO)

info(f"Core count: {multiprocessing.cpu_count()}.")

base_url = "https://translate.google.com/m"

INFO:root:Core count: 96.


In [8]:
user_agents = [
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36"
]


def get_random_useragent():
    """
    This function returns a random user agent from the user_agents list.
    @return: a user agent from the user_agents list.
    """
    index = random.randint(0, len(user_agents)-1)
    user_agent = user_agents[index]
    return user_agent


def gtranslate(text):
    """
    This function translates a text using the Google translate API.
    """
    user_agent = get_random_useragent()
    response = requests.get(base_url,
                            params={"q": text},
                            headers={"user-agent": user_agent})

    if response.status_code != 200:
        if response.status_code == 429:
            error(f"To many requests {response.status_code}.")
            info("Script encountered status 429: to requests. Waitin for 30 minutes to resume")
            time.sleep(1800)
            info("Script resumes after 30 minutes")
            return gtranslate(text)
        else:
            error(f"Received status: {response.status_code}.")
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    element = soup.find("div", {"class": "t0"})

    if not element:
        element = soup.find("div", {"class": "result-container"})

    translated = None
    if element:
        translated = element.get_text()
    return translated


def batch_gtranslate(texts):
    """
    This function runs gtranslate in batch mode.

    @param: texts: list of texts that need to be translated
    @return: list of translated texts.
    """
    translated_texts = []
    start = perf_counter()
    for text in texts:
        translated_text = gtranslate(text)
        translated_texts.append(translated_text)

    elapsed_time = perf_counter()-start
    info(f"batch_translate: text_count: {len(texts)}, took: {elapsed_time:.2f} seconds.")
    return translated_texts


def para_gtranslate(texts, min_batch_size=6):
    """
    This function batch translates texts in parallel.

    @param: texts: list of texts to translate
    @param: batch_size: batch size of texts processed per thread. Default is 6.
    """
    cpu_count = multiprocessing.cpu_count()
    thread_count = cpu_count*2
    info(f"Running parallel gtranslate with cpu count: {cpu_count}, and threads: {thread_count} available.")
    batch_size = min_batch_size
    if len(texts)/thread_count <= min_batch_size:
        # Number of threads is greater then number of texts to translate
        # Example 32 threads and 14 texts, then three threads will spawned.
        # Two threads will processes 6 texts and 1 will processes 2 texts.
        batch_count = math.ceil(len(texts)/batch_size)

    else:
        # Texts is greater then number threads.
        # Example: 256 texts and 32 threads, each thread will processes 
        # 8 texts.
        batch_size = math.ceil(len(texts)/thread_count)
        batch_count = thread_count

    info(f"Start translation with batch_count: {batch_count}, batch_size: {batch_size}.")
    result = []
    futures = []
    with ThreadPoolExecutor(batch_count) as executor:
        for i in range(0, batch_count):
            start = i*batch_size
            end = start+batch_size
            if end > len(texts):
                end = len(texts)
            batch = texts[start:end:]
            future = executor.submit(batch_gtranslate, batch)
            futures.append(future)

    wait(futures)
    for future in futures:
        translated_texts = future.result()
        for text in translated_texts:
            result.append(text)

    return result


def translate_column(df, column_name):
    """
    This function translates a hydrated data-set.

    @param: df: this is a pandas dataframe.
    @param: column_name: name of the column inside the dataframe that needs to be translated. 
    @return: dataframe containing a new column called 'processed_text'
    """
    texts = df[column_name].tolist()

    translated_texts = para_gtranslate(texts)

    se = pd.Series(translated_texts)

    df["processed_texts"] = se.values
    return df


def translate_dataset(dirpath, overwrite_cache=True):
    """
    This function translates a dataset of hydrated tweets.
    A dataset is considered a directory containing .csv files.
    Each files is read into a pandas dataframe and then written to 
    directory called 'processed'. The processed directory is a sibling 
    of the hydrated directory.

    This function caches results so if file hydrated/A.csv and processed/A.csv
    exists then A.csv is skipped. Moreover if a the file 'B.csv' exists in the 
    hydrated directory but not in the processed directory the file is translated 
    and written to the processed directory.

    @param: dirpath: The path to the dataset. The dataset must be a directory.
    @param: overwrite_cache: Whether caching is enabled. The default value is True
    """
    if not os.path.exists(dirpath) and not os.path.isdir(dirpath):
        raise FileNotFoundError("File 'dirpath' does not exist or is not a directory.")

    parentdir = os.path.dirname(dirpath)
    # create a path to directory that is a sibling of 'dirpath' variable.
    processeddir = os.path.join(parentdir, "processed")
    # create a directory called 'processed' for the processed dataset if it does not yet exist.
    if not os.path.exists(processeddir):
        os.mkdir(processeddir)
        info(f"Created directory: {processeddir}.")

    filenames = [file for file in os.listdir(dirpath) if file.endswith(".csv")]
    for file in filenames:
        filepath = os.path.join(dirpath, file)
        processed_filepath = os.path.join(processeddir, file)
        if not overwrite_cache and os.path.exists(processed_filepath):
            info(f"File already exists in: {processed_filepath}; skipping.")
            continue

        df = pd.read_csv(filepath,
                         index_col="id",
                         usecols=["id", "full_text", "created_at"],
                         dtype={"id": "int64"},
                         parse_dates=["created_at"])
        df = translate_column(df, column_name="full_text")
        df.to_csv(processed_filepath)
        info(f"Created new processed file in: {processed_filepath}.")

    return True


def read_hydrated_csv(filename):
    df = pd.read_csv(filename,
                     index_col="id",
                     usecols=["id", "full_text", "created_at"],
                     dtype={"id": "int64"},
                     parse_dates=["created_at"])
    return df

In [3]:
def test_gtranslate():
    text = "Liever te dik in de kist dan een feestje gemist."
    gtranslate(text)


def test_batch_gtranslate():
    texts = [
        "Liever te dik in de kist dan een feestje gemist.",
        "Hallo welt",
        "Buenos dias"
    ]
    batch_gtranslate(texts)


def test_para_gtranslate():
    texts = [
        "Liever te dik in de kist dan een feestje gemist.",
        "Hallo welt",
        "Buenos dias"
    ]
    result = para_gtranslate(texts, min_batch_size=2)
    print(result)

In [56]:
test_gtranslate()

ERROR:root:To many requests 429.
INFO:root:Script encountered status 429: to requests. Waitin for 30 minutes to resume


KeyboardInterrupt: 

In [77]:
test_para_gtranslate()

['Better to be too fat in the coffin than to miss a party.', 'Hello World', 'good morning']


In [51]:
def ratelimit_gtranslate(df,
                         rate_limit=20000,
                         lowerbound_wait=900,
                         upperbound_wait=2700):
    """
    Rate limited translate function. Translates a hydrated Dataframe with a
    certain rate limit in mind. Once the limit is reached the script
    waits for a certain amount of time before continueing; between
    lowerbound_wait and upperbound_wait.\n\n
    @param: df: Dataframe of a hydrated dataset; it is assumed it contains
                column: full_text\n.
    @param: rate_limit: the amount of rows that are processed before the wait
                        time kicks-in.
    @param: lowerbound_wait: minimum wait time in seconds;
                             default lowerbound_wait=900.
    @param: upperbound_wait: maximum wait time in seconds;
                             default lowerbound_wait=2700.
    @return: Dataframe containing a new column named 'processed_text'.
    """
    batch_count = math.ceil(len(df)/rate_limit)

    result_df = None

    request_count = 0

    for i in range(0, batch_count):
        start = i*rate_limit
        end = start+rate_limit
        if end > len(df):
            end = len(df)

        batch = df.iloc[start:end]
        texts = batch.full_text.values

        if result_df is None:
            translated_texts = para_gtranslate(texts)
            batch.loc[:, ["processed_text"]] = translated_texts
            result_df = batch
        else:
            translated_texts = para_gtranslate(texts)
            batch.loc[:, ["processed_text"]] = translated_texts
            result_df.append(batch)

        request_count = request_count+len(batch)

        if request_count >= rate_limit:
            wait_time = random.randint(lowerbound_wait, upperbound_wait)
            info(f"""Rate limit exceeded, current request count is
            {request_count}; Sleep for {wait_time} seconds.""")
            time.sleep(wait_time)
            info(f"Script resumes after {wait_time} seconds of sleep.")
            request_count = 0

    return result_df

In [55]:
dataset_path = os.path.join("data-sets",
                            "Lopez1")

filename = "output2020_03.csv"
inpath = os.path.join(dataset_path, "hydrated", filename)
outpath = os.path.join(dataset_path, "processed", filename)

df = read_hydrated_csv(inpath)

len(df)

32982

In [53]:
dataset_path = os.path.join("data-sets",
                            "Lopez1")

filename = "output2020_03.csv"
inpath = os.path.join(dataset_path, "hydrated", filename)
outpath = os.path.join(dataset_path, "processed", filename)

df = read_hydrated_csv(inpath)

df = ratelimit_gtranslate(df,
                          rate_limit=10000,
                          lowerbound_wait=1800,
                          upperbound_wait=2700)

df.to_csv(outpath)

INFO:root:Running parallel gtranslate with cpu count: 96, and threads: 192 available.
INFO:root:Start translation with batch_count: 192, batch_size: 53.
INFO:root:batch_translate: text_count: 0, took: 0.00 seconds.
INFO:root:batch_translate: text_count: 0, took: 0.00 seconds.
INFO:root:batch_translate: text_count: 0, took: 0.00 seconds.
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.




KeyboardInterrupt



In [46]:
dataset_path = os.path.join("data-sets",
                            "examples",
                            "hydrated",
                            "output2020_02_lg.csv")

rate_limit = 20000

df = read_hydrated_csv(dataset_path)
batch_count = math.ceil(len(df)/rate_limit)
batch_count


df.iloc[50:200]

Unnamed: 0_level_0,created_at,full_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1233148563468824576,2020-02-27 21:55:02+00:00,Kijk als wij dan toch een corona-geval in Nede...
1233360726816886784,2020-02-28 11:58:06+00:00,@rivm beste RIVM mooie berichten dat corona pa...
1233685087184674816,2020-02-29 09:27:00+00:00,Aangenomen dat dit waar is. Je gaat als land t...
1224241680116867072,2020-02-03 08:02:16+00:00,"""China wilde liever geheimhouding dan paniek d..."
1224275850411020288,2020-02-03 10:18:03+00:00,@RadioCitizenFM @ommydimpoz @sellyamutabi @mza...
...,...,...
1229647247408680960,2020-02-18 06:02:04+00:00,Oud-Kwagga en gesin in George uit China https:...
1229785382335721472,2020-02-18 15:10:58+00:00,😂 Het hindoeteken Ohm gebruiken als iets islam...
1230940516465799168,2020-02-21 19:41:03+00:00,Italië: Aantal besmettingen met coronavirus lo...
1230965415829962752,2020-02-21 21:20:00+00:00,Dus Ook Nederlanders komen gewoon terug naar h...


In [47]:
df = df.iloc[0:6]
len(df)

6

In [48]:
texts = ["a", "b", "c", "d", "e", "f"]
idx = ["1223399059123208192", "1223881392607715328"]

df.loc[:, ["blaat"]] = texts
df

Unnamed: 0_level_0,created_at,full_text,blaat
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1223399059123208192,2020-02-01 00:14:00+00:00,Wereldwijd groeien de zorgen om het #coronavir...,a
1223881392607715328,2020-02-02 08:10:37+00:00,Of de aantallen kloppen niet. Of dit filmpje i...,b
1223995832548188160,2020-02-02 15:45:21+00:00,Nederlanders uit Wuhan naar vliegbasis Eindhov...,c
1224068845381607424,2020-02-02 20:35:29+00:00,triest dat hen die vrijdden ook vaak een virus...,d
1224112960953516032,2020-02-02 23:30:47+00:00,Een Deep State laboratorium medewerker besmet ...,e
1224220383651516416,2020-02-03 06:37:39+00:00,@aguiarjuanma @todonoticias Todos los aviones ...,f


In [None]:
rate

In [28]:
filename = "output2020_03.csv"

input_path = os.path.join("data-sets",
                          "Lopez1",
                          "hydrated",
                          filename)
df = pd.read_csv(input_path,
                 index_col="id",
                 usecols=["id", "full_text", "created_at"],
                 dtype={"id": "int64"},
                 parse_dates=["created_at"])

df = translate_column(df, column_name="full_text")

output_path = os.path.join("data-sets",
                           "Lopez1",
                           "processed",
                           filename)

df.to_csv(output_path)

INFO:root:Running parallel gtranslate with cpu count: 96, and threads: 192 available.
INFO:root:Start translation with batch_count: 192, batch_size: 172.
ERROR:root:Received status: 500.
ERROR:root:Received status: 500.
ERROR:root:Received status: 500.
ERROR:root:Received status: 500.
ERROR:root:Received status: 500.
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 3441, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_1111/4031577681.py", line 13, in <module>
    df = translate_column(df, column_name="full_text")
  File "/tmp/ipykernel_1111/701262450.py", line 128, in translate_column
    translated_texts = para_gtranslate(texts)
  File "/tmp/ipykernel_1111/701262450.py", line 107, in para_gtranslate
    futures.append(future)
  File "/usr/lib/python3.8/concurrent/futur

TypeError: object of type 'NoneType' has no len()

In [3]:
translate_dataset("data-sets/Lopez1/hydrated", overwrite_cache=False)

INFO:root:Running parallel gtranslate with cpu count: 96, and threads: 192 available.
INFO:root:Start translation with batch_count: 192, batch_size: 844.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
INFO:root:Script encountered status 429: to requests. Waitin for 30 minutes to resume
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
INFO:root:Script encountered status 429: to requests. Waitin for 30 minutes to resume
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
INFO:root:Script encountered status 429: to requests. Waitin for 30 minutes to resume
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests

KeyboardInterrupt: 

INFO:root:Script resumes after 30 minutes
INFO:root:Script resumes after 30 minutes
INFO:root:Script resumes after 30 minutes
INFO:root:Script resumes after 30 minutes
INFO:root:Script resumes after 30 minutes
INFO:root:Script resumes after 30 minutes
INFO:root:Script resumes after 30 minutes
INFO:root:Script resumes after 30 minutes
INFO:root:Script resumes after 30 minutes
INFO:root:Script resumes after 30 minutes
INFO:root:Script resumes after 30 minutes
INFO:root:Script resumes after 30 minutes
INFO:root:Script resumes after 30 minutes
INFO:root:Script resumes after 30 minutes
INFO:root:Script resumes after 30 minutes
INFO:root:Script resumes after 30 minutes
INFO:root:Script resumes after 30 minutes
INFO:root:Script resumes after 30 minutes
INFO:root:Script resumes after 30 minutes
INFO:root:Script resumes after 30 minutes
INFO:root:Script resumes after 30 minutes
INFO:root:Script resumes after 30 minutes
INFO:root:Script resumes after 30 minutes
INFO:root:Script resumes after 30 

In [40]:
filepath = os.path.join("data-sets",
                        "examples",
                        "hydrated",
                        "output2020_02_sm.csv")
df = pd.read_csv(filepath)

print(len(df))

texts = df.full_text.tolist()


start = perf_counter()
translated_texts = para_gtranslate(texts)
elapsed_time = perf_counter()-start
print(f"Batch translation took: {elapsed_time:.2f} seconds.")
df["processed_text"] = pd.Series(translated_texts).values
processed_filepath = os.path.join("data-sets",
                                  "examples",
                                  "processed",
                                  "TEST__output2020_02_sm.csv")

df.to_csv(processed_filepath)

12
INFO:root:Running parallel gtranslate with cpu count: 96, and threads: 192 available.
INFO:root:Start translation with batch_count: 2, batch_size: 6.
INFO:root:batch_translate: text_count: 6, took: 0.74 seconds.
INFO:root:batch_translate: text_count: 6, took: 0.77 seconds.
Batch translation took: 0.77 seconds.


In [127]:
filepath = os.path.join("data-sets",
                        "examples",
                        "hydrated",
                        "output2020_02_lg.csv")
df = pd.read_csv(filepath)

texts = df.full_text[0:50].tolist()

start = perf_counter()
translated_texts = batch_gtranslate(texts)
elapsed_time = perf_counter()-start
print(f"Batch translation took: {elapsed_time:.2f} seconds.")

INFO:root:batch_translate: text_count: 50, took: 5.45 seconds.
Batch translation took: 5.46 seconds.


In [49]:
df = pd.read_csv("data-sets/examples/hydrated/output2020_02_sm.csv")



Unnamed: 0,created_at,id,id_str,full_text,source,truncated,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,...,matching_rules,current_user_retweet,scopes,withheld_copyright,withheld_in_countries,withheld_scope,geo,contributors,display_text_range,quoted_status_permalink
0,Sat Feb 01 00:14:00 +0000 2020,1223399059123208192,1223399059123208192,Wereldwijd groeien de zorgen om het #coronavir...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,...,,,,,,,,,"[0, 155]",


In [70]:
df = pd.read_csv("data-sets/examples/hydrated/output2020_02_sm.csv")
id_series = pd.to_numeric(df.id)
ft_series = df.full_text
ca_series = pd.to_datetime(df.created_at)

df = pd.DataFrame()
df["id"] = id_series
df["full_text"] = ft_series
df["created_at"] = ca_series
df = df.set_index("id")

df = df.sort_values("created_at")
df

for idx, row in df.iterrows():
    gtra

Wereldwijd groeien de zorgen om het #coronavirus. Ruim 11.000 mensen zijn inmiddels besmet in China, van wie er 258 zijn overleden. https://t.co/cO0BUXdi9z
Of de aantallen kloppen niet. Of dit filmpje is niet echt. Of er is iets megalomaans aan de gang... gewoon iéts.

#coronavirus https://t.co/Ed2ln2VQgI
Nederlanders uit Wuhan naar vliegbasis Eindhoven #Capelle https://t.co/fG2o3bL2y0
triest dat hen die vrijdden ook vaak een virus verspreiden ...eeh verspreden...Ik versprook me 
#spelenmettaal
Een Deep State laboratorium medewerker besmet met biologische wapen heeft een kip met hiv geneukt op de Wuhan dierenmarkt 👽👽👽
@aguiarjuanma @todonoticias Todos los aviones tienen motores...
? Erster Coronavirus-Toter in Hongkong – Pfleger streiken #projects #business #jobs #news #community https://t.co/5O3kIqyAe2
Lees "Doorgaan Grote Prijs van China in Formule 1 zeer onzeker" op Nieuwsblik - https://t.co/60z9rObXtn - Lees verder bij de bron van het artikel…
F1-directeur verwacht dat coronavirus 

In [86]:


df.sort_values("created_at")


dtype('O')

In [41]:
dirpath = os.path.join("data-sets", "examples", "hydrated")
translate_dataset(dirpath=dirpath, overwrite_cache=False)

INFO:root:Running parallel gtranslate with cpu count: 96, and threads: 192 available.
INFO:root:Start translation with batch_count: 2, batch_size: 6.
INFO:root:batch_translate: text_count: 6, took: 0.69 seconds.
INFO:root:batch_translate: text_count: 6, took: 0.70 seconds.
INFO:root:Created new processed file in: data-sets/examples/processed/output2020_02_sm.csv.
INFO:root:Running parallel gtranslate with cpu count: 96, and threads: 192 available.
INFO:root:Start translation with batch_count: 192, batch_size: 110.


  translate_dataset(dirpath=dirpath, overwrite_cache=False)


ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
INFO:root:Script encountered status 429: to requests. Waitin for 30 minutes to resume
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requ

True

In [42]:
nums = ["Hallo", "wereld", "dit", "is", "een", "test"]

se = pd.Series(nums)

df = pd.DataFrame()

df["nums"] = se.values
df

0    []
1    []
dtype: object