In [1]:
from tqdm import tqdm
import spacy
import pandas as pd
from spacytextblob.spacytextblob import SpacyTextBlob
import fasttext
from huggingface_hub import hf_hub_download
import numpy as np

# spaCy English pipeline with the "spacytextblob" component appended to it.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("spacytextblob")

# Turn on tqdm's pandas integration.
tqdm.pandas()

# Fetch the fastText language-identification model file from the Hugging Face
# Hub, then load it from the path the download call returns.
LID_REPO_ID = "facebook/fasttext-language-identification"
LID_FILENAME = "model.bin"

model_path = hf_hub_download(repo_id=LID_REPO_ID, filename=LID_FILENAME)
model = fasttext.load_model(model_path)

def get_probs(text, model, target_label="__label__eng_Latn", k=157):
    """Return the probability `model` assigns to `target_label` for `text`.

    Args:
        text: Text to classify. Newline characters are replaced with single
            spaces before the text is handed to the model. A non-string value
            (for example the NaN pandas produces for an empty CSV cell, or
            None) is treated as missing and is never sent to the model.
        model: Object exposing ``predict(text, k=...)`` that returns a
            ``(labels, probabilities)`` pair of equal-length sequences.
        target_label: Label whose probability is looked up. Defaults to the
            English label used throughout this notebook.
        k: Number of top labels requested from the model. The default keeps
            the value that was previously hard-coded.

    Returns:
        float: The probability of `target_label`; 0.0 when the label is not
        among the `k` labels returned; NaN when `text` is not a string, so
        that aggregations such as ``Series.mean()`` skip the missing row
        instead of counting it as "not English".
    """
    if not isinstance(text, str):
        return float("nan")

    # The model is queried with a single line, so fold newlines into spaces.
    single_line = " ".join(text.split("\n"))
    langs, probs = model.predict(single_line, k=k)

    # Normalise to a list so `.index` works whatever sequence type comes back.
    langs = list(langs)
    if target_label in langs:
        return float(probs[langs.index(target_label)])
    return 0.0

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# For every (locale, domain, task) combination, print one " | "-separated row:
# the train/valid/test row counts, the share of rows whose label equals 1, and
# the mean English probability of the "text" column over all three splits.
for loc in ("au", "in", "uk"):
    for dom in ("google", "reddit"):
        for task in ("sentiment", "sarcasm"):
            size, parts = {}, []
            for split in ("train", "valid", "test"):
                # NOTE(review): decoding as ASCII with errors ignored drops
                # every non-ASCII byte before language identification runs;
                # confirm this is intended, since it may inflate the English
                # score for text that contains other scripts.
                data = pd.read_csv(
                    f"./splits/{task.title()}/en-{loc.upper()}/{dom.title()}/{split}.csv",
                    encoding="ascii",
                    encoding_errors="ignore",
                )
                parts.append(data)
                size[split] = len(data)

            # Stack the three splits into one frame with a fresh 0..n-1 index.
            df = pd.concat(parts).reset_index(drop=True)

            # Mean probability of English across every row of the combined frame.
            lang = df["text"].apply(get_probs, args=(model,)).mean()

            # Fraction of rows labelled 1 (0 when that label never occurs).
            positive_count = df["label"].value_counts().get(1, 0)
            positive_rate = positive_count / len(df)

            print(
                f"{loc.upper()} | {dom.title()} | {task.title()} | {size['train']} | {size['valid']} | {size['test']} |",
                positive_rate,
                f"| {lang}",
            )

AU | Google | Sentiment | 946 | 130 | 270 | 0.7347696879643388 | 0.9983530515281797
AU | Google | Sarcasm | 946 | 130 | 270 | 0.07280832095096583 | 0.9983530515281797
AU | Reddit | Sentiment | 1763 | 241 | 501 | 0.3193612774451098 | 0.9787314457212765
AU | Reddit | Sarcasm | 1763 | 241 | 501 | 0.42035928143712575 | 0.9787314457212765
IN | Google | Sentiment | 1648 | 225 | 469 | 0.7480785653287788 | 0.992057483665715
IN | Google | Sarcasm | 1647 | 225 | 469 | 0.007689021785561726 | 0.9920541094645431
IN | Reddit | Sentiment | 1685 | 230 | 479 | 0.2543859649122807 | 0.8755089748438211
IN | Reddit | Sarcasm | 1686 | 230 | 479 | 0.13319415448851774 | 0.8755605909933547
UK | Google | Sentiment | 1817 | 248 | 517 | 0.7482571649883811 | 0.9990422319617962
UK | Google | Sarcasm | 1821 | 249 | 518 | 0.00115919629057187 | 0.9990438435002125
UK | Reddit | Sentiment | 1007 | 138 | 287 | 0.11452513966480447 | 0.9749663745411852
UK | Reddit | Sarcasm | 1031 | 141 | 294 | 0.2203274215552524 | 0.97552