In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import glob
from collections import Counter
import math
import re
import json
import subprocess
import time
import pickle

import pandas as pd
import umap
from tqdm.autonotebook import tqdm, trange
from nltk.tokenize import word_tokenize

import numpy as np
import scipy.stats
from sklearn.datasets import load_iris, load_digits
from sklearn.model_selection import train_test_split, KFold

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score
from sklearn.utils import class_weight

from tensorflow import keras

import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
#assert gpus
try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)

    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)


import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.utils import class_weight
layers = keras.layers
models = keras.models
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
import numpy.random as rng
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import ModelCheckpoint

import convert_model

import language
import text_nn
import grab_category
import news
import groups
import libs.cpp_stuff as cpp

In [4]:
lang = "en"
train_folders = [os.path.join("data", folder) for folder in ["sample", "sample2", "sample3", "sample4", "sample5", "sample6"]]
file_info = []
for folder in train_folders:
    file_info.extend(language.read_dump(f"{folder}/langs/{lang}"))

In [57]:
cities_data = pd.read_excel("data/WUP2018-F12-Cities_Over_300K.xls", skiprows=16)
geo_to_country = dict(zip(cities_data["Urban Agglomeration"], cities_data["Country or area"]))
for geo, country in geo_to_country.items():
    if "(" in country:
        country = country.split(" (")[0]
    
    geo_to_country[geo] = country
    
geo_to_country["USA"] = 'United States of America'
geo_to_country["United States"] = 'United States of America'
geo_to_country["U.S."] = 'United States of America'
geo_to_country["New York"] = 'United States of America'
geo_to_country["UK"] = 'United Kingdom'
geo_to_country["Bosnia"] = 'Bosnia and Herzegovina'
geo_to_country["Hong Kong"] = 'China, Hong Kong SAR'
geo_to_country["Macao"] = 'China, Macao SAR'
geo_to_country["Taiwan"] = 'China, Taiwan Province of China'
geo_to_country["Russia"] = 'Russian Federation'
geo_to_country["Syria"] = 'Syrian Arab Republic'
geo_to_country["Vietnam"] = 'Viet Nam'
geo_to_country["crore"] = 'India'

for country in list(geo_to_country.values()):
    geo_to_country[country] = country

for geo, country in list(geo_to_country.items()):
    if "(" in geo:
        del geo_to_country[geo]
        geo1 = geo.split("(")[0].strip()
        geo_to_country[geo1] = country
        geo2 = geo.split("(")[1].replace(")", "").replace("incl. ", "").replace("including ", "")
        geo_to_country[geo2] = country
        
    if ", " in geo and "China" not in geo:
        del geo_to_country[geo]
        for n in geo.split(", "):
            geo_to_country[n] = country     
            
for geo in ["Van", "Tours"]:
    del geo_to_country[geo]

In [58]:
country_regex = {c: "" for c in geo_to_country.values()}
for geo, country in geo_to_country.items():
    if geo[-1] == ".":
        geo = geo[:-1]
        
    country_regex[country] += "|" + geo
    
for country, regex in country_regex.items():
    country_regex[country] = re.compile(r"\b(" + regex[1:].replace(".", "\.") + r")n?\b", re.IGNORECASE)
    
country_regex

{'Afghanistan': re.compile(r'\b(Herat|Kabul|Kandahar|Mazar-e Sharif|Afghanistan)n?\b',
 re.IGNORECASE|re.UNICODE),
 'Algeria': re.compile(r'\b(Annaba|Batna|Blida|El Djelfa|Qacentina|Sétif|Algeria|El Djazaïr|Algiers|Wahran|Oran)n?\b',
 re.IGNORECASE|re.UNICODE),
 'Angola': re.compile(r'\b(Benguela|Cabinda|Cuito|Huambo|Lobito|Luanda|Lubango|Malanje|Uige|Angola)n?\b',
 re.IGNORECASE|re.UNICODE),
 'Argentina': re.compile(r'\b(Bahia Blanca|Buenos Aires|Corrientes|La Plata|Mar Del Plata|Mendoza|Neuquén-Plottier-Cipolletti|Posadas|Resistencia|Rosario|Salta|San Miguel de Tucumán|San Salvador de Jujuy|Santa Fe|Santiago Del Estero|Argentina)n?\b',
 re.IGNORECASE|re.UNICODE),
 'Mexico': re.compile(r'\b(Córdoba|Acapulco de Juárez|Aguascalientes|Cancún|Celaya|Chihuahua|Ciudad Juárez|Ciudad Obregón|Ciudad Victoria|Coatzacoalcos|Colima|Cuautla Morelos|Cuernavaca|Culiacán|Durango|Ensenada|Guadalajara|Hermosillo|Irapuato|La Laguna|León de los Aldamas|Los Mochis|Matamoros|Mazatlán|Mérida|Mexicali|Minati

In [64]:
source_country = {}
for fi in tqdm(file_info):
    if fi.site not in source_country:
        source_country[fi.site] = Counter()
        
    for country, regex in country_regex.items():
        if regex.search(fi.text + " " + fi.title):
            source_country[fi.site][country] += 1

HBox(children=(FloatProgress(value=0.0, max=394408.0), HTML(value='')))




In [80]:
country_list = list(country_regex.keys())
source_country_frequency = {}
for source, countries in source_country.items():
    total = sum(countries.values())
    source_country_frequency[source] = [countries[c] / total if total > 0 else 0 for c in country_list]

In [None]:
for source, freqs in list(source_country_frequency.items())[:50]:
    print(source)
    countries_by_freq = list(zip(country_list, freqs))
    countries_by_freq.sort(key=lambda x: x[1], reverse=True)
    for c, f in countries_by_freq[:5]:
        print(f"\t{c}: {f}")

In [77]:
with open("data/source_country.tsv", "w") as f:
    for source, freqs in source_country_frequency.items():
        countries_by_freq = list(zip(country_list, freqs))
        countries_by_freq.sort(key=lambda x: x[1], reverse=True)
        top_country = countries_by_freq[0][0]
        f.write(f"{source}\t{top_country}\n")

In [83]:
with open("data/source_country_frequencies.json", "w") as f:
    json.dump([source_country_frequency, country_list], f, indent=2)