# Geo analysis

Estimate the geographical region of tweets 

In [1]:
import fasttext
import os
import pandas as pd
import random
import re
import sys
from IPython.display import clear_output

In [2]:
def listToLower(listOfStrings):
    return([x.lower() for x in listOfStrings])

def squeal(text=None):
    clear_output(wait=True)
    if not text is None: print(text)

In [3]:
BELGIUMLIST = "België Belgique Belgium Belgia Belgio Belgien belgium Vlaanderen Belgie".split()
BELGIUMLIST.extend([])
BELGIUMLIST = listToLower(BELGIUMLIST)
NETHERLANDSLIST = "Nederland Netherlands Holanda Nederländerna Nederlandene Pays-Bas 🇳🇱 Holland NL nederland netherlands netherland".split()
NETHERLANDSLIST.extend(["The Netherlands","Pays Bas","Holanda (Países Baixos)","Paesi Bassi","the Netherlands","the netherlands",
                       "Nederland ","The Netherlands ","Kingdom of the Netherlands"])
NETHERLANDSLIST = listToLower(NETHERLANDSLIST)
BELGIUM = "Belgium"
NETHERLANDS = "Netherlands"
OTHER = "Other"
CITIES = "cities"
REGIONS = "regions"
COUNTRIES = "countries"
LOCATION = "location"
REGION = "region"
MUNICIPALITY = "municipality"
COUNTRY = "country"
NIELSEN = "nielsen"
USER = "user"

Sources of municipality names:
* https://nl.wikipedia.org/wiki/Tabel_van_Nederlandse_gemeenten
* https://nl.wikipedia.org/wiki/Lijst_van_gemeenten_in_het_Vlaams_Gewest
* https://nl.wikipedia.org/wiki/Lijst_van_gemeenten_in_het_Waals_Gewest
* https://nl.wikipedia.org/wiki/Lijst_van_gemeenten_in_het_Brussels_Hoofdstedelijk_Gewest
* https://iisg.amsterdam/en/hsn/data/place-names

In [4]:
REGIONFILE = "regions.csv"
MUNICIPALITYFILE = "municipalities.csv"
TOPONYMFILE = "ToponymsNL1812-2012SpatioTemporal.txt"
MUNICIPALITYFILE2 = "MunicipalitiesNL1812-2012SpatioTemporeel.txt"
MUNICIPALITY2 = "Municipality"
PROVINCE = "Province"
TOPONYM = "Toponym (city,town,village,hamlet)"

class nielsen:
    def getNielsenLocation(self,location):
        location = location.lower().strip()
        if location in self.cache: 
            return(self.cache[location])
        if location in BELGIUMLIST:
            self.cache[location] = BELGIUM
            return(self.cache[location])
        if location in NETHERLANDSLIST:
            self.cache[location] = NETHERLANDS
            return(self.cache[location])
        if location in self.municipalitiesLowered: 
            self.cache[location] = self.municipalitiesLowered[location]
            return(self.cache[location])
        if location in self.regionsLowered: 
            self.cache[location] = self.regionsLowered[location]
            return(self.cache[location])
        for splitChar in [","," "]:
            fields = location.split(splitChar)
            for i in range(1,len(fields)):
                locationPart = (splitChar.join(fields[:i])).strip()
                if locationPart in self.municipalitiesLowered: 
                    self.cache[location] = self.municipalitiesLowered[locationPart]
                    return(self.cache[location])
                if locationPart in self.regionsLowered: 
                    self.cache[location] = self.regionsLowered[locationPart]
                    return(self.cache[location])
                locationPart = (splitChar.join(fields[-i:])).strip()
                if locationPart in self.municipalitiesLowered: 
                    self.cache[location] = self.municipalitiesLowered[locationPart]
                    return(self.cache[location])
                if locationPart in self.regionsLowered: 
                    self.cache[location] = self.regionsLowered[locationPart]
                    return(self.cache[location])
        self.cache[location] = location
        return(self.cache[location])
    
    def readCsvFile(inFileName): return(pd.read_csv(inFileName))
    
    def getNielsenPerRegion(df,fieldName):
        return({df.iloc[i][fieldName].lower():",".join([df.iloc[i][COUNTRY],df.iloc[i][NIELSEN]]) 
                for i in range(0,len(df))})
    
    def readToponyms(inFileName,regionsLowered,municipalitiesLowered):
        df = pd.read_csv(inFileName,sep=";")
        for i in range(0,len(df)):
            region = df.iloc[i][PROVINCE].lower()
            toponym = df.iloc[i][TOPONYM].lower()
            municipality = df.iloc[i][MUNICIPALITY2].lower()
            if region in regionsLowered:
                if not toponym in municipalitiesLowered:
                    municipalitiesLowered[toponym] = regionsLowered[region]
                if not municipality in municipalitiesLowered:
                    municipalitiesLowered[municipality] = regionsLowered[region]
        return(municipalitiesLowered)

    def readMunicipalities(inFileName,regionsLowered,municipalitiesLowered):
        df = pd.read_csv(inFileName,sep=";")
        for i in range(0,len(df)):
            region = df.iloc[i][PROVINCE].lower()
            municipality = df.iloc[i][MUNICIPALITY].lower()
            if region in regionsLowered:
                if not municipality in municipalitiesLowered:
                    municipalitiesLowered[municipality] = regionsLowered[region]
        return(municipalitiesLowered)

    cache = {}
    municipalities = readCsvFile(MUNICIPALITYFILE)
    municipalitiesLowered = getNielsenPerRegion(municipalities,MUNICIPALITY)
    regions = readCsvFile(REGIONFILE)
    regionsLowered = getNielsenPerRegion(regions,REGION)
    municipalitiesLowered = readToponyms(TOPONYMFILE,regionsLowered,municipalitiesLowered)
    municipalitiesLowered = readMunicipalities(MUNICIPALITYFILE2,regionsLowered,municipalitiesLowered)

### Count tweets per user

In [5]:
TWEETTEXTDIR = "../data/text/"
FILEPATTERN = "202008"

users = {}
fileList = sorted(os.listdir(TWEETTEXTDIR))
for inFileName in fileList:
    if re.search(FILEPATTERN,inFileName):
        squeal(inFileName)
        usersInFile = pd.read_csv(TWEETTEXTDIR+inFileName).groupby(USER).groups
        for user in usersInFile:
            if not user in users: users[user] = 0
            users[user] += len(usersInFile[user])

20200831-23.out.gz


In [23]:
print("number of tweets:",sum([users[user] for user in users]))

number of tweets: 20314042


## Read country data

In [24]:
INFILENAME = "/home/erikt/tmp/locations-202008.txt"

inFile = open(INFILENAME,"r")
countries = []
screenNames = []
userIds = []
tweets = []
screenNamesSeen = {}
nielsenDistricts = []
nielsenClass = nielsen()
for line in inFile:
    fields = line.strip().split(",")
    userId = fields.pop(-1)
    screenName = fields.pop(-1)
    country = ",".join(fields)
    if screenName in users and not screenName.lower() in screenNamesSeen:
        nielsenDistrict = nielsenClass.getNielsenLocation(country)
        if nielsenDistrict in BELGIUMLIST: nielsenDistrict = BELGIUM
        if nielsenDistrict in NETHERLANDSLIST: nielsenDistrict = NETHERLANDS
        nielsenDistricts.append(nielsenDistrict)
        countries.append(country)
        userIds.append(userId)
        screenNames.append(screenNames)
        tweets.append(users[user])
        screenNamesSeen[screenName.lower()] = True
inFile.close()

In [25]:
len(nielsenDistricts),len(countries),len(screenNames),len(userIds),len(tweets),sum(tweets)

(814621, 814621, 814621, 814621, 814621, 73315890)

In [18]:
groups = pd.DataFrame(nielsenDistricts).groupby(0).groups
{key:len(groups[key]) for key in sorted(groups.keys(),key=lambda k:len(groups[k]),reverse=True) \
                      if re.search("Belgium|Netherlands",key)}

{'Netherlands,Nielsen II': 49049,
 'Netherlands,Nielsen I': 38466,
 'Netherlands,Nielsen IV': 37253,
 'Netherlands': 34767,
 'Netherlands,Nielsen V': 27500,
 'Belgium,Nielsen II': 24284,
 'Netherlands,Nielsen III': 19313,
 'Belgium': 16634,
 'Belgium,Nielsen I': 16410,
 'Belgium,Nielsen III': 6702,
 'Belgium,Nielsen V': 3526,
 'Belgium,Nielsen IV': 465}

In [11]:
# 'Netherlands,Nielsen II': 41814,
# 'Netherlands,Nielsen I': 38846,
# 'Netherlands,Nielsen III': 13891,
# 'Netherlands,Nielsen IV': 28545,
# 'Netherlands,Nielsen V': 23010,
# 'Netherlands': 34767,
    
# 'Belgium,Nielsen II': 22086,
# 'Belgium,Nielsen I': 16423,
# 'Belgium,Nielsen III': 6703,
# 'Belgium,Nielsen IV': 465,
# 'Belgium,Nielsen V': 3593,
# 'Belgium': 16634,

print("Netherlands Nielsen Old:",41814+38846+13891+28545+23010)
print("Netherlands Nielsen New:",49049+38466+37253+27500+19313)
print("gain:",171581/146106)
print("Netherlands all:",171581+34767,";",round(206348/841552,2),"of all users with location information")
print("Belgium all:",24284+16410+6702+3526+465+16634,";",round(68021/841552,2),"of all users with location information")

Netherlands Nielsen Old: 146106
Netherlands Nielsen New: 171581
gain: 1.1743597114423774
Netherlands all: 206348 ; 0.25 of all users
Belgium all: 68021 ; 0.08 of all users


In [10]:
countryTweets = {}
for i in range(0,len(countries)):
    if re.search("Belgium|Netherlands",countries[i]):
        if not countries[i] in countryTweets: countryTweets[countries[i]] = 0
        countryTweets[countries[i]] += tweets[i]
countryTweets

{'Netherlands,Nielsen V': 2475000,
 'Netherlands,Nielsen IV': 3352770,
 'Netherlands,Nielsen II': 4414410,
 'Netherlands,Nielsen III': 1738170,
 'Belgium,Nielsen I': 1476900,
 'Netherlands': 3129030,
 'Netherlands,Nielsen I': 3461940,
 'Belgium,Nielsen II': 2185560,
 'Belgium,Nielsen III': 603180,
 'Belgium': 1497060,
 'Belgium,Nielsen V': 317340,
 'Belgium,Nielsen IV': 41850}

In [12]:
print("Netherlands Nielsen:",2475000+3352770+4414410+1738170+3461940)
print("Netherlands all:",15442290+3129030,";",round(18571320/75739680,2),"of all tweets with location information")
print("Belgium all:",1476900+2185560+603180+317340+41850+1497060,";",round(6121890/75739680,2),"of all tweets with location information")

Netherlands Nielsen: 15442290
Netherlands all: 18571320 ; 0.25 of all tweets with location information
Belgium all: 6121890 ; 0.08 of all tweets with location information


In [22]:
rest = {key:len(groups[key]) for key in sorted(groups.keys(),key=lambda k:len(groups[k]),reverse=True) \
                             if not re.search("Belgium|Netherlands",key)}
restTotal = sum([rest[key] for key in rest])
print(restTotal)
rest

567183


{'she/her': 7020,
 'south africa': 3313,
 'france': 2724,
 'united states': 2471,
 'paris, france': 2330,
 '': 2325,
 'london, england': 2019,
 'india': 1979,
 'rio de janeiro, brasil': 1930,
 'brasil': 1791,
 'indonesia': 1781,
 'méxico': 1635,
 'são paulo, brasil': 1547,
 'argentina': 1481,
 'cape town, south africa': 1446,
 'johannesburg, south africa': 1441,
 'lagos, nigeria': 1438,
 'buenos aires, argentina': 1394,
 'los angeles, ca': 1386,
 'earth': 1179,
 'nigeria': 1140,
 'united kingdom': 1091,
 'malaysia': 1089,
 'deutschland': 1084,
 'london': 1060,
 'i̇stanbul, türkiye': 1049,
 'pretoria, south africa': 1035,
 'california, usa': 1000,
 'england, united kingdom': 994,
 'ile-de-france, france': 967,
 '📍 spain 🇪🇸': 937,
 '📍 españa 🇪🇸': 910,
 'españa': 902,
 'germany': 847,
 'he/him': 836,
 'canada': 818,
 'atlanta, ga': 816,
 'chicago, il': 799,
 'usa': 771,
 'colombia': 759,
 'houston, tx': 756,
 'paris': 745,
 'chile': 742,
 'europe': 724,
 'senegal': 721,
 'accra, ghana': 7

## Run fasttext experiment per tweet

In [None]:
LABELPREFIX = "__label__"
LARGEINT = 9999999999
TRAIN = "TRAIN"+str(int(random.random()*LARGEINT))
TEST = "TEST"+str(int(random.random()*LARGEINT))
VALIDATION = "VALIDATION"+str(int(random.random()*LARGEINT))

trainFile = open(TRAIN,"w")
testFile = open(TEST,"w")
validationFile = open(VALIDATION,"w")
validationData = []
testData = []
for i in range(0,round(len(countries)/10)):
    print(LABELPREFIX+countries[i],texts[i],file=testFile)
    testData.append(LABELPREFIX+countries[i]+" "+texts[i])
for i in range(round(len(countries)/10),round(2*len(countries)/10)):
    print(LABELPREFIX+countries[i],texts[i],file=validationFile)
    validationData.append(LABELPREFIX+countries[i]+" "+texts[i])
for i in range(round(2*len(countries)/10),len(countries)):
    print(LABELPREFIX+countries[i],texts[i],file=trainFile)
validationFile.close()
testFile.close()
trainFile.close()

In [None]:
for dim in [10,20,50,100,200,300]:
    for epoch in [10,20,50,100,200,300]:
        for lr in [0.05,0.1,0.2]:
            model = fasttext.train_supervised(TRAIN,dim=dim,epoch=epoch,lr=lr)
            print(dim,epoch,lr,model.test(VALIDATION))

In [None]:
10 10 0.05 (96576, 0.7760313121272365, 0.7760313121272365)
10 10 0.1 (96576, 0.7733701954937044, 0.7733701954937044)
10 10 0.2 (96576, 0.7715788601722995, 0.7715788601722995)
10 20 0.05 (96576, 0.7695079522862823, 0.7695079522862823)
10 20 0.1 (96576, 0.7672506626905236, 0.7672506626905236)
10 20 0.2 (96576, 0.7629742379058979, 0.7629742379058979)
10 50 0.05 (96576, 0.7603545394300861, 0.7603545394300861)
10 50 0.1 (96576, 0.7594640490390987, 0.7594640490390987)
10 50 0.2 (96576, 0.7574656229290921, 0.7574656229290921)
10 100 0.05 (96576, 0.7555396785950961, 0.7555396785950961)
10 100 0.1 (96576, 0.7547423790589795, 0.7547423790589795)
10 100 0.2 (96576, 0.7534377070907886, 0.7534377070907886)
10 200 0.05 (96576, 0.7507558813783963, 0.7507558813783963)
10 200 0.1 (96576, 0.7494719184890656, 0.7494719184890656)
10 200 0.2 (96576, 0.7495651093439364, 0.7495651093439364)
10 300 0.05 (96576, 0.7471732107355865, 0.7471732107355865)
10 300 0.1 (96576, 0.7471317925778661, 0.7471317925778661)
10 300 0.2 (96576, 0.7491923459244533, 0.7491923459244533)
20 10 0.05 (96576, 0.777884774685222, 0.777884774685222)
20 10 0.1 (96576, 0.7735255135851558, 0.7735255135851558)
20 10 0.2 (96576, 0.7726143141153081, 0.7726143141153081)
20 20 0.05 (96576, 0.7678305168986084, 0.7678305168986084)
20 20 0.1 (96576, 0.7667536447978794, 0.7667536447978794)
20 20 0.2 (96576, 0.763823310139165, 0.763823310139165)
20 50 0.05 (96576, 0.7601681577203446, 0.7601681577203446)
20 50 0.1 (96576, 0.7599507123923128, 0.7599507123923128)
20 50 0.2 (96576, 0.758169731610338, 0.758169731610338)
20 100 0.05 (96576, 0.7558088966202783, 0.7558088966202783)
20 100 0.1 (96576, 0.7542867793240556, 0.7542867793240556)
20 100 0.2 (96576, 0.7545663518886679, 0.7545663518886679)
20 200 0.05 (96576, 0.7506419814446653, 0.7506419814446653)
20 200 0.1 (96576, 0.7496272365805169, 0.7496272365805169)
20 200 0.2 (96576, 0.7499482273028496, 0.7499482273028496)
20 300 0.05 (96576, 0.7481361829025845, 0.7481361829025845)
20 300 0.1 (96576, 0.7469143472498343, 0.7469143472498343)
20 300 0.2 (96576, 0.7488920642809808, 0.7488920642809808)
50 10 0.05 (96576, 0.7768079025844931, 0.7768079025844931)
50 10 0.1 (96576, 0.7724175778661365, 0.7724175778661365)
50 10 0.2 (96576, 0.7727696322067594, 0.7727696322067594)
50 20 0.05 (96576, 0.7678822895957588, 0.7678822895957588)
50 20 0.1 (96576, 0.7673438535453942, 0.7673438535453942)
50 20 0.2 (96576, 0.764775927766733, 0.764775927766733)
50 50 0.05 (96576, 0.760696239231279, 0.760696239231279)
50 50 0.1 (96576, 0.7589463220675944, 0.7589463220675944)
50 50 0.2 (96576, 0.7583664678595096, 0.7583664678595096)
50 100 0.05 (96576, 0.7560159874088801, 0.7560159874088801)
50 100 0.1 (96576, 0.7537690523525513, 0.7537690523525513)
50 100 0.2 (96576, 0.7543903247183565, 0.7543903247183565)
50 200 0.05 (96576, 0.74879887342611, 0.74879887342611)
50 200 0.1 (96576, 0.7493166003976143, 0.7493166003976143)
50 200 0.2 (96576, 0.7503002816434725, 0.7503002816434725)
50 300 0.05 (96576, 0.7481361829025845, 0.7481361829025845)
50 300 0.1 (96576, 0.7482811464546058, 0.7482811464546058)
50 300 0.2 (96576, 0.7503002816434725, 0.7503002816434725)
100 10 0.05 (96576, 0.7770564115308151, 0.7770564115308151)
100 10 0.1 (96576, 0.7752236580516899, 0.7752236580516899)
100 10 0.2 (96576, 0.7721069416832339, 0.7721069416832339)
100 20 0.05 (96576, 0.7670849900596421, 0.7670849900596421)
100 20 0.1 (96576, 0.7645584824387012, 0.7645584824387012)
100 20 0.2 (96576, 0.7649726640159046, 0.7649726640159046)
100 50 0.05 (96576, 0.7614831842279656, 0.7614831842279656)
100 50 0.1 (96576, 0.7600231941683234, 0.7600231941683234)
100 50 0.2 (96576, 0.7591534128561962, 0.7591534128561962)
100 100 0.05 (96576, 0.7554154241219351, 0.7554154241219351)
100 100 0.1 (96576, 0.7544731610337972, 0.7544731610337972)
100 100 0.2 (96576, 0.7547423790589795, 0.7547423790589795)
100 200 0.05 (96576, 0.7503106361829026, 0.7503106361829026)
100 200 0.1 (96576, 0.7503831179589132, 0.7503831179589132)
100 200 0.2 (96576, 0.7509526176275679, 0.7509526176275679)
100 300 0.05 (96576, 0.7479394466534128, 0.7479394466534128)
100 300 0.1 (96576, 0.7476081013916501, 0.7476081013916501)
100 300 0.2 (96576, 0.7490266732935719, 0.7490266732935719)
200 10 0.05 (96576, 0.7748301855533466, 0.7748301855533466)
200 10 0.1 (96576, 0.7721897779986746, 0.7721897779986746)
200 10 0.2 (96576, 0.7729145957587806, 0.7729145957587806)
200 20 0.05 (96576, 0.7685553346587144, 0.7685553346587144)
200 20 0.1 (96576, 0.7664533631544069, 0.7664533631544069)
200 20 0.2 (96576, 0.7651072730284957, 0.7651072730284957)
200 50 0.05 (96576, 0.7612036116633533, 0.7612036116633533)
200 50 0.1 (96576, 0.7594122763419483, 0.7594122763419483)
200 50 0.2 (96576, 0.7585735586481114, 0.7585735586481114)
200 100 0.05 (96576, 0.7555500331345262, 0.7555500331345262)
200 100 0.1 (96576, 0.7531995526838966, 0.7531995526838966)
200 100 0.2 (96576, 0.7537069251159708, 0.7537069251159708)
200 200 0.05 (96576, 0.7505694996686547, 0.7505694996686547)
200 200 0.1 (96576, 0.7487160371106694, 0.7487160371106694)
200 200 0.2 (96576, 0.7502795725646123, 0.7502795725646123)
200 300 0.05 (96576, 0.7476391650099403, 0.7476391650099403)
200 300 0.1 (96576, 0.7474113651424784, 0.7474113651424784)
200 300 0.2 (96576, 0.7491923459244533, 0.7491923459244533)
300 10 0.05 (96576, 0.7755653578528827, 0.7755653578528827)
300 10 0.1 (96576, 0.7753479125248509, 0.7753479125248509)
300 10 0.2 (96576, 0.7723865142478462, 0.7723865142478462)
300 20 0.05 (96576, 0.767312789927104, 0.767312789927104)
300 20 0.1 (96576, 0.7669917992047713, 0.7669917992047713)
300 20 0.2 (96576, 0.7659977634194831, 0.7659977634194831)
300 50 0.05 (96576, 0.7603648939695162, 0.7603648939695162)
300 50 0.1 (96576, 0.7593501491053678, 0.7593501491053678)
300 50 0.2 (96576, 0.7592880218687873, 0.7592880218687873)
300 100 0.05 (96576, 0.755280815109344, 0.755280815109344)
300 100 0.1 (96576, 0.7545974155069582, 0.7545974155069582)
300 100 0.2 (96576, 0.754328197481776, 0.754328197481776)
300 200 0.05 (96576, 0.7500621272365805, 0.7500621272365805)
300 200 0.1 (96576, 0.7499275182239894, 0.7499275182239894)
300 200 0.2 (96576, 0.7490266732935719, 0.7490266732935719)
300 300 0.05 (96576, 0.7475045559973492, 0.7475045559973492)
300 300 0.1 (96576, 0.7483950463883366, 0.7483950463883366)
300 300 0.2 (96576, 0.7492337640821736, 0.7492337640821736)

best: 20 10 0.05 (96576, 0.777884774685222, 0.777884774685222) ...

In [None]:
os.unlink(TRAIN)
os.unlink(TEST)
os.unlink(VALIDATION)

## Run fasttext experiment per user

In [None]:
userDict = {}
for i in range(0,len(users)):
    user = users[i]
    if user in userDict: userDict[user] += " "+texts[i]
    else: userDict[user] = LABELPREFIX+countries[i]+" "+texts[i]

In [None]:
trainFile = open(TRAIN,"w")
testFile = open(TEST,"w")
validationFile = open(VALIDATION,"w")
userTextList = list(userDict.values())
for i in range(0,round(len(userTextList)/10)):
    print(userTextList[i],file=testFile)
for i in range(round(len(userTextList)/10),round(2*len(userTextList)/10)):
    print(userTextList[i],file=validationFile)
for i in range(round(2*len(userTextList)/10),len(userTextList)):
    print(userTextList[i],file=trainFile)
validationFile.close()
testFile.close()
trainFile.close()

In [None]:
for dim in [10,20,50,100,200,300]:
    for epoch in [10,20,50,100,200,300]:
        for lr in [0.05,0.1,0.2]:
            model = fasttext.train_supervised(TRAIN,dim=dim,epoch=epoch,lr=lr)
            print(dim,epoch,lr,model.test(VALIDATION))

10 10 0.05 (10432, 0.8002300613496932, 0.8002300613496932)
10 10 0.1 (10432, 0.80329754601227, 0.80329754601227)
10 10 0.2 (10432, 0.8001342024539877, 0.8001342024539877)
10 20 0.05 (10432, 0.8029141104294478, 0.8029141104294478)
10 20 0.1 (10432, 0.8008052147239264, 0.8008052147239264)
10 20 0.2 (10432, 0.7972584355828221, 0.7972584355828221)
10 50 0.05 (10432, 0.7994631901840491, 0.7994631901840491)
10 50 0.1 (10432, 0.7991756134969326, 0.7991756134969326)
10 50 0.2 (10432, 0.7951495398773006, 0.7951495398773006)
10 100 0.05 (10432, 0.7986004601226994, 0.7986004601226994)
10 100 0.1 (10432, 0.7962039877300614, 0.7962039877300614)
10 100 0.2 (10432, 0.7950536809815951, 0.7950536809815951)
10 200 0.05 (10432, 0.7951495398773006, 0.7951495398773006)
10 200 0.1 (10432, 0.7957246932515337, 0.7957246932515337)
10 200 0.2 (10432, 0.7941909509202454, 0.7941909509202454)
10 300 0.05 (10432, 0.7958205521472392, 0.7958205521472392)
10 300 0.1 (10432, 0.7950536809815951, 0.7950536809815951)
10 300 0.2 (10432, 0.7937116564417178, 0.7937116564417178)
20 10 0.05 (10432, 0.8030099693251533, 0.8030099693251533)
20 10 0.1 (10432, 0.8061733128834356, 0.8061733128834356)
20 10 0.2 (10432, 0.8024348159509203, 0.8024348159509203)
20 20 0.05 (10432, 0.8048312883435583, 0.8048312883435583)
20 20 0.1 (10432, 0.8005176380368099, 0.8005176380368099)
20 20 0.2 (10432, 0.7969708588957055, 0.7969708588957055)
20 50 0.05 (10432, 0.8002300613496932, 0.8002300613496932)
20 50 0.1 (10432, 0.7991756134969326, 0.7991756134969326)
20 50 0.2 (10432, 0.7965874233128835, 0.7965874233128835)
20 100 0.05 (10432, 0.7987921779141104, 0.7987921779141104)
20 100 0.1 (10432, 0.7958205521472392, 0.7958205521472392)
20 100 0.2 (10432, 0.7965874233128835, 0.7965874233128835)
20 200 0.05 (10432, 0.7976418711656442, 0.7976418711656442)
20 200 0.1 (10432, 0.7960122699386503, 0.7960122699386503)
20 200 0.2 (10432, 0.7944785276073619, 0.7944785276073619)
20 300 0.05 (10432, 0.7952453987730062, 0.7952453987730062)
20 300 0.1 (10432, 0.7962039877300614, 0.7962039877300614)
20 300 0.2 (10432, 0.7940950920245399, 0.7940950920245399)
50 10 0.05 (10432, 0.8045437116564417, 0.8045437116564417)
50 10 0.1 (10432, 0.8061733128834356, 0.8061733128834356)
50 10 0.2 (10432, 0.8020513803680982, 0.8020513803680982)
50 20 0.05 (10432, 0.8047354294478528, 0.8047354294478528)
50 20 0.1 (10432, 0.8006134969325154, 0.8006134969325154)
50 20 0.2 (10432, 0.7985046012269938, 0.7985046012269938)
50 50 0.05 (10432, 0.8006134969325154, 0.8006134969325154)
50 50 0.1 (10432, 0.7986004601226994, 0.7986004601226994)
50 50 0.2 (10432, 0.7957246932515337, 0.7957246932515337)
50 100 0.05 (10432, 0.7958205521472392, 0.7958205521472392)
50 100 0.1 (10432, 0.7980253067484663, 0.7980253067484663)
50 100 0.2 (10432, 0.7959164110429447, 0.7959164110429447)
50 200 0.05 (10432, 0.7979294478527608, 0.7979294478527608)
50 200 0.1 (10432, 0.7975460122699386, 0.7975460122699386)
50 200 0.2 (10432, 0.7962039877300614, 0.7962039877300614)
50 300 0.05 (10432, 0.7956288343558282, 0.7956288343558282)
50 300 0.1 (10432, 0.7940950920245399, 0.7940950920245399)
50 300 0.2 (10432, 0.7943826687116564, 0.7943826687116564)
100 10 0.05 (10432, 0.803489263803681, 0.803489263803681)
100 10 0.1 (10432, 0.8043519938650306, 0.8043519938650306)
100 10 0.2 (10432, 0.8009969325153374, 0.8009969325153374)
100 20 0.05 (10432, 0.8020513803680982, 0.8020513803680982)
100 20 0.1 (10432, 0.7983128834355828, 0.7983128834355828)
100 20 0.2 (10432, 0.7984087423312883, 0.7984087423312883)
100 50 0.05 (10432, 0.8000383435582822, 0.8000383435582822)
100 50 0.1 (10432, 0.7992714723926381, 0.7992714723926381)
100 50 0.2 (10432, 0.796875, 0.796875)
100 100 0.05 (10432, 0.7964915644171779, 0.7964915644171779)
100 100 0.1 (10432, 0.7955329754601227, 0.7955329754601227)
100 100 0.2 (10432, 0.7960122699386503, 0.7960122699386503)
100 200 0.05 (10432, 0.7960122699386503, 0.7960122699386503)
100 200 0.1 (10432, 0.7950536809815951, 0.7950536809815951)
100 200 0.2 (10432, 0.7964915644171779, 0.7964915644171779)
100 300 0.05 (10432, 0.796875, 0.796875)
100 300 0.1 (10432, 0.7941909509202454, 0.7941909509202454)
100 300 0.2 (10432, 0.7929447852760736, 0.7929447852760736)
200 10 0.05 (10432, 0.8029141104294478, 0.8029141104294478)
200 10 0.1 (10432, 0.8048312883435583, 0.8048312883435583)
200 10 0.2 (10432, 0.8005176380368099, 0.8005176380368099)
200 20 0.05 (10432, 0.8028182515337423, 0.8028182515337423)
200 20 0.1 (10432, 0.8020513803680982, 0.8020513803680982)
200 20 0.2 (10432, 0.7971625766871165, 0.7971625766871165)
200 50 0.05 (10432, 0.8004217791411042, 0.8004217791411042)
200 50 0.1 (10432, 0.7962039877300614, 0.7962039877300614)
200 50 0.2 (10432, 0.7985046012269938, 0.7985046012269938)
200 100 0.05 (10432, 0.7976418711656442, 0.7976418711656442)
200 100 0.1 (10432, 0.7982170245398773, 0.7982170245398773)
200 100 0.2 (10432, 0.7964915644171779, 0.7964915644171779)
200 200 0.05 (10432, 0.796875, 0.796875)
200 200 0.1 (10432, 0.7948619631901841, 0.7948619631901841)
200 200 0.2 (10432, 0.7963957055214724, 0.7963957055214724)
200 300 0.05 (10432, 0.7972584355828221, 0.7972584355828221)
200 300 0.1 (10432, 0.7948619631901841, 0.7948619631901841)
200 300 0.2 (10432, 0.7947661042944786, 0.7947661042944786)
300 10 0.05 (10432, 0.8022430981595092, 0.8022430981595092)
300 10 0.1 (10432, 0.8040644171779141, 0.8040644171779141)
300 10 0.2 (10432, 0.8007093558282209, 0.8007093558282209)
300 20 0.05 (10432, 0.8043519938650306, 0.8043519938650306)
300 20 0.1 (10432, 0.8002300613496932, 0.8002300613496932)
300 20 0.2 (10432, 0.7974501533742331, 0.7974501533742331)
300 50 0.05 (10432, 0.7983128834355828, 0.7983128834355828)
300 50 0.1 (10432, 0.7974501533742331, 0.7974501533742331)
300 50 0.2 (10432, 0.7931365030674846, 0.7931365030674846)
300 100 0.05 (10432, 0.7993673312883436, 0.7993673312883436)
300 100 0.1 (10432, 0.7957246932515337, 0.7957246932515337)
300 100 0.2 (10432, 0.7939033742331288, 0.7939033742331288)
300 200 0.05 (10432, 0.7944785276073619, 0.7944785276073619)
300 200 0.1 (10432, 0.7956288343558282, 0.7956288343558282)
300 200 0.2 (10432, 0.7963957055214724, 0.7963957055214724)
300 300 0.05 (10432, 0.7967791411042945, 0.7967791411042945)
300 300 0.1 (10432, 0.7944785276073619, 0.7944785276073619)
300 300 0.2 (10432, 0.7945743865030674, 0.7945743865030674)

## Check friends of users

In [None]:
COUNTRYFILE = "/home/erikt/tmp/countries-202009-week1.txt"
UNKNOWN = "-"

idStrs = {}
screenNames = {}
countries = {}
inFile = open(COUNTRYFILE,"r")
for line in inFile:
    fields = line.strip().split(",")
    userId = fields.pop(-1)
    screenName = fields.pop(-1)
    country = ",".join(fields)
    screenNames[userId] = screenName
    idStrs[screenName] = userId
    if country in BELGIUMLIST: 
        if not userId in countries: countries[userId] = [BELGIUM]
        elif not BELGIUM in countries[userId]: countries[userId].append(BELGIUM) 
    elif country in NETHERLANDSLIST: 
        if not userId in countries: countries[userId] = [NETHERLANDS]
        elif not NETHERLANDS in countries[userId]: countries[userId].append(NETHERLANDS) 
    elif country != UNKNOWN:
        if not userId in countries: countries[userId] = [OTHER]
        elif not OTHER in countries[userId]: countries[userId].append(OTHER)         
inFile.close()

In [None]:
FRIENDSFILE = "getFriends.py.out"

countriesFound = {}
inFile = open(FRIENDSFILE,"r")
countriesFound = {}
for line in inFile:
    userIds = line.strip().split()
    screenName = userIds.pop(0)
    countriesFound[screenName] = {}
    for userId in userIds:
        if userId in countries:
            countryString = " ".join(sorted(countries[userId]))
            if not countryString in countriesFound[screenName]: countriesFound[screenName][countryString] = 0
            countriesFound[screenName][countryString] += 1
    countriesFound[screenName] = {k:countriesFound[screenName][k] 
                                  for k in sorted(countriesFound[screenName].keys(),
                                                  key=lambda k:countriesFound[screenName][k],reverse=True)}
inFile.close()

In [None]:
MINFRIENDS = 5
MINFACTOR = 2

correctGuesses = 0
incorrectGuesses = 0
for screenName in countriesFound:
    if screenName in idStrs and idStrs[screenName] in countries:
        totalLabeledFriends = sum(countriesFound[screenName].values())
        if totalLabeledFriends >= MINFRIENDS:
            guessedLabel = UNKNOWN
            topLabel = list(countriesFound[screenName].keys())[0]
            if len(countriesFound[screenName]) == 1: guessedLabel = topLabel
            else:
                secondLabel = list(countriesFound[screenName].keys())[1]
                if countriesFound[screenName][topLabel] >= MINFACTOR*countriesFound[screenName][secondLabel]: guessedLabel = topLabel
            if guessedLabel != UNKNOWN:
                if guessedLabel in countries[idStrs[screenName]]: correctGuesses += 1
                else: 
                    incorrectGuesses += 1
                    print(guessedLabel,countries[idStrs[screenName]])
print(f"{correctGuesses/(correctGuesses+incorrectGuesses)} {correctGuesses} {incorrectGuesses}")