In [2]:
import os
import pandas as pd
from tqdm import tqdm
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import wikipediaapi
wiki_wiki = wikipediaapi.Wikipedia('en')

In [3]:
def get_articles(cat_main):
    
    articles_all = []
    
    for p in cat_main.categorymembers.values():

        if p.namespace == wikipediaapi.Namespace.CATEGORY and ('painter' in p.title or 'Painter' in p.title):
            articles_all.extend(get_articles(p))
        
        elif p.namespace == wikipediaapi.Namespace.MAIN:
            articles_all.append(p)

    return articles_all

In [4]:
def get_df_for_cat_value(value_cat_lst, clean_func, valueName):
    
    pages_all = []
    pages_name_all = []
    value_all = []

    for value in tqdm(value_cat_lst):
        
        pages_oneV = get_articles(value)
        pages_name_oneV = [p.title for p in pages_oneV]

        value_clean = clean_func(value)
        value_oneV = [value_clean for p in pages_oneV]

           
        pages_all.extend(pages_oneV)
        pages_name_all.extend(pages_name_oneV)
        value_all.extend(value_oneV)
    
    print('Number of pages before deleting duplicates: ', len(pages_all))
    
    columns = ['page', 'page name', valueName]
    df = pd.DataFrame(columns=columns) 

    df['page'] = pages_all
    df['page name'] = pages_name_all
    df[valueName] = value_all
    
    df = df.groupby('page name').agg({'page':'first', 
                                       valueName: set}).reset_index()
    
    df[valueName] = df[valueName].apply(list)

    
    return df

## Nationalities

In [5]:
def clean_nationality(nat_cat):
    nat = nat_cat.title
    
    nat = nat.split('Category:')[1]
    
    if 'from Georgia' in nat:
        return 'Georgian'
    if 'from the Principality' in nat:
        return 'Liège'
    return nat.split(' painters')[0]

In [6]:
nationalities_cat_lst = []

for nat in  wiki_wiki.page("Category:Painters_by_nationality").categorymembers.values():
    nationalities_cat_lst.append(nat)
    
nationalities_cat_lst = nationalities_cat_lst[8:]

In [7]:
df_nationality = get_df_for_cat_value(value_cat_lst=nationalities_cat_lst, 
                                      clean_func=clean_nationality, 
                                      valueName='nationality')

100%|████████████████████████████████████████████████████████████████████████████████| 170/170 [05:19<00:00,  1.88s/it]


Number of pages before deleting duplicates:  131442


In [8]:
df_nationality

Unnamed: 0,page name,page,nationality
0,'Abd al-Hayy,"'Abd al-Hayy (id: ??, ns: 0)","[Persian, Iranian]"
1,'the other' Jan van Kessel,"'the other' Jan van Kessel (id: ??, ns: 0)",[Flemish]
2,108 (artist),"108 (artist) (id: ??, ns: 0)",[Italian]
3,3Steps,"3Steps (id: ??, ns: 0)",[German]
4,A. A. Raiba,"A. A. Raiba (id: ??, ns: 0)",[Indian]
...,...,...,...
37045,Živorad Nastasijević,"Živorad Nastasijević (id: ??, ns: 0)",[Serbian]
37046,Ștefan Câlția,"Ștefan Câlția (id: ??, ns: 0)",[Romanian]
37047,Ștefan Dimitrescu,"Ștefan Dimitrescu (id: ??, ns: 0)",[Romanian]
37048,Ștefan Luchian,"Ștefan Luchian (id: ??, ns: 0)",[Romanian]


## Dates and Other Info

In [9]:
def get_dates(page):
    cats = page.categories.keys()

    death_year, birth_year = None, None
    
    for c in cats:

        if 'births' in c:
            match = re.search(r'\d{4}', c) 
            if match: birth_year = match[0]

        if 'deaths' in c:
            match = re.search(r'\d{4}', c) 
            if match: death_year = match[0]

    return [birth_year, death_year, page.fullurl, list(cats)]

In [20]:

    births = []
    deaths = []
    urls = []
    categories_all = []


    for p in tqdm(df_nationality[21719:]['page']):
        b, d, u, cs = get_dates(p)

        births.append(b)
        deaths.append(d)
        urls.append(u)
        categories_all.append(cs)
        
        

100%|██████████████████████████████████████████████████████████████████████████| 15331/15331 [1:41:27<00:00,  2.52it/s]


In [21]:
len(births)

15331

## Other Info

# Save

In [11]:
def clean_title(title):
    if ' (' in title:
        return title.split(' (')[0]
    return title

In [22]:
columns = ['page', 'page name', 'painter name', 'url', 'birth', 'death', 'nationality', 'categories']
df = pd.DataFrame(columns=columns)


df['page'] = df_nationality['page']
df['page name'] = df_nationality['page name']
df['painter name'] = df_nationality['page name'].apply(clean_title)
df['url'] = urls
df['birth'] = births
df['death'] = deaths
df['nationality'] = df_nationality['nationality']
df['categories'] = categories_all


df

Unnamed: 0,page,page name,painter name,url,birth,death,nationality,categories
21719,"Katharina Rapp (id: 1628554, ns: 0)",Katharina Rapp,Katharina Rapp,https://en.wikipedia.org/wiki/Katharina_Rapp,1948,,[German],"[Category:1948 births, Category:20th-century G..."
21720,"Katharine Carl (id: 8544717, ns: 0)",Katharine Carl,Katharine Carl,https://en.wikipedia.org/wiki/Katharine_Carl,1865,1938,[American],"[Category:1865 births, Category:1938 deaths, C..."
21721,"Katharine Church (id: 56389959, ns: 0)",Katharine Church,Katharine Church,https://en.wikipedia.org/wiki/Katharine_Church,1910,1999,[British],"[Category:1910 births, Category:1999 deaths, C..."
21722,"Katharine Emma Maltwood (id: 10726642, ns: 0)",Katharine Emma Maltwood,Katharine Emma Maltwood,https://en.wikipedia.org/wiki/Katharine_Emma_M...,1878,1961,[British],"[Category:1878 births, Category:1961 deaths, C..."
21723,"Katharine Lane Weems (id: 11172863, ns: 0)",Katharine Lane Weems,Katharine Lane Weems,https://en.wikipedia.org/wiki/Katharine_Lane_W...,1899,1989,[American],"[Category:1899 births, Category:1989 deaths, C..."
...,...,...,...,...,...,...,...,...
37045,"Živorad Nastasijević (id: 62254857, ns: 0)",Živorad Nastasijević,Živorad Nastasijević,https://en.wikipedia.org/wiki/%C5%BDivorad_Nas...,1893,1966,[Serbian],"[Category:1893 births, Category:1966 deaths, C..."
37046,"Ștefan Câlția (id: 13409891, ns: 0)",Ștefan Câlția,Ștefan Câlția,https://en.wikipedia.org/wiki/%C8%98tefan_C%C3...,1942,,[Romanian],"[Category:1942 births, Category:All stub artic..."
37047,"Ștefan Dimitrescu (id: 6000470, ns: 0)",Ștefan Dimitrescu,Ștefan Dimitrescu,https://en.wikipedia.org/wiki/%C8%98tefan_Dimi...,1886,1933,[Romanian],"[Category:1886 births, Category:1933 deaths, C..."
37048,"Ștefan Luchian (id: 2475912, ns: 0)",Ștefan Luchian,Ștefan Luchian,https://en.wikipedia.org/wiki/%C8%98tefan_Luchian,1868,1917,[Romanian],"[Category:1868 births, Category:1917 deaths, C..."


In [23]:
df.to_csv('Data_GoogleArts_V1.csv')

## Load Test

In [2]:
df_tmp = pd.read_csv('Data_GoogleArts_V1_0_21719.csv', index_col=0)
df_tmp.head()

Unnamed: 0,page,page name,painter name,url,birth,death,nationality,categories
0,"'Abd al-Hayy (id: 39316086, ns: 0)",'Abd al-Hayy,'Abd al-Hayy,https://en.wikipedia.org/wiki/%27Abd_al-Hayy,1374.0,1405.0,"['Persian', 'Iranian']","['Category:1374 births', 'Category:1405 deaths..."
1,"'the other' Jan van Kessel (id: 52935555, ns: 0)",'the other' Jan van Kessel,'the other' Jan van Kessel,https://en.wikipedia.org/wiki/%27the_other%27_...,1620.0,,['Flemish'],"['Category:1620 births', 'Category:Animal arti..."
2,"108 (artist) (id: 18446241, ns: 0)",108 (artist),108,https://en.wikipedia.org/wiki/108_(artist),1978.0,,['Italian'],"['Category:1978 births', 'Category:20th-centur..."
3,"3Steps (id: 45689974, ns: 0)",3Steps,3Steps,https://en.wikipedia.org/wiki/3Steps,,,['German'],"['Category:Articles with hCards', 'Category:Ge..."
4,"A. A. Raiba (id: 5402895, ns: 0)",A. A. Raiba,A. A. Raiba,https://en.wikipedia.org/wiki/A._A._Raiba,1922.0,2016.0,['Indian'],"['Category:1922 births', 'Category:2016 deaths..."


In [32]:
wiki_wiki = wikipediaapi.Wikipedia('en',extract_format=wikipediaapi.ExtractFormat.WIKI)


"3Steps (pronounced:  θriː stɛps) is a German-based contemporary artist collective between the twins Kai Harald Krieger and Uwe Harald Krieger (born March 15, 1980) and Joachim Pitt (born December 8, 1980).\nThe works of 3Steps have developed from mural art, graffiti art and street art. The spray can is the central media of 3Steps. The collective paints huge images on facades and murals, as well as several kinds of paintable media in the studio. Bright colors and the reflection of a modern society express the intention of the three friends.\n\nIn November 2014 3Steps received the “Kultur- und Kreativpilot Deutschland” (Cultural and Creative pilot of Germany) award by the Federal Republic of Germany. 3Steps lives and works in the University town of Giessen, central Germany.\n\nDevelopment\n3Steps was founded in fall 1998. The style of the collective quickly developed from the classic “New York Style Writing” graffiti and high art graffiti to artistic short stories on large-scale murals.

In [20]:
df_tmp.at[0,'categories']

"['Category:1374 births', 'Category:1405 deaths', 'Category:All articles needing additional references', 'Category:All stub articles', 'Category:Articles needing additional references from July 2016', 'Category:Asian painter stubs', 'Category:Commons category link is on Wikidata', 'Category:Iranian artist stubs', 'Category:Persian miniature painters', 'Category:Wikipedia articles with ULAN identifiers', 'Category:Wikipedia articles with VIAF identifiers', 'Category:Wikipedia articles with WORLDCATID identifiers']"

In [31]:
re.search(str(int(df_tmp.at[1,'birth'])),wiki_wiki.page(df_tmp.at[1,'page name']).text).start()

47

In [1]:
import json

In [5]:
with open('output.json', 'w',encoding='utf-16') as f:
    for j,row in enumerate(df_tmp.iterrows()):
        d = {}
        e = {}
        d['docid'] = "Wikipedia-"+str(j).zfill(5)
        d['doctext'] = wiki_wiki.page(row[1]['page name']).summary
        if np.isnan(row[1]['birth']):
            e['birth'] = []
        else:
            b = re.search(str(int(row[1]['birth'])),d['doctext'])
            if b is not None:
                e['birth'] = [[str(int(row[1]['birth'])),b.start()]]
            else:
                e['birth'] = []
         
        if np.isnan(row[1]['death']):
            e['death'] = []
        else:
            de = re.search(str(int(row[1]['death'])),d['doctext'])
            if de is not None:
                e['death'] = [[str(int(row[1]['death'])),de.start()]]
            else:
                e['death'] = []
            
        l = row[1]['nationality']
        l = l.strip('][').split(", ")
        l_out = []
        for i in l: 
            i = i.strip("'")
            n = re.search(i,d['doctext'])
            if n is not None: 
                l_out.append([i,n.start()])
        e['nationality'] = l_out
        d["extracts"] = e            
        json.dump(d,f,ensure_ascii=False)
        f.write("\n")
        if j%100 == 0:
            print(j)
        

NameError: name 'wiki_wiki' is not defined

In [11]:
df_tmp = pd.read_csv('Data_GoogleArts_V1_0_21719.csv', index_col=0)
names = df_tmp['painter name'].to_numpy()
births = df_tmp['birth'].to_numpy()
deaths = df_tmp['death'].to_numpy()
nationalities = df_tmp['nationality'].to_numpy()
g = open('train.json','w',encoding='utf-8')
dev = open('dev.json','w',encoding='utf-8')
test = open('test.json','w',encoding='utf-8')
m = len(l)
a = np.array([0]*int(0.95*m)+[1]*int(0.05*m)+[2]*int(0.05*m))
np.random.shuffle(a)
with open('output.json', 'r',encoding='utf-16') as f:
    c = 0
    for j,i in enumerate(f):
        name  = names[j]
        birth = births[j]
        death = deaths[j]
        nationality = nationalities[j]
        c+=1
        d = json.loads(i)
        for k in d['extracts'].keys():
            if d['extracts'][k]:
                d['extracts'][k] = [d['extracts'][k]]
        n = re.search(name,d['doctext'])
        d["docid"] = "Wikipedia-"+d["docid"].split("-")[-1]
        if n is not None:
            d['extracts']['painter_name'] = [[[name,n.start()]]]
        else:
            d['extracts']['painter_name'] = [[[name,0]]]
        
        if not d['extracts']['birth'] and not np.isnan(birth):
            d['extracts']['birth'] = [[[str(int(birth)),0]]]
    
        if not d['extracts']['death'] and not np.isnan(death):
            d['extracts']['death'] = [[[str(int(death)),0]]]    
        
        if not d['extracts']['nationality']:
            d['extracts']['nationality'] = [[[ind.strip("'"),0] for ind in nationality.strip('][').split(", ")]]
        else:
            for nat in nationality.strip('][').split(", "):
                if not nat.strip("'") in [ind for ind,_ in d['extracts']['nationality'][0]]:
                    print("cc",d["docid"])
                    d['extracts']['nationality'][0].append([nat.strip("'"),0]) 

        di = {}
        di['1_painter_name'] = d['extracts']['painter_name']
        di['2_birth'] = d['extracts']['birth']
        di['3_death'] = d['extracts']['death']
        di['4_nationality'] = d['extracts']['nationality']
        d['extracts'] = di
        if c>=m:
            break
        if a[c]==0:
            json.dump(d,g,ensure_ascii=False)
            g.write("\n")
        elif a[c]==1:
            json.dump(d,dev,ensure_ascii=False)
            dev.write("\n")
        else:
            json.dump(d,test,ensure_ascii=False)
            test.write("\n")
            
g.close()
dev.close()
test.close()

cc Wikipedia-00000
cc Wikipedia-00057
cc Wikipedia-00057
cc Wikipedia-00057
cc Wikipedia-00058
cc Wikipedia-00062
cc Wikipedia-00071
cc Wikipedia-00102
cc Wikipedia-00119
cc Wikipedia-00132
cc Wikipedia-00137
cc Wikipedia-00169
cc Wikipedia-00169
cc Wikipedia-00175
cc Wikipedia-00203
cc Wikipedia-00225
cc Wikipedia-00261
cc Wikipedia-00263
cc Wikipedia-00283
cc Wikipedia-00297
cc Wikipedia-00358
cc Wikipedia-00393
cc Wikipedia-00429
cc Wikipedia-00429
cc Wikipedia-00503
cc Wikipedia-00529
cc Wikipedia-00555
cc Wikipedia-00555
cc Wikipedia-00581
cc Wikipedia-00608
cc Wikipedia-00616
cc Wikipedia-00624
cc Wikipedia-00651
cc Wikipedia-00679
cc Wikipedia-00788
cc Wikipedia-00826
cc Wikipedia-00832
cc Wikipedia-00847
cc Wikipedia-00847
cc Wikipedia-00878
cc Wikipedia-00951
cc Wikipedia-00967
cc Wikipedia-00975
cc Wikipedia-00977
cc Wikipedia-00980
cc Wikipedia-00997
cc Wikipedia-00998
cc Wikipedia-01036
cc Wikipedia-01038
cc Wikipedia-01045
cc Wikipedia-01047
cc Wikipedia-01066
cc Wikipedia

cc Wikipedia-11183
cc Wikipedia-11186
cc Wikipedia-11191
cc Wikipedia-11220
cc Wikipedia-11227
cc Wikipedia-11241
cc Wikipedia-11257
cc Wikipedia-11260
cc Wikipedia-11260
cc Wikipedia-11260
cc Wikipedia-11281
cc Wikipedia-11300
cc Wikipedia-11308
cc Wikipedia-11342
cc Wikipedia-11353
cc Wikipedia-11397
cc Wikipedia-11397
cc Wikipedia-11410
cc Wikipedia-11410
cc Wikipedia-11426
cc Wikipedia-11438
cc Wikipedia-11445
cc Wikipedia-11465
cc Wikipedia-11490
cc Wikipedia-11497
cc Wikipedia-11547
cc Wikipedia-11553
cc Wikipedia-11603
cc Wikipedia-11604
cc Wikipedia-11607
cc Wikipedia-11608
cc Wikipedia-11618
cc Wikipedia-11643
cc Wikipedia-11684
cc Wikipedia-11696
cc Wikipedia-11733
cc Wikipedia-11764
cc Wikipedia-11766
cc Wikipedia-11768
cc Wikipedia-11800
cc Wikipedia-11825
cc Wikipedia-11826
cc Wikipedia-11845
cc Wikipedia-11856
cc Wikipedia-11858
cc Wikipedia-11864
cc Wikipedia-11871
cc Wikipedia-11876
cc Wikipedia-11931
cc Wikipedia-12002
cc Wikipedia-12013
cc Wikipedia-12060
cc Wikipedia

cc Wikipedia-19529
cc Wikipedia-19541
cc Wikipedia-19569
cc Wikipedia-19582
cc Wikipedia-19610
cc Wikipedia-19640
cc Wikipedia-19640
cc Wikipedia-19669
cc Wikipedia-19872
cc Wikipedia-19958
cc Wikipedia-19974
cc Wikipedia-19993
cc Wikipedia-19995
cc Wikipedia-20098
cc Wikipedia-20105
cc Wikipedia-20143
cc Wikipedia-20163
cc Wikipedia-20224
cc Wikipedia-20234
cc Wikipedia-20255
cc Wikipedia-20273
cc Wikipedia-20333
cc Wikipedia-20333
cc Wikipedia-20368
cc Wikipedia-20403
cc Wikipedia-20591
cc Wikipedia-20608
cc Wikipedia-20654
cc Wikipedia-20694
cc Wikipedia-20696
cc Wikipedia-20707
cc Wikipedia-20747
cc Wikipedia-20747
cc Wikipedia-20747
cc Wikipedia-20750
cc Wikipedia-20797
cc Wikipedia-20867
cc Wikipedia-20874
cc Wikipedia-20877
cc Wikipedia-20883
cc Wikipedia-20904
cc Wikipedia-21020
cc Wikipedia-21020
cc Wikipedia-21033
cc Wikipedia-21048
cc Wikipedia-21110
cc Wikipedia-21130
cc Wikipedia-21132
cc Wikipedia-21161
cc Wikipedia-21166
cc Wikipedia-21180
cc Wikipedia-21183
cc Wikipedia

In [8]:
[i for i in ['Malaysian']]

['Malaysian']

In [15]:
l_nationality = []
with open('train.json','r',encoding='utf-8') as f:
    for line in f:
        d = json.loads(line)
        n = d['extracts']['nationality']
        if n:
            if not n[0][0][0] in l_nationality:
                l_nationality.append(n[0][0][0])

In [37]:
preds = []
with open('test.json','r',encoding='utf-8') as f:
    for line in f:
        d = json.loads(line)
        t = d["doctext"].split(" ")
        name = []
        n_c = True
        nats = []
        birth = 0
        death = 0
        for word in t:
            if word:
                if n_c:
                    if not (word in ["is","was"] or "(" in word):
                        name.append(word)
                    else:
                        n_c = False

                if not n_c:
                    if word[0] == "(":
                        word = word[1:]
                    if word and word[-1] == ")":
                        word = word[:-1]
                    if word in l_nationality and not word in nats:
                        nats.append(word)
                    if word.isnumeric() and len(word)==4:
                        if birth == 0:
                            birth = int(word)
                        elif death == 0:
                            death = int(word)
        pred = {}
        pred["name"] = " ".join(name)
        pred["nationality"] = nats
        pred["birth"] = birth
        pred["death"] = death
        r = 0
        tp = 0
        fp = 0
        if d["extracts"]["painter_name"]:
            r +=1
            if d["extracts"]["painter_name"][0][0][0] in pred["name"] or pred["name"] in d["extracts"]["painter_name"][0][0][0]:
                u =  len(pred["name"])/len(d["extracts"]["painter_name"][0][0][0])
                if u>1:
                    tp+=1/u
                else:
                    tp +=u
        else:
            if pred["name"]:
                fp += 1 
        if d["extracts"]["nationality"]:
            r +=len(d["extracts"]["nationality"][0])
            wolo = []
            for i in range(len(d["extracts"]["nationality"][0])):
                wolo.append(d["extracts"]["nationality"][0][i][0])
                if d["extracts"]["nationality"][0][i][0] in pred["nationality"]:
                    tp+=1
            fp+=max(0,len(pred["nationality"])-len(d["extracts"]["nationality"][0][i]))
        else:
            fp += len(pred["nationality"])
        if d["extracts"]["birth"]:
            r+=1
            if int(d["extracts"]["birth"][0][0][0]) == int(pred["birth"]):
                tp+=1
        elif not pred["birth"] == 0:
            fp +=1
        
        if d["extracts"]["death"]:
            r+=1
            if int(d["extracts"]["death"][0][0][0]) == int(pred["death"]):
                tp+=1
        elif not pred["death"] == 0:
            fp +=1
        fn = r-tp
        pred["prec"] = [tp,fn,fp,tp/(tp+0.5*(fp+fn))]
        preds.append(pred)   
            
            
        

In [39]:
f1_i = 0
f1_cum = 0
tp = 0 
fn = 0 
fp = 0
N = len(preds)
for i in preds:
    f1_i += pred["prec"][-1]
    a , b , c = pred["prec"][:-1]
    tp +=  a
    fn +=  b
    fp +=  c
f1_i/=N
f1_cum = tp/(tp+0.5*(fp+fn))

In [40]:
print(f1_i,f1_cum)

0.6666666666666623 0.6666666666666666
