In [1]:
import pandas as pd 

In [2]:
import nltk
from nltk.corpus import wordnet as wn

In [17]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /home/prince/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/prince/nltk_data...


True

In [36]:
import requests
from bs4 import BeautifulSoup

url = "https://www.fluentin3months.com/french-cognates/"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')


In [37]:
list_div = soup.find('div', {'class': 'entry-content'})
list_items = list_div.find_all('li')


In [41]:
french_cognates = []
for item in list_items:
    strong_tag = item.find('strong')
    if strong_tag:
        french_cognates.append(strong_tag.text)


In [44]:
french_cognates

['l’accélération',
 'action',
 'addiction',
 'ambition',
 'attention',
 'celebration',
 'champion',
 'communication',
 'condition',
 'conversation',
 'donation',
 'diction',
 'édition',
 'election',
 'equation',
 'emotion',
 'érosion',
 'fiction',
 'function',
 'génération',
 'infection',
 'l’information',
 'injection',
 'invasion',
 'legion',
 'motion',
 'notion',
 'nation',
 'omission',
 'option',
 'passion',
 'population',
 'présentation',
 'prononciation',
 'question',
 'ration',
 'reflection',
 'religion',
 'réservation',
 'région',
 'rotation',
 'session',
 'station',
 'solution',
 'transaction',
 'animal',
 'central',
 'national',
 'final',
 'international',
 'mental',
 'vertical',
 'adorable',
 'admirable',
 'applicable',
 'cable',
 'capable',
 'double',
 'durable',
 'favorable',
 'habitable',
 'incurable',
 'identifiable',
 'improbable',
 'inséparable',
 'justifiable',
 'notable',
 'recyclable',
 'respectable',
 'sociable',
 'table',
 'vulnérable',
 'horrible',
 'flexible',
 '

In [50]:
weights = pd.read_csv("./results/hlr.settles.acl16.learning_traces.13m.weights",delimiter='\t',skiprows=3,header=None)

In [60]:
weights.columns = ['lexemes','weights']

In [62]:
weights.head()

Unnamed: 0,lexemes,weights
0,es:desde/desde<pr>,0.1707
1,es:como/comer<vblex><pri><p1><sg>,0.1708
2,de:kinder/kind<n><nt><pl><nom>,0.1297
3,en:strawberry/strawberry<n><sg>,-0.0585
4,en:to/to<pr>,0.0324


In [64]:


# Function to split the column into four parts
def split_column(row):
    parts = row['lexemes'].split(':')
    row['lang'] = parts[0] 
    parts = parts[1].split('/')
    row['surface_form'] = parts[0]
    parts = parts[1].split('<')
    row['root'] = parts[0]
    row['tags'] = parts[1:]
    return row   

# Apply the function to the dataframe and drop the original column
df = weights.apply(split_column, axis=1).drop('lexemes', axis=1)




       weights lang surface_form        root                            tags
0       0.1707   es        desde       desde                           [pr>]
1       0.1708   es         como       comer        [vblex>, pri>, p1>, sg>]
2       0.1297   de       kinder        kind            [n>, nt>, pl>, nom>]
3      -0.0585   en   strawberry  strawberry                       [n>, sg>]
4       0.0324   en           to          to                           [pr>]
...        ...  ...          ...         ...                             ...
19274   0.0000   fr   conférence  conférence                   [n>, f>, sg>]
19275   0.0000   fr        liens        lien                   [n>, m>, pl>]
19276   0.0000   pt          foi          ir        [vblex>, ifi>, p3>, sg>]
19277   0.0000   de        <*sf>      heißen  [vblex>, pri>, *pers>, *numb>]
19278   0.0000   fr    compagnie   compagnie                   [n>, f>, sg>]

[19279 rows x 5 columns]


In [90]:
def handle_sf(row):
    if row.surface_form == '<*sf>':
        row.surface_form = row.root
    return row

In [92]:
df = df.apply(handle_sf,axis=1)

In [65]:
df.head()

Unnamed: 0,weights,lang,surface_form,root,tags
0,0.1707,es,desde,desde,[pr>]
1,0.1708,es,como,comer,"[vblex>, pri>, p1>, sg>]"
2,0.1297,de,kinder,kind,"[n>, nt>, pl>, nom>]"
3,-0.0585,en,strawberry,strawberry,"[n>, sg>]"
4,0.0324,en,to,to,[pr>]


In [94]:
french_weights = df[df.lang == 'fr']

In [100]:
french_lexemes = set(french_weights.surface_form) 

In [101]:
french_cognates = french_lexemes & set(french_cognates)

In [102]:
len(french_cognates)

82

In [103]:
french_non_cognates = french_lexemes - set(french_cognates)

In [104]:
len(french_non_cognates)

2654

In [107]:
length = df['surface_form'].apply(lambda x : len(x))

In [112]:
df[length < 6]

Unnamed: 0,weights,lang,surface_form,root,tags
0,0.1707,es,desde,desde,[pr>]
1,0.1708,es,como,comer,"[vblex>, pri>, p1>, sg>]"
4,0.0324,en,to,to,[pr>]
5,-0.0578,en,am,be,"[vbser>, pri>, p1>, sg>]"
6,0.1622,en,on,on,[pr>]
...,...,...,...,...,...
19257,0.0000,pt,banda,banda,"[n>, f>, sg>]"
19259,0.0000,de,feld,feld,"[n>, nt>, *numb>, *case>]"
19273,0.0000,pt,ajuda,ajudar,"[vblex>, pri>, p3>, sg>]"
19275,0.0000,fr,liens,lien,"[n>, m>, pl>]"


In [113]:
df[length >= 6]

Unnamed: 0,weights,lang,surface_form,root,tags
2,0.1297,de,kinder,kind,"[n>, nt>, pl>, nom>]"
3,-0.0585,en,strawberry,strawberry,"[n>, sg>]"
8,0.1148,en,elephants,elephant,"[n>, pl>]"
15,0.3490,en,thanks,thanks,[ij>]
20,0.2906,fr,poisson,poisson,"[n>, m>, sg>]"
...,...,...,...,...,...
19271,0.0000,it,pescare,pescare,"[vblex>, inf>]"
19272,0.0000,pt,conhecemos,conhecer,"[vblex>, pri>, p1>, pl>]"
19274,0.0000,fr,conférence,conférence,"[n>, f>, sg>]"
19277,0.0000,de,heißen,heißen,"[vblex>, pri>, *pers>, *numb>]"
