<a href="https://colab.research.google.com/github/s-fellner/BSA_Rolex/blob/main/Code/Cognates_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cognates dictionary building

To build our own cognate dictionary, we queried BigQuery for candidate words and then lemmatized them. This dictionary will be used to detect French/English cognates in sentences and to build an additional feature for our training data.

In [None]:
# Installing what is needed
!pip install spacy download fr
!spacy download fr_core_news_sm

Collecting download
  Downloading https://files.pythonhosted.org/packages/37/45/01e7455a9659528e77a414b222326d4c525796e4f571bbabcb2e0ff3d1f4/download-0.3.5-py3-none-any.whl
Collecting fr
  Downloading https://files.pythonhosted.org/packages/20/f9/c7fd6082d31fca3f9797f95ccf137ac6691ee9742e964e4483bbf480d13f/fr-3.0-py2.py3-none-any.whl
Installing collected packages: download, fr
Successfully installed download-0.3.5 fr-3.0
Collecting fr_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.2.5/fr_core_news_sm-2.2.5.tar.gz (14.7MB)
[K     |████████████████████████████████| 14.7MB 5.8MB/s 
Building wheels for collected packages: fr-core-news-sm
  Building wheel for fr-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for fr-core-news-sm: filename=fr_core_news_sm-2.2.5-cp37-none-any.whl size=14727027 sha256=f788a3a04d66b5b7a08dc0c1b96529c3ab570be0a9d059344c87add4cc085f3b
  Stored in directory: /tmp/pip-ephem-wheel-c

In [None]:
# Loading every module needed :
import numpy as np
import pandas as pd
import re 
import string

In [None]:
# Import spacy for lemmatization
import spacy
# Load the small French pipeline fetched in the install cell; `sp` is the
# callable that the tokenizer function applies to each sentence.
sp = spacy.load('fr_core_news_sm')

In [None]:
# Setting credential for BigQuery
from google.colab import auth

# Authenticate the current Colab user; presumably the BigQuery client created
# afterwards picks up these credentials — TODO confirm against the Colab docs.
auth.authenticate_user()
print("Authenticated")

In [None]:
from google.cloud import bigquery

# Project id handed to the BigQuery client
PROJECT_ID = "assignementsimon"

# Instantiate the client object used to submit queries
client = bigquery.Client(project=PROJECT_ID)

In [None]:
# Query (taken from our Big Scale assignment).
#
# For each language table, the n-grams are NFD-normalized and stripped of
# combining marks (\pM), and term frequencies are summed per n-gram group.
# The French and English results are then inner-joined on identical spelling,
# keeping rows whose absolute frequency difference is below 1,000,000,
# ordered by English frequency (descending) and capped at 500,000 rows.
#
# The Python string is raw (r"""...""") so that the backslash in the SQL regex
# `\pM` is passed through verbatim instead of being an invalid escape sequence.
q5 = r"""
WITH norm_fr AS (
  SELECT REGEXP_REPLACE(NORMALIZE(ngram, NFD), r"\pM", '') AS ngram, SUM(term_frequency) AS tf
  FROM bigquery-public-data.words.fre_1gram
  GROUP BY ngram
), norm_en AS (
  SELECT REGEXP_REPLACE(NORMALIZE(ngram, NFD), r"\pM", '') AS ngram, SUM(term_frequency) AS tf
  FROM bigquery-public-data.words.eng_gb_1gram
  GROUP BY ngram
)

SELECT norm_fr.ngram AS ngram, norm_fr.tf AS FR_term_frequency, norm_en.tf AS EN_term_frequency, ABS(norm_en.tf-norm_fr.tf) AS delta
FROM norm_fr
INNER JOIN norm_en ON norm_en.ngram = norm_fr.ngram
WHERE ABS(norm_en.tf-norm_fr.tf) < 1000000
ORDER BY EN_term_frequency DESC
LIMIT 500000
"""

# Run the query and materialise the result as a DataFrame
cognates_query_df = client.query(q5).to_dataframe()

# Older name kept as an alias so any code expecting `query_job_5` to hold the
# resulting DataFrame keeps working
query_job_5 = cognates_query_df

# Export the raw query result to CSV (without the index column)
cognates_query_df.to_csv(r'/content/COG.csv', index=False)

In [None]:
# Loading the raw dictionary (the response of the request above was saved in our git)
cog_df = pd.read_csv("https://raw.githubusercontent.com/s-fellner/BSA_Rolex/main/Data/Cognates.csv", sep=',')

# Only the `ngram` column is needed; the frequency columns are ignored
cog = cog_df['ngram'].to_list()

# Keep only genuine words:
#  - non-string entries (e.g. the float NaN pandas uses for missing cells) are skipped,
#    which also avoids a TypeError from the regex on non-string values
#  - entries containing at least one digit are discarded
cognates = [
  word for word in cog
  if isinstance(word, str) and re.search('[0-9]', word) is None
]

# Show the size and a short preview instead of dumping the whole list
print(len(cognates))
cognates[:20]

['main',
 'population',
 'centre',
 'services',
 'prince',
 'relations',
 'date',
 'questions',
 'nation',
 'impossible',
 'decision',
 'passage',
 'image',
 'causes',
 'protection',
 'consequence',
 'resistance',
 'continue',
 'classes',
 'police',
 'doctrine',
 'absence',
 'actions',
 'institutions',
 'opposition',
 'effort',
 'details',
 'u',
 'interpretation',
 'definition',
 'theatre',
 'composition',
 'substance',
 'credit',
 'conference',
 'determine',
 'transport',
 'extreme',
 'attitude',
 'fortune',
 'notion',
 'silence',
 'grave',
 'maximum',
 'procedure',
 'von',
 'representation',
 'joseph',
 'plans',
 'instrument',
 'reduction',
 'impression',
 'intention',
 'resolution',
 'publication',
 'decisions',
 'imagination',
 'images',
 'tend',
 'hotel',
 'satisfaction',
 'canada',
 'opinions',
 'preparation',
 'institution',
 'depend',
 'berlin',
 'vice',
 'perspective',
 'supreme',
 'instruments',
 'minimum',
 'phrase',
 'courage',
 'don',
 'extension',
 'discipline',
 'contrib

In [None]:
# French stop words shipped with spaCy. The submodule is imported explicitly so
# this cell does not depend on `spacy.lang.fr` having been loaded as a side
# effect of `spacy.load(...)`.
from spacy.lang.fr.stop_words import STOP_WORDS
stop_words = STOP_WORDS

# ASCII punctuation characters, held as one string
punctuations = string.punctuation

def spacy_tokenizer(sentence):
  """Turn a sentence into a list of cleaned, lowercase lemmas.

  The sentence is run through the `sp` pipeline. Each token is replaced by its
  lowercased, stripped lemma, except when the lemma is the "-PRON-" placeholder,
  in which case the token's own lowercase text is used. Entries found in
  `stop_words` or in `punctuations`, and entries of two characters or fewer,
  are dropped.

  Args:
    sentence: text to process.

  Returns:
    The remaining lemmas as a list of strings, in token order.
  """
  kept = []
  for tok in sp(sentence):
    # "-PRON-" is a placeholder lemma, so fall back to the surface form
    if tok.lemma_ == "-PRON-":
      normalized = tok.lower_
    else:
      normalized = tok.lemma_.lower().strip()

    # Skip stop words and punctuation
    if normalized in stop_words or normalized in punctuations:
      continue

    # Skip very short entries (one or two characters)
    if len(normalized) > 2:
      kept.append(normalized)

  return kept

In [None]:
# Lemmatized, de-duplicated cognates (first-seen order is preserved)
cognates_proc = []

# Lemmas already stored, kept in a set so the duplicate check does not have to
# scan the whole result list on every iteration
seen_lemmas = set()

# Iteration counter
x = 0

# Checkpoint interval: save and report progress every `seuil` processed words
seuil = 1000

# Lemmatize each raw cognate
for raw_word in cognates:
  token = spacy_tokenizer(raw_word)
  x += 1

  # If the tokenizer kept something and its first lemma is new, add it
  if token and token[0] not in seen_lemmas:
    seen_lemmas.add(token[0])
    cognates_proc.append(token[0])

  # At each checkpoint, overwrite the CSV with the current state so a long run
  # can be recovered, and print a single progress line instead of one per word
  if x % seuil == 0:
    cognatesdf = pd.DataFrame({'cognates': cognates_proc})
    cognatesdf.to_csv('/content/drive/MyDrive/ProjetBSA/cognates_proc.csv', encoding='utf-8', index=False)
    print(x, ' itérations')

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
344075  itérations
blandos
 ['blando']
344076  itérations
continuee
 ['continuee']
+++
344077  itérations
granam
 ['granam']
+++
344078  itérations
awel
 ['awel']
+++
344079  itérations
voiska
 ['voiska']
+++
344080  itérations
italum
 ['italum']
+++
344081  itérations
jaeren
 ['jaeren']
+++
344082  itérations
predelli
 ['predelli']
+++
344083  itérations
excudent
 ['excudent']
+++
344084  itérations
gennem
 ['gennem']
+++
344085  itérations
chrysaniline
 ['chrysaniline']
+++
344086  itérations
dialectological
 ['dialectological']
+++
344087  itérations
artesanato
 ['artesanato']
+++
344088  itérations
habomai
 ['habomai']
+++
344089  itérations
vernments
 ['vernment']
344090  itérations
souerain
 ['souerain']
+++
344091  itérations
doncel
 ['doncel']
+++
344092  itérations
vespero
 ['vespero']
+++
344093  itérations
heathorn
 ['heathorn']
+++
344094  itérations
oliveria
 ['oliveria']
+++
344095

In [None]:
# Final export: wrap the processed cognates in a single-column DataFrame
# and write it to the CSV file (UTF-8, no index column)
cognatesdf = pd.DataFrame({'cognates': cognates_proc})
cognatesdf.to_csv('/content/drive/MyDrive/ProjetBSA/cognates_proc.csv', encoding='utf-8', index=False)