In [13]:
import re

file_path = './archive/EMEL_output.txt'

# Token classifiers. A token is bucketed as English when it consists solely of
# unaccented Latin letters; otherwise it is bucketed as French when it consists
# solely of Latin letters plus the accented characters listed below.
english_pattern = re.compile(r'^[a-zA-Z]+$')
french_pattern = re.compile(r'^[a-zA-Zàâçéèêëîïôûùÿñ]+$')

english_words = []
french_words = []

try:
    # One list of whitespace-separated tokens per line of the file.
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = [raw.strip().split() for raw in file]

    # The English test runs first, so an unaccented token never reaches the
    # French test. Tokens matching neither pattern are dropped.
    for sentence in lines:
        for word in sentence:
            if english_pattern.match(word):
                bucket = english_words
            elif french_pattern.match(word):
                bucket = french_words
            else:
                continue
            bucket.append(word)

except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
    # Any other failure is reported rather than raised.
    print(f"An error occurred: {e}")

import string
from nltk.corpus import stopwords

def is_english(word):
    """Return True when *word* is a non-empty run of unaccented Latin letters."""
    found = re.match(r'^[a-zA-Z]+$', word)
    return found is not None

def is_french(word):
    """Return True when *word* consists only of Latin letters and French accents.

    The whole token must be made of characters from a single class (ASCII
    letters plus the accented lower-case letters below). The earlier pattern
    used a top-level ``|``, which split it into "starts with accented
    letters" OR "ends with ASCII letters"; that rejected mixed words such as
    "français" and accepted tokens such as "été!!!".

    Args:
        word: The token to test.

    Returns:
        bool: True if every character of the token is in the accepted class
        and the token is non-empty, otherwise False.
    """
    return bool(re.match(r'^[a-zA-Zàâçéèêëîïôûùÿñ]+$', word))

# Fetch the NLTK stop-word corpus before it is queried below.
import nltk
nltk.download('stopwords')

# Corpus to process.
file_path = './archive/EMEL_output.txt'

# Stop-word lookup sets, one per language.
english_stopwords, french_stopwords = (
    set(stopwords.words(language)) for language in ('english', 'french')
)

# Tokenise the corpus: one list of whitespace-separated tokens per line.
with open(file_path, 'r', encoding='utf-8') as file:
    lines = list(map(str.split, file))

# Accumulators for the cleaned, stop-word-free tokens of each language.
filtered_english, filtered_french = [], []

def clean_word(word, stopwords_set):
    """Normalise *word* and reject it if it is empty or a stop word.

    Every character in ``string.punctuation`` is removed and the result is
    lower-cased.

    Args:
        word: Raw token to clean.
        stopwords_set: Container of stop words; the cleaned (lower-cased)
            token is looked up in it.

    Returns:
        The cleaned token, or ``None`` when nothing is left after stripping
        punctuation or the cleaned token is a stop word.
    """
    # str.translate deletes each punctuation character literally. Placing
    # string.punctuation unescaped inside a regex character class makes its
    # backslash escape the following "]", so backslashes were never removed.
    word = word.translate(str.maketrans('', '', string.punctuation)).lower()
    if word and word not in stopwords_set:
        return word
    return None

# Route each token to its language bucket, keeping only tokens that survive
# cleaning (non-empty and not a stop word for that language).
# NOTE: is_english is tested first, so any token it accepts never reaches the
# French branch; tokens accepted by neither check are skipped.
for sentence in lines:
    for word in sentence:
        if is_english(word):
            target, stop_set = filtered_english, english_stopwords
        elif is_french(word):
            target, stop_set = filtered_french, french_stopwords
        else:
            continue
        cleaned_word = clean_word(word, stop_set)
        if cleaned_word:
            target.append(cleaned_word)

# Show both result lists.
print("Filtered English Words:", filtered_english)
print("Filtered French Words:", filtered_french)

Filtered English Words: ['elimination', 'lexical', 'ambiguities', 'modern', 'presentation', 'elag', 'system', 'notre', 'dans', 'le', 'cadre', 'de', 'des', 'entrepris', 'en', 'collaboration', 'entre', 'les', 'laboratoires', 'de', 'linguistique', 'informatique', 'de', 'aristote', 'de', 'thessaloniki', 'et', 'de', 'de', 'visant', 'une', 'description', 'et', 'de', 'la', 'langue', 'nous', 'le', 'lexicales', 'elag', 'et', 'mis', 'jour', 'et', 'automatique', 'de', 'textes', 'le', 'programme', 'la', 'sur', 'du', 'texte', 'en', 'utilisant', 'des', 'grammaires', 'qui', 'imposent', 'des', 'contraintes', 'grammaticales', 'sur', 'les', 'mots', 'en', 'fonction', 'de', 'leur', 'contexte', 'dans', 'les', 'initialement', 'pour', 'le', 'le', 'programme', 'pour', 'pouvoir', 'avec', 'des', 'langues', 'dans', 'unitex', 'nous', 'exposons', 'le', 'formalisme', 'permettant', 'les', 'contraintes', 'grammaticales', 'par', 'de', 'graphes', 'unitex', 'et', 'nous', 'les', 'de', 'ces', 'grammaires', 'de', 'dans', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shaunak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
import gensim.downloader as api

# Load the pretrained embeddings through the gensim downloader.
# NOTE(review): "fasttext-wiki-news-subwords-300" appears to be trained on
# English corpora — confirm it is the intended model for the French tokens.
english_model = api.load("word2vec-google-news-300")
french_model = api.load("fasttext-wiki-news-subwords-300")

# Look up an embedding for every filtered English token; tokens the model
# rejects with KeyError are reported and left out of the mapping.
english_vectors = {}
for word in filtered_english:
    try:
        embedding = english_model[word]
    except KeyError:
        print(f"Word '{word}' not found in English model.")
    else:
        english_vectors[word] = embedding

# Same lookup for the filtered French tokens.
french_vectors = {}
for word in filtered_french:
    try:
        embedding = french_model[word]
    except KeyError:
        print(f"Word '{word}' not found in French model.")
    else:
        french_vectors[word] = embedding

# Preview the first five components of each stored vector.
for heading, vectors in (
    ("English Vectors:", english_vectors),
    ("\nFrench Vectors:", french_vectors),
):
    print(heading)
    for word, vector in vectors.items():
        print(f"{word}: {vector[:5]}...")

[--------------------------------------------------] 0.1% 2.1/1662.8MB downloaded

KeyboardInterrupt: 