# Romanian to Sign Language

#### Imports

In [None]:
import aiohttp
import asyncio
import concurrent
import cv2
import os
import mediapipe as mp
import numpy as np
import re
import requests
import ssl
import time

from aiohttp import TCPConnector
from bs4 import BeautifulSoup
from mediapipe.framework.formats import landmark_pb2
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
from natsort import os_sorted

#### Globals

In [None]:
# --- Capture settings ---------------------------------------------------------
# NOTE(review): cam_idx, api_pref, width and height are not referenced by any
# other cell visible in this notebook — presumably consumed by a recording
# cell elsewhere; confirm before removing.

# USB webcam
cam_idx = 0

# Recording is done on Ubuntu
api_pref = cv2.CAP_V4L2

# Target frame resolution
width = 640
height = 480

# --- Dataset locations --------------------------------------------------------
# scraped_dataset_path: raw videos, walked recursively by the processing cell.
# output_path: process_video writes into its 'videos' and 'landmarks' subfolders.
# landmarks_path: directory whose file names form the NLP vocabulary.
scraped_dataset_path = os.path.join('..', 'scraped', 'pesemne')
output_path = os.path.join('..', 'scraped', 'pesemne_processed')
landmarks_path = os.path.join('assets', 'landmarks')


# --- MediaPipe landmarker options ---------------------------------------------
# Both landmarkers run in VIDEO mode, so callers must use detect_for_video()
# and supply a timestamp (in milliseconds) for every frame.

# Hand landmarker: up to two hands, with a 0.75 threshold for detection,
# presence and tracking.
hands_base_options = python.BaseOptions(model_asset_path=os.path.join('assets', 'mp_models', 'hand_landmarker.task'))
hands_options = vision.HandLandmarkerOptions(base_options=hands_base_options,
                                       running_mode=mp.tasks.vision.RunningMode.VIDEO,
                                       num_hands=2,
                                       min_hand_detection_confidence=0.75,
                                       min_hand_presence_confidence=0.75,
                                       min_tracking_confidence=0.75)

# Pose landmarker ("full" model variant) with a 0.5 threshold for detection,
# presence and tracking.
pose_base_options = python.BaseOptions(model_asset_path=os.path.join('assets', 'mp_models', 'pose_landmarker_full.task'))
pose_options = vision.PoseLandmarkerOptions(base_options=pose_base_options,
                                       running_mode=mp.tasks.vision.RunningMode.VIDEO,
                                       min_pose_detection_confidence=0.5,
                                       min_pose_presence_confidence=0.5,
                                       min_tracking_confidence=0.5)

#### Helper functions

In [None]:
def get_pose_landmarks(results):
    """Convert a pose detection result into a ``NormalizedLandmarkList``.

    Every entry of ``results.pose_landmarks`` is converted (x, y, z and
    visibility are copied), but only the last converted pose is returned.

    Returns:
        The converted landmark list, or ``None`` when no pose was detected.
    """
    converted = None

    for detected_pose in results.pose_landmarks:
        points = [
            landmark_pb2.NormalizedLandmark(x=pt.x, y=pt.y, z=pt.z, visibility=pt.visibility)
            for pt in detected_pose
        ]

        proto = landmark_pb2.NormalizedLandmarkList()
        proto.landmark.extend(points)

        # Later poses overwrite earlier ones: the last one wins.
        converted = proto

    return converted


def get_hands_landmarks(results):
    """Split a hand detection result into (left, right) landmark lists.

    ``results.handedness`` and ``results.hand_landmarks`` are matched by index.
    The signer is assumed to have exactly one left and one right hand, so when
    two detections claim the same side, the one with the higher handedness
    score keeps that side and the other is reassigned to the opposite side.

    Returns:
        A ``(left, right)`` tuple of ``NormalizedLandmarkList`` objects; a side
        that was not detected is ``None``.
    """
    # Each slot is either [None] or [landmark_list, handedness_score].
    left_slot = [None]
    right_slot = [None]

    for categories, detected_hand in zip(results.handedness, results.hand_landmarks):
        top_category = categories[0]

        proto = landmark_pb2.NormalizedLandmarkList()
        proto.landmark.extend(
            [landmark_pb2.NormalizedLandmark(x=pt.x, y=pt.y, z=pt.z) for pt in detected_hand]
        )
        candidate = [proto, top_category.score]

        if top_category.category_name == 'Left':
            if left_slot[0] is None:
                # First 'Left' prediction: take it as is.
                left_slot = candidate
            elif top_category.score > left_slot[1]:
                # Stronger 'Left' claim: the previous holder must be the right hand.
                right_slot, left_slot = left_slot, candidate
            else:
                # Weaker 'Left' claim: this detection must be the right hand.
                right_slot = candidate
        elif top_category.category_name == 'Right':
            # Mirror image of the 'Left' handling above.
            if right_slot[0] is None:
                right_slot = candidate
            elif top_category.score > right_slot[1]:
                left_slot, right_slot = right_slot, candidate
            else:
                left_slot = candidate

    return left_slot[0], right_slot[0]


def draw_results(frame, pose, left_hand, right_hand):
    """Draw the pose and both hands onto ``frame`` in place.

    The pose is drawn first with MediaPipe's default pose style, then the left
    and the right hand with the default hand landmark/connection styles.

    NOTE(review): process_video passes ``None`` for anything that was not
    detected; this relies on ``draw_landmarks`` tolerating a missing landmark
    list — confirm against the installed MediaPipe version.
    """
    drawing = mp.solutions.drawing_utils
    styles = mp.solutions.drawing_styles
    holistic = mp.solutions.holistic

    drawing.draw_landmarks(
        frame,
        pose,
        holistic.POSE_CONNECTIONS,
        landmark_drawing_spec=styles.get_default_pose_landmarks_style(),
    )

    # Same call for both hands, left first.
    for hand in (left_hand, right_hand):
        drawing.draw_landmarks(
            frame,
            hand,
            holistic.HAND_CONNECTIONS,
            landmark_drawing_spec=styles.get_default_hand_landmarks_style(),
            connection_drawing_spec=styles.get_default_hand_connections_style(),
        )
    
    
def to_results_dict(pose, left_hand, right_hand):
    """Turn per-frame landmark lists into nested ``[x, y, z]`` coordinate lists.

    Each argument is a sequence with one entry per frame. An entry is either an
    object exposing ``.landmark`` (items with ``x``, ``y``, ``z``) or ``None``;
    ``None`` frames become all-zero placeholders of 33 points for the pose and
    21 points for each hand.

    Returns:
        A dict with the keys ``'pose'``, ``'left_hand'`` and ``'right_hand'``.
    """
    def _as_coordinates(per_frame, point_count):
        rows = []
        for entry in per_frame:
            if entry is None:
                # Nothing detected in this frame: zero-filled placeholder.
                rows.append([[0] * 3 for _ in range(point_count)])
            else:
                rows.append([[pt.x, pt.y, pt.z] for pt in entry.landmark])
        return rows

    return {
        'pose': _as_coordinates(pose, 33),
        'left_hand': _as_coordinates(left_hand, 21),
        'right_hand': _as_coordinates(right_hand, 21),
    }

## Process Videos

In [None]:
def process_video(video_path):
    """Extract pose and hand landmarks from one video and save the results.

    Two files are written, both named ``<video_name>__fps<fps>``:

    * ``<output_path>/videos/...mp4``  - the input video with landmarks drawn;
    * ``<output_path>/landmarks/...npy`` - a float32 array of shape
      ``(3, N, 33, 3)`` holding pose, left hand and right hand (in that order)
      for the ``N`` frames; hand landmarks are zero-padded from 21 to 33 points.

    Args:
        video_path: Path of the video to process.

    Raises:
        ValueError: If the video cannot be opened, reports a non-positive
            frame rate, or yields no frames.
    """
    # splitext keeps dots inside the name ('a.b.mp4' -> 'a.b'), so the output
    # name can be mapped back to the source file.
    video_name = os.path.splitext(os.path.basename(video_path))[0]

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise ValueError(f'Could not open video: {video_path}')

    fps = int(round(cap.get(cv2.CAP_PROP_FPS)))
    w = int(round(cap.get(cv2.CAP_PROP_FRAME_WIDTH)))
    h = int(round(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))

    if fps <= 0:
        cap.release()
        raise ValueError(f'Invalid frame rate ({fps}) reported for: {video_path}')

    videos_dir = os.path.join(output_path, 'videos')
    landmarks_dir = os.path.join(output_path, 'landmarks')
    os.makedirs(videos_dir, exist_ok=True)
    os.makedirs(landmarks_dir, exist_ok=True)

    out_video_path = os.path.join(videos_dir, f'{video_name}__fps{fps}.mp4')
    out = cv2.VideoWriter(out_video_path, cv2.VideoWriter_fourcc(*'MP4V'), fps, (w, h))

    pose = []
    left_hand = []
    right_hand = []
    frame_idx = 0

    try:
        # The landmarkers hold native resources; the context managers close
        # them even when a frame fails.
        with vision.HandLandmarker.create_from_options(hands_options) as hands_model, \
                vision.PoseLandmarker.create_from_options(pose_options) as pose_model:
            while True:
                res, frame = cap.read()

                if not res:
                    break

                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame.flags.writeable = False

                # VIDEO mode expects timestamps on the video's own timeline.
                # Deriving them from the frame index keeps them strictly
                # increasing and independent of processing speed.
                timestamp_ms = frame_idx * 1000 // fps
                frame_idx += 1

                mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame)
                pose_results = pose_model.detect_for_video(mp_image, timestamp_ms)
                hands_results = hands_model.detect_for_video(mp_image, timestamp_ms)

                pose_landmarks = get_pose_landmarks(pose_results)
                pose.append(pose_landmarks)

                left_hand_landmarks, right_hand_landmarks = get_hands_landmarks(hands_results)
                left_hand.append(left_hand_landmarks)
                right_hand.append(right_hand_landmarks)

                frame.flags.writeable = True
                frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

                draw_results(frame, pose_landmarks, left_hand_landmarks, right_hand_landmarks)

                out.write(frame)
    finally:
        cap.release()
        out.release()

    if frame_idx == 0:
        # Remove the empty output so this video is not later mistaken for an
        # already processed one.
        if os.path.exists(out_video_path):
            os.remove(out_video_path)
        raise ValueError(f'No frames could be read from: {video_path}')

    lm = to_results_dict(pose, left_hand, right_hand)
    lm = {
        key: np.array(value) for key, value in lm.items()
    }

    # We want to save the landmarks for left hand, right hand and pose,
    # each having shape like:   (N, 21, 3),(N, 21, 3),    (N, 33, 3)
    # Because they don't have the same shape, we pad left hand and right hand
    # so we can save them all as a ndarray.
    for hand_key in ('left_hand', 'right_hand'):
        padding = ((0, 0), (0, lm['pose'].shape[1] - lm[hand_key].shape[1]), (0, 0))
        lm[hand_key] = np.pad(lm[hand_key], padding, 'constant', constant_values=0)

    lm = np.array([lm['pose'], lm['left_hand'], lm['right_hand']], np.float32)
    np.save(os.path.join(landmarks_dir, f'{video_name}__fps{fps}.npy'), lm)

In [None]:
def get_full_path(video_name):
    """Map a processed file name back to its location in the scraped dataset.

    The ``__fps<digits>`` marker is stripped from ``video_name`` and the result
    is placed under ``scraped_dataset_path`` in a sub-directory named after its
    upper-cased first character.
    """
    original_name = re.sub(r'__fps\d+', '', video_name)
    letter_dir = original_name[0].upper()
    return os.path.join(scraped_dataset_path, letter_dir, original_name)


# `import concurrent` alone does not load the `futures` submodule; import it
# explicitly instead of relying on another package having done so.
import concurrent.futures

# Every file found (recursively) in the scraped dataset.
scraped_videos = [os.path.join(root, file) for root, _, files in os.walk(scraped_dataset_path) for file in files]

# Videos that already have an annotated copy count as processed. The directory
# is created on first run so that listing it cannot fail.
processed_videos_dir = os.path.join(output_path, 'videos')
os.makedirs(processed_videos_dir, exist_ok=True)

processed_video_names = os.listdir(processed_videos_dir)
processed_videos = [get_full_path(n) for n in processed_video_names]

to_be_processed = os_sorted(set(scraped_videos).difference(set(processed_videos)))


with concurrent.futures.ProcessPoolExecutor(max_workers=6) as executor:
    # Remember which video each future belongs to, so failures can be reported
    # with the offending path.
    futures = {executor.submit(process_video, video_path): video_path for video_path in to_be_processed}

    for future in concurrent.futures.as_completed(futures):
        try:
            future.result()
        except Exception as e:
            # Best effort: one broken video must not stop the batch.
            print(f'Exception while processing {futures[future]}: {e}')

## Upload files to server

In [None]:
# Base URL of the file endpoint; the upload helpers append a sub-path such as
# 'model', 'class' or 'char' to it.
url = 'https://192.168.0.152:45455/File/'
# Bearer token sent with every upload. It is empty here and must be filled in
# before running the cell; do not commit a real token.
token = ''
headers = {
    'Authorization': f'Bearer {token}',
}

# SECURITY NOTE: host-name checking and certificate verification are both
# disabled, so the server's identity is never validated.
# NOTE(review): presumably needed for a self-signed certificate on a LAN
# development server — confirm, and do not reuse against a public host.
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE

async def upload_file(session, directory, file_path):
    """Upload one file as multipart form data to ``url + directory``.

    Args:
        session: An open ``aiohttp.ClientSession``.
        directory: Endpoint sub-path appended to the global ``url``.
        file_path: Path of the file to send; its base name is used as the
            ``filename`` of the form field named ``file``.

    Returns:
        ``None`` on success, or ``file_path`` when the upload failed (the file
        could not be opened, the request raised, or the response status was
        not OK), so callers can collect the failures.
    """
    try:
        # Opening inside the try makes a missing/unreadable file a reported
        # failure instead of aborting every other upload in the batch.
        with open(file_path, 'rb') as f:
            mpwriter = aiohttp.MultipartWriter('form-data')
            part = mpwriter.append(f)
            file_name = os.path.basename(file_path)
            part.headers['Content-Disposition'] = f'form-data; name="file"; filename="{file_name}"'

            async with session.post(url + directory, data=mpwriter, headers=headers, ssl=ssl_context) as response:
                return None if response.ok else file_path

    # `Exception` rather than a bare `except`, so task cancellation and
    # KeyboardInterrupt still propagate.
    except Exception as e:
        print(f'Upload failed for {file_path}: {e}')
        return file_path


async def upload_files(directory, files):
    """Upload ``files`` concurrently and return the paths that failed.

    A single client session (using the global ``ssl_context``) is shared by all
    uploads, which are awaited together.

    Returns:
        A list with the path of every file whose upload did not succeed.
    """
    connector = aiohttp.TCPConnector(ssl=ssl_context)

    async with aiohttp.ClientSession(connector=connector) as session:
        outcomes = await asyncio.gather(
            *[upload_file(session, directory, path) for path in files]
        )

    # upload_file yields None on success and the file path on failure.
    return [outcome for outcome in outcomes if outcome is not None]


async def upload_landmarks():
    """Upload a fixed pair of landmark files and print the ones that failed."""
    landmarks_dir = os.path.join('assets', 'landmarks')
    files = [os.path.join(landmarks_dir, f) for f in ['casă__fps30.npy', 'acasă__fps30.npy']]
    # upload_files requires the endpoint sub-path as its first argument; the
    # call used to omit it and raised TypeError.
    # NOTE(review): 'landmark' follows the singular naming of the sibling
    # endpoints ('model', 'class', 'char') — confirm against the server API.
    fails = await upload_files('landmark', files)
    print(fails)


async def upload_models():
    """Upload the hand, pose and gesture model files; print the failures."""
    mp_models_dir = os.path.join('assets', 'mp_models')
    model_files = [
        os.path.join(mp_models_dir, 'hand_landmarker.task'),
        os.path.join(mp_models_dir, 'pose_landmarker_lite.task'),
        os.path.join('..', 'gestures.tflite'),
    ]

    print(await upload_files('model', model_files))


async def upload_classes():
    """Upload the gesture class list and print the files that failed."""
    classes_file = os.path.join('..', 'gestures.json')
    failed = await upload_files('class', [classes_file])
    print(failed)


async def upload_chars():
    """Upload every file found in ``assets/chars`` and print the failures."""
    chars_dir = os.path.join('assets', 'chars')

    char_files = []
    for entry in os.listdir(chars_dir):
        char_files.append(os.path.join(chars_dir, entry))

    failed = await upload_files('char', char_files)
    print(failed)


# Top-level `await` only works inside a notebook/IPython cell (an event loop is
# already running there). Only the models and the class list are uploaded;
# upload_landmarks() and upload_chars() are defined above but not called here.
await upload_models()
await upload_classes()

## Kalman Filter

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os

from library.helpers.landmarks_smoother import _filter_useless, _get_non_zero_indices, _process


# Path of the landmark file to visualise (a file, despite the variable name).
landmark_dir = os.path.join('assets', 'landmarks', 'volan__fps25.npy')
# Frame rate of that recording, as encoded in its file name.
landmark_fps = 25

frames = np.load(landmark_dir)
frames = _filter_useless(frames)

# Index 2 of the saved array is the right hand (order: pose, left, right).
right_hand = frames[2]
start, stop = _get_non_zero_indices(right_hand)

# Filter a copy so the raw and the filtered trajectories can be compared.
right_hand_kalman = np.copy(right_hand)
_process(right_hand_kalman, landmark_fps, start, stop)

right_hand = right_hand[start:stop]
right_hand_kalman = right_hand_kalman[start:stop]

# Landmark 8 is the index fingertip in MediaPipe's hand model.
pointer = right_hand[:, 8, :]
pointer_kalman = right_hand_kalman[:, 8, :]

# Frame i is shown at i / fps seconds. (linspace(0, N / fps, N) includes the
# endpoint and therefore stretched the axis slightly.)
# NOTE: this rebinds `time`, shadowing the `time` module imported at the top of
# the notebook; code calling into that module after this cell will fail. The
# name is kept because plot_moving_average reads it as a global.
time = np.arange(len(pointer)) / landmark_fps

x, y, z = zip(*pointer)
xk, yk, zk = zip(*pointer_kalman)


def _plot_pointer_trace(xs, ys, color, marker):
    """Show one 3D plot of the fingertip's X/Y position over time."""
    fig = plt.figure(figsize=(15, 5))

    ax = fig.add_subplot(121, projection='3d')
    ax.set_ylim([0, 0.6])
    ax.set_zlim([0, 0.9])
    ax.plot(time, xs, ys, c=color, marker=marker)
    ax.set_xlabel('Time (s)')
    ax.set_ylabel('X')
    ax.set_zlabel('Y')
    ax.zaxis.labelpad=-0.8
    plt.tight_layout()
    plt.show()

    return fig, ax


# Raw trajectory (red circles), then the Kalman-filtered one (blue squares).
fig, ax = _plot_pointer_trace(x, y, 'r', 'o')
fig, ax = _plot_pointer_trace(xk, yk, 'b', 's')

In [None]:
from library.helpers.filters.moving_average import moving_average_smooth

def plot_moving_average(fig, window_size, idx):
    """Add one 3D subplot of the fingertip path after moving-average smoothing.

    Works on a copy of the global ``frames``: the right hand (index 2) is first
    passed through ``_process`` and then, when ``window_size`` is greater
    than 1, every stream is smoothed with ``moving_average_smooth``.

    Args:
        fig: Figure that receives the subplot (laid out on a 4 x 2 grid).
        window_size: Moving-average window; 1 or less skips the smoothing and
            titles the plot 'Original'.
        idx: 1-based position of the subplot on the grid.
    """
    working_frames = np.copy(frames)
    # frames_copy[2] is a view, so _process updates working_frames in place.
    _process(working_frames[2], 25, start, stop)

    apply_smoothing = window_size > 1
    if apply_smoothing:
        smoothed = [moving_average_smooth(stream, window_size) for stream in working_frames]
    else:
        smoothed = working_frames

    # Right hand -> frames of interest -> landmark 8 along the second axis.
    fingertip = smoothed[2][start:stop][:, 8, :]
    xma, yma, zma = zip(*fingertip)

    ax = fig.add_subplot(4, 2, idx, projection='3d')
    ax.set_ylim([0, 0.6])
    ax.set_zlim([0, 0.9])
    ax.plot(time, xma, yma, c='b', marker='s')
    ax.set_xlabel('Time (s)')
    ax.set_ylabel('X')
    ax.set_zlabel('Y')
    ax.set_title(f'window_size={window_size}' if apply_smoothing else f'Original')
    ax.zaxis.labelpad=-0.8

In [None]:
fig = plt.figure(figsize=(11, 20))

# Odd window sizes 1, 3, ..., 15 fill the eight slots of the 4 x 2 grid in order.
for subplot_idx, window in enumerate(range(1, 16, 2), start=1):
    plot_moving_average(fig, window, subplot_idx)

plt.tight_layout()
plt.show()

## Web Scraping

#### Download all dlmg.ro videos

In [None]:
# Dictionary landing page (lists the letter categories) and the AJAX endpoint
# that returns the words of one category when its id is appended.
url_dlmg = 'https://dlmg.ro/dictionar/'
url_dlmg_words = 'https://dlmg.ro/ajax/?cat='

# Download target. makedirs(exist_ok=True) creates both levels in one call and
# avoids the check-then-create pattern.
# NOTE(review): the folder is spelled 'dmlg' while the site is 'dlmg' — likely
# a typo, kept so that existing downloads are still found.
dir_dlmg = os.path.join('.', 'scraped', 'dmlg')
os.makedirs(dir_dlmg, exist_ok=True)


def get_letters_id_map():
    """Scrape the dictionary page and map each letter label to its category id.

    The id is the text between the first pair of parentheses in the anchor's
    ``onclick`` attribute. Anchors without such a pair are skipped.

    Returns:
        A dict mapping anchor text to category id (as a string).

    Raises:
        RuntimeError: If the page cannot be retrieved.
        ValueError: If the category wrapper element is missing from the page.
    """
    response = requests.get(url_dlmg, timeout=30)
    if response.status_code != 200:
        # Raising a plain string is itself a TypeError; raise a real exception.
        raise RuntimeError('Failed to retrieve the webpage.')

    soup = BeautifulSoup(response.text, 'html.parser')
    div_content = soup.find('div', class_='dictionar-cat-wrapper')
    if not div_content:
        raise ValueError('No "dictionar-cat-wrapper" class found in the HTML.')

    letters_id_map = {}

    for anchor in div_content.find_all('a'):
        onclick_text = anchor.get('onclick', '')
        start = onclick_text.find('(') + 1
        end = onclick_text.find(')', start)

        # find() returns -1 when a parenthesis is missing; slicing with that
        # would silently produce a wrong id.
        if start == 0 or end == -1:
            continue

        category_id = onclick_text[start:end]
        if not category_id:
            continue

        letters_id_map[anchor.get_text()] = category_id

    return letters_id_map


def get_words_url_letter_map(letters_id_map):
    """Collect every word of every letter category together with its page URL.

    For each letter a download folder is created under ``dir_dlmg`` and the
    category's word list is fetched. Letters whose request fails are reported
    and skipped. When a word appears more than once, the first occurrence wins.

    Args:
        letters_id_map: Mapping of letter label to category id, as returned by
            ``get_letters_id_map``.

    Returns:
        A dict mapping word to ``{'url': <word page URL>, 'letter': <letter>}``.
    """
    words_url_letter_map = {}

    for letter, category_id in letters_id_map.items():
        os.makedirs(os.path.join(dir_dlmg, letter), exist_ok=True)

        response = requests.get(url_dlmg_words + category_id, timeout=30)
        if response.status_code != 200:
            print('Failed to retrieve the webpage for letter: ', letter)
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        for anchor in soup.find_all('a'):
            # Anchors without a target cannot be downloaded; indexing
            # anchor['href'] would raise KeyError for them.
            href = anchor.get('href')
            if not href:
                continue

            a_text = anchor.get_text()
            if a_text not in words_url_letter_map:
                words_url_letter_map[a_text] = {'url': href, 'letter': letter}

    return words_url_letter_map


async def download_video(session, word, url, letter):
    """Download the MP4 shown on a dlmg.ro word page.

    The word page at ``url`` is fetched, its ``<source type="video/mp4">`` tag
    is located and the referenced video is streamed in 1 MiB chunks to
    ``<dir_dlmg>/<letter>/<word>.mp4``. Every failure is printed and swallowed,
    so one bad word never interrupts the batch.
    """
    destination_stem = os.path.join(dir_dlmg, letter, word)

    try:
        async with session.get(url) as page_response:
            if page_response.status != 200:
                print('Failed to retrieve the webpage for video: ', word)
                return

            page_soup = BeautifulSoup(await page_response.text(), 'html.parser')
            source_tag = page_soup.find('source', {'type': 'video/mp4'})

            if not (source_tag and 'src' in source_tag.attrs):
                print('Failed to retrieve the video URL for:', word)
                return

            async with session.get(source_tag['src']) as video_response:
                if video_response.status != 200:
                    print('Failed to download video:', word)
                    return

                with open(destination_stem + '.mp4', 'wb') as file:
                    chunks = video_response.content.iter_chunked(1024 * 1024)
                    async for chunk in chunks:
                        file.write(chunk)

    except Exception as e:
        print(f'An error occurred for {word}: {str(e)}')


async def download_videos(words_url_letter_map):
    """Download the videos of all words concurrently.

    One session, whose connector is limited to 16 connections, is shared by
    all downloads.

    Args:
        words_url_letter_map: Mapping of word to a dict with the keys ``'url'``
            and ``'letter'``.
    """
    async with aiohttp.ClientSession(connector=TCPConnector(limit=16)) as session:
        downloads = [
            download_video(session, word, entry['url'], entry['letter'])
            for word, entry in words_url_letter_map.items()
        ]

        await asyncio.gather(*downloads)


# Scrape in three steps: letter categories -> words per letter -> videos.
# The top-level `await` requires a notebook/IPython cell.
letters_id_map = get_letters_id_map()
words_url_letter_map = get_words_url_letter_map(letters_id_map)
await download_videos(words_url_letter_map)

#### Download all pesemne.ro videos

In [None]:
# Site root and the folder that serves the clip files.
url_pesemne = 'https://pesemne.ro/'
url_pesemne_clips = url_pesemne + 'wp-content/uploads/clips/'

# Download target; created (with its parent) if missing.
dir_pesemne = os.path.join('.', 'scraped', 'pesemne')
os.makedirs(dir_pesemne, exist_ok=True)


response = requests.get(url_pesemne, timeout=30)
if response.status_code != 200:
    # Raising a plain string is itself a TypeError; raise a real exception.
    raise RuntimeError('Failed to retrieve the webpage.')

soup = BeautifulSoup(response.text, 'html.parser')
ul_element = soup.find('ul', id='manual-selection-list')
if not ul_element:
    raise ValueError('Failed to parse the webpage.')
li_elements = ul_element.find_all('li')

# Parallel lists: data_words[i] is the word shown for clip data_clips[i].
data_words = []
data_clips = []

for li in li_elements:
    data_word = li.get('data-word', None)
    data_clip = li.get('data-clip', None)

    # Keep only entries that carry both attributes; incomplete ones are
    # skipped silently, as before, but without raising and swallowing
    # exceptions to do so.
    if data_word and data_clip:
        data_words.append(data_word)
        data_clips.append(data_clip)


async def download_video(session, data_word, data_clip):
    """Download one pesemne.ro clip into a per-letter folder.

    A leading ``'a '`` is removed from ``data_word``; nothing is downloaded
    when the remaining name is empty. The clip is streamed in 1 MiB chunks to
    ``<dir_pesemne>/<FIRST LETTER>/<name>.mp4``. Failures are printed and
    swallowed.
    """
    word_clean = data_word.removeprefix('a ')
    if not word_clean:
        return

    # Clips are grouped by the upper-cased first character of the name.
    letter_dir = os.path.join(dir_pesemne, word_clean[0].upper())
    if not os.path.exists(letter_dir):
        os.mkdir(letter_dir)

    destination_stem = os.path.join(letter_dir, word_clean)

    try:
        async with session.get(url_pesemne_clips + data_clip) as video_response:
            if video_response.status != 200:
                print('Failed to download video:', data_word)
                return

            with open(destination_stem + '.mp4', 'wb') as file:
                chunks = video_response.content.iter_chunked(1024 * 1024)
                async for chunk in chunks:
                    file.write(chunk)

    except Exception as e:
        print(f'An error occurred for {data_word}: {str(e)}')


async def download_videos():
    """Download every clip listed in ``data_words`` / ``data_clips`` concurrently.

    One session, whose connector is limited to 16 connections, is shared by
    all downloads.
    """
    async with aiohttp.ClientSession(connector=TCPConnector(limit=16)) as session:
        downloads = [
            download_video(session, word, clip)
            for word, clip in zip(data_words, data_clips)
        ]

        await asyncio.gather(*downloads)


# Runs the pesemne download defined in this cell (it replaces the dlmg
# functions of the same names). Top-level `await` requires a notebook cell.
await download_videos()

## NLP

#### Sentence to lexemes

In [None]:
import Levenshtein as lev
import os
import spacy_stanza
import sys

from collections import Counter
from nltk.stem.snowball import SnowballStemmer


# Folder whose file names (one landmark file per sign) define the vocabulary.
directory = landmarks_path


# Romanian pipeline (tokens, lemmas, POS tags) and Romanian stemmer.
nlp = spacy_stanza.load_pipeline("ro")
stemmer = SnowballStemmer("romanian")


# A vocabulary word is the part of the file name before '__fps'. Files without
# that marker are ignored: str.find() would return -1 for them and the slice
# would silently drop the last character of the name.
vocabulary = [
    file_name.partition('__fps')[0]
    for file_name in os.listdir(directory)
    if '__fps' in file_name
]


def cosine_dist(a, b):
    """Return the cosine distance between the character counts of two strings.

    Each string is treated as a bag of characters. The result is
    ``1 - cosine similarity`` of the two count vectors: about 0 for strings
    with identical character counts and 1 for strings sharing no character.

    When either string is empty the similarity is undefined; the maximum
    distance 1.0 is returned instead of dividing by zero.
    """
    a_vals = Counter(a)
    b_vals = Counter(b)

    len_a = sum(count * count for count in a_vals.values()) ** 0.5
    len_b = sum(count * count for count in b_vals.values()) ** 0.5

    if len_a == 0 or len_b == 0:
        return 1.0

    # Characters missing from b contribute 0 (Counter returns 0 for them).
    dot = sum(count * b_vals[char] for char, count in a_vals.items())
    cosine = dot / (len_a * len_b)

    return 1 - cosine


def find_closest_words(doc, is_end):
    """Translate the trailing token(s) of ``doc`` into vocabulary lexemes.

    Each token is mapped to a list of strings: a vocabulary entry, a fixed
    replacement, the individual characters of the word (spelling it out), or
    an empty list when the token is dropped.

    Args:
        doc: Parsed token sequence (tokens expose ``text``, ``lemma_`` and
            ``pos_``). It must hold at least three tokens, because ``doc[-3]``
            is always read.
        is_end: When true, the last three tokens are translated; otherwise
            only the third token from the end is.

    Returns:
        A tuple of three lists when ``is_end`` is true, otherwise one list.
    """
    doc_len = len(doc)

    def search_in_vocab(word):
        # Returns the vocabulary entry closest to `word`, or None when nothing
        # is close enough.
        # NOTE(review): min() below raises ValueError if `vocabulary` is empty.
        distance_map = dict()

        for vocab_word in vocabulary:
            # Only entries sharing the first half of `word` (at least one
            # character) are compared; all others get the maximum distance.
            max_len = max(1, len(word) // 2)
            start_a = word[:max_len]
            start_b = vocab_word[:max_len]

            if start_a == start_b:
                dist = lev.distance(vocab_word, word)
            else:
                dist = sys.maxsize
                
            distance_map[vocab_word] = dist
        
        min_dist = min(distance_map.values())
        
        # Ties on edit distance are ordered by character-count cosine distance.
        candidates = [key for key, value in distance_map.items() if value == min_dist]
        candidates = sorted(candidates, key=lambda x: cosine_dist(word, x))

        # Accept a candidate only if fewer than half of its characters differ.
        for candidate in candidates:
            if min_dist < int(len(candidate) / 2):
                return candidate
        
        return None
    

    def process_token(token, idx):
        # Maps one token (at position `idx` in `doc`) to its list of lexemes.
        # The branches form a single if/elif chain, so their order matters:
        # the first matching rule wins.
        token_text = token.text.lower()
        token_lemma = token.lemma_.lower()
        token_stem = stemmer.stem(token.text).lower()

        # Special cases for single letters and fragments, decided by the
        # neighbouring token's part of speech.
        if token_text == 'm':
            if idx < doc_len - 1 and doc[idx + 1].pos_ == 'AUX':
                return ['eu']
            else:
                return ['m']

        elif token_text == 'mânc' or token_text == 'mănânc':
            return ['mânca']
            
        elif token_text == 'n':
            if idx < doc_len - 1 and (doc[idx + 1].pos_ == 'ADV' or doc[idx + 1].pos_ == 'NOUN'):
                return ['în']
            elif idx >= 1 and (doc[idx - 1].pos_ == 'PRON' or doc[idx - 1].pos_ == 'VERB'):
                return ['în']
            else:
                return ['n']
            
        elif token_text == 'ă':
            return ['ă']

        # Connection words. `'CONJ' in token.pos_` is a substring test, so it
        # matches any tag containing 'CONJ'.
        elif token_text == 'dar':
            if 'CONJ' in token.pos_:
                return ['dar (conjuncție)']
            else:
                return ['dar (cadou)']
            
        elif token_text == 'ori':
            if 'CONJ' in token.pos_:
                return ['sau']
            else:
                return ['ori']
            
        elif token_text == 'fie':
            if 'CONJ' in token.pos_:
                return ['sau']
            else:
                return ['fi']
            
        elif token_text == 'că':
            return []
            
        elif token_text == 'ci':
            return []
            
        # Two-word expressions ('de ce', 'la revedere'): the first word is
        # dropped and the second word returns the whole expression.
        elif token_text == 'de':
            if idx < doc_len - 1 and doc[idx + 1].text == 'ce':
                return []
            else:
                return ['de']
            
        elif token_text == 'ce':
            if idx >= 1 and doc[idx - 1].text == 'de':
                return ['de ce']
            else:
                return ['ce']
            
        elif token_text == 'la':
            if idx < doc_len - 1 and doc[idx + 1].text == 'revedere':
                return []
            else:
                return ['la']
            
        elif token_text == 'revedere':
            if idx >= 1 and doc[idx - 1].text == 'la':
                return ['la revedere']
            else:
                return ['vedea']
            
        elif token_text == 'deși':
            return ['chiar', 'dacă']
        
        elif token_text == 'da':
            if token.pos_ == 'ADV':
                return ['da (adverb)']
            else:
                return ['da (verb)']
            
        # An auxiliary or particle directly followed by a verb, auxiliary or
        # particle is dropped.
        elif (token.pos_ == 'AUX' or token.pos_ == 'PART') and idx < doc_len - 1 and (doc[idx + 1].pos_ == 'VERB' or doc[idx + 1].pos_ == 'AUX' or doc[idx + 1].pos_ == 'PART'):
            return []
            
        # A determiner directly followed by a numeral is dropped.
        elif token.pos_ == 'DET' and idx < doc_len - 1 and doc[idx + 1].pos_ == 'NUM':
            return []
        
        elif token_text == 'mai' and idx < doc_len - 1 and (doc[idx + 1].text.lower() == 'un' or doc[idx + 1].text.lower() == 'o'):
            return ['încă']

        # Adjectives, nouns, verbs: try several word forms against the
        # vocabulary and spell the lemma out letter by letter as a last resort.
        elif token_lemma == 'da':
            return ['da (verb)']
        
        elif token.pos_ == 'VERB':
            word = search_in_vocab(token_lemma)
            if word:
                return [word]
            word = search_in_vocab(token_stem)
            if word:
                return [word]
            else:
                return [x for x in token_lemma]
        
        elif token.pos_ == 'ADJ' or token.pos_ == 'NOUN':
            word = search_in_vocab(token_lemma)
            if word:
                return [word]
            word = search_in_vocab(token_text)
            if word:
                return [word]
            word = search_in_vocab(token_stem)
            if word:
                return [word]
            else:
                return [x for x in token_lemma]
            
        # Proper names are always spelled out letter by letter.
        elif token.pos_ == 'PROPN':
            return [x for x in token_text]
            
        # Numerals (also tokens tagged 'X' that contain a digit).
        elif token.pos_ == 'NUM' or (token.pos_ == 'X' and any(char.isdigit() for char in token_text)):
            # Dots are removed; if a comma remains, the number is returned as
            # the characters before it, 'virgulă', and the characters after it.
            # Without a comma the dot-free text continues to the lookup below.
            if '.' in token_text:
                token_text = token_text.replace('.', '')
                if ',' in token_text:
                    split = token_text.split(',')
                    res = []
                    res.extend([x for x in split[0]])
                    res.append('virgulă')
                    res.extend([x for x in split[1]])
                    return res

            # from 1M up, there are digits and letter
            # (e.g. un milion, 100 (de) milioane)
            word = search_in_vocab(token_text)
            if word:
                return [word]
            word = search_in_vocab(token_lemma)
            if word:
                return [word]
            else:
                return [x for x in token_text]
            
        # Pronouns: inflected forms collapse onto the base personal pronoun.
        # NOTE(review): 'îi' and 'i' are matched by the 'el' branch first, so
        # they can never reach the 'ei' branch; likewise 'le' is matched by the
        # 'ei' branch and can never reach the 'ele' branch.
        elif token.pos_ == "PRON":
            if token_text == 'eu' or token_text == 'mine' or token_text == 'mă' or token_text == 'mie' or token_text == 'îmi' or token_text == 'mi':
                return ['eu']
            elif token_text == 'tu' or token_text == 'tine' or token_text == 'te' or token_text == 'ție' or token_text == 'îți' or token_text == 'ți':
                return ['tu']
            elif token_text == 'el' or token_text == 'îl' or token_text == 'l' or token_text == 'lui' or token_text == 'îi' or token_text == 'i':
                return ['el']
            elif token_text == 'ea' or token_text == 'o':
                return ['ea']
            elif token_text == 'noi' or token_text == 'ne' or token_text == 'nouă' or token_text == 'ni':
                return ['noi']
            elif token_text == 'voi' or token_text == 'vă' or token_text == 'vouă' or token_text == 'vi':
                return ['voi']
            elif token_text == 'ei' or token_text == 'îi' or token_text == 'i' or token_text == 'lor' or token_text == 'le' or token_text == 'li':
                return ['ei']
            elif token_text == 'ele' or token_text == 'le':
                return ['ele']
            elif token_text == 'unul' or token_text == 'una':
                return ['un']
            else:
                word = search_in_vocab(token_lemma)
                if word:
                    return [word]
                else:
                    return []
            
        # Determiners: articles and possessives collapse onto a base form.
        elif token.pos_ == "DET":
            if token_text == 'un' or token_text == 'o':
                return ['un']
            elif token_text == 'meu' or token_text == 'mea' or token_text == 'mei' or token_text == 'mele':
                return ['meu']
            elif token_text == 'tău' or token_text == 'ta':
                return ['tău']
            elif token_text == 'lui':
                return ['el']
            elif token_text == 'ei':
                return ['ea']
            elif token_text == 'său' or token_text == 'sa':
                return ['său']
            elif token_text == 'nostru':
                return ['nostru']
            elif token_text == 'vostru':
                return ['vostru']
            elif token_text == 'lor':
                return ['ei']
            else:
                word = search_in_vocab(token_lemma)
                if word:
                    return [word]
                else:
                    return []

        # Everything else: keep the token only if its lemma is in the vocabulary.
        else:
            word = search_in_vocab(token_lemma)
            if word:
                return [word]
            else:
                return []

    # Both branches read doc[-3]; the caller must supply at least three tokens.
    if is_end:
        res_0 = process_token(doc[-3], doc_len - 3)
        res_1 = process_token(doc[-2], doc_len - 2)
        res_2 = process_token(doc[-1], doc_len - 1)
        return res_0, res_1, res_2
    else:
        res_0 = process_token(doc[-3], doc_len - 3)
        return res_0

In [None]:
def sentence_to_lexemes(sentence):
    """Print the lexemes produced for ``sentence`` as it is read token by token.

    The sentence is padded with ``" ."`` until it parses into at least three
    tokens, then printed. ``find_closest_words`` is then called on every prefix
    of three or more tokens and each result is printed; the final prefix (the
    whole sentence) is flagged as the end.
    """
    doc = nlp(sentence)

    # find_closest_words always looks three tokens back, so pad short input.
    while len(doc) < 3:
        sentence += " ."
        doc = nlp(sentence)

    token_count = len(doc)
    print(sentence)

    for prefix_len in range(3, token_count + 1):
        reached_end = prefix_len == token_count
        print(find_closest_words(doc[:prefix_len], reached_end))

In [None]:
# Manual regression corpus: each sentence is run through sentence_to_lexemes
# and followed by a blank line, in this exact order.
_demo_sentences = [
    "Deși era obosit, Ioan a continuat să lucreze până târziu în noapte pentru a finaliza raportul detaliat cerut de superiorii săi.",
    "Azi dimineață am alergat în parc și m-am bucurat de aerul proaspăt.",
    "Calul paște",
    "a b c d e f g h i j k l m n o p q r s t u v w x y z ă î ș ț â",
    "Mă doare capul",
    "Merg acasă",
    "Cam atât a fost",
    "Această propoziție este în limba română.",
    "A măsura este foarte bine.",
    "Măsurătoarea cuiva fără măsură este nulă.",
    "În alergarea sa, alergătorul aleargă foarte repede.",
    "În alergarea ei, alergătoarea aleargă foarte repede.",
    "În alergarea lui, alergătorul aleargă foarte repede.",
    "a înroși, înroșire, roșiatic, înroșit, roșeață, Marea Roșie, piei-roșii, pătlăgică-roșie, coadă-roșie, roșul",
    "mă numesc Radu",
    "În 1992 o casă costa 10 milioane de lei vechi.",
    "a mers acolo",
    "Ea este frumoasă",
    "Pe noi ne cheamă tatăl ei acasă.",
    "Cine e la ușă?",
    "Conduc din Arad in Timișoara.",
    "Una din noi va pleca.",
    "Unul din nou va pleca.",
    "Pentru că sunt frumos, mă uit in oglindă.",
    "Tu ești ca mine.",
    "Ești ori prost ori te prefaci.",
    "5 ori 5 egal 25",
    "Nu ăsta, ci celălalt.",
    "Ce ai zis?",
    "În timp ce vizitam clădirea, am căzut.",
    "Da, m-am dus acolo!",
    "Cândva am făcut și asta!",
    "Câți băieți ați fost acolo?",
    "A o da în bară uneori este normal",
    "Ei vor merge mâine.",
    "Vor merge mâine.",
    "Va merge mâine.",
    "123.456",
    "Ea se spală singură.",
    "Mă spăl singur.",
    "Mă spăl pe mine.",
    "Mă spăl",
    "O fata e aici.",
    "Acest om e aici.",
    "Acesta e aici.",
    "Ăsta e aici.",
    "Calul meu",
    "Am mers acolo",
    "M-am dus acolo",
    "Eu m-am dus acolo",
    "Cui i-ai luat cadoul?",
    "El este german.",
    "Dar de ce ai facut asta?",
    "Dar de ce și de unde ai facut asta?",
    "La anul și la revedere",
    'Mai o data',
    'Da o n colo',
    'Cine n spanac face asta',
    'Uită te n vas',
    'M am supărat',
    'M ai lovit',
]

for _demo_sentence in _demo_sentences:
    sentence_to_lexemes(_demo_sentence)
    print()