<a href="https://colab.research.google.com/github/smargetic/Natural_Language_Processing/blob/main/Machine_Translation/Data_Intake_and_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#all translations were obtained from tatoeba.org on July 26th, 2024

In [2]:
#data storage
import pandas as pd
import numpy as np

#show all rows and columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#visualize
from IPython.display import display

#pytorch
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset

#tensorflow
import tensorflow as tf

#word/sentence processing
import re

In [3]:
#import files
def import_file(fileName, sep=None):
  """Read a header-less delimited text file into a DataFrame.

  Every line of the file, including the first one, is kept as a data row.
  Columns are labelled 0..n-1 and rows 0..m-1.

  Args:
    fileName: Path (or any other source accepted by ``pd.read_csv``).
    sep: Field delimiter forwarded to ``pd.read_csv``. ``None`` asks pandas
      to sniff the delimiter.

  Returns:
    pd.DataFrame with integer column labels and a 0-based row index.
    Malformed lines are skipped with a warning (``on_bad_lines='warn'``).
  """
  #header=None stops pandas from consuming the first sentence pair as column
  #names; recovering it afterwards (transpose/reset_index) would mangle
  #duplicate or empty fields and turn every column into object dtype
  #delimiter sniffing (sep=None) is only available in the python engine, so
  #ask for it explicitly instead of relying on the warning + fallback
  engine = 'python' if sep is None else None
  df = pd.read_csv(fileName, sep=sep, header=None, engine=engine, on_bad_lines='warn')
  return df

#display file with name
def get_file_and_disp(fileName, sep=None, stringName=""):
  """Load a file, print a caption, preview its first rows and return it.

  Args:
    fileName: File handed to ``import_file``.
    sep: Delimiter handed to ``import_file``.
    stringName: Caption printed (followed by a colon) above the preview.

  Returns:
    The DataFrame produced by ``import_file``.
  """
  caption = "\n" + stringName + ":"
  print(caption)

  loaded = import_file(fileName, sep=sep)
  preview = loaded.head()
  display(preview)
  return loaded

#load the tab-separated English-Italian sentence pairs and preview them
df_eng_it = get_file_and_disp(
    fileName="Sentence pairs in English-Italian - 2024-07-26.tsv",
    sep='\t',
    stringName="English to Italian",
)



English to Italian:


Skipping line 484577: expected 4 fields, saw 8



Unnamed: 0,0,1,2,3
0,1276,Let's try something.,565618.0,Proviamo qualcosa!
1,1277,I have to go to sleep.,4369.0,Devo andare a dormire.
2,1277,I have to go to sleep.,2608468.0,Io devo andare a dormire.
3,1280,Today is June 18th and it is Muiriel's birthday!,383739.0,Oggi è il 18 giugno ed è il compleanno di Muir...
4,1280,Today is June 18th and it is Muiriel's birthday!,565612.0,Oggi è il 18 di giugno ed è il compleanno di M...


In [4]:
#double check that there are no null versions for these sentences
def count_remove_null(df, index=1, name=""):
  """Report and drop the rows whose value in one column is missing or empty.

  A value counts as null when it is NaN/None or the empty string "".

  Args:
    df: DataFrame to check. It is never modified.
    index: Label of the column to inspect.
    name: Human-readable column name used in the printed report.

  Returns:
    ``df`` itself when nothing has to be removed; otherwise a new DataFrame
    without the offending rows and with a fresh 0-based index.
  """
  column = df[index]
  #NaN never equals "", so the two conditions are disjoint and the mask sum
  #matches counting them separately
  is_null = column.isnull() | column.eq("")
  nulls = int(is_null.sum())
  print("Nulls in " + name + ": {}".format(nulls))

  #filter into a new frame instead of dropping in place, so the caller's
  #frame stays intact and re-running the cell gives the same result
  if nulls > 0:
    df = df.loc[~is_null].reset_index(drop=True)

  return df

#column 1 holds the English sentence, column 3 the Italian one
for sentence_col, language in ((1, "English"), (3, "Italian")):
  df_eng_it = count_remove_null(df_eng_it, index=sentence_col, name=language)

Nulls in English: 0
Nulls in Italian: 1


In [5]:
#give the four positional columns descriptive names
pair_columns = ['eng_id', 'eng_sentence', 'it_id', 'it_sentence']
df_eng_it.columns = pair_columns
df_eng_it.head()

Unnamed: 0,eng_id,eng_sentence,it_id,it_sentence
0,1276,Let's try something.,565618.0,Proviamo qualcosa!
1,1277,I have to go to sleep.,4369.0,Devo andare a dormire.
2,1277,I have to go to sleep.,2608468.0,Io devo andare a dormire.
3,1280,Today is June 18th and it is Muiriel's birthday!,383739.0,Oggi è il 18 giugno ed è il compleanno di Muir...
4,1280,Today is June 18th and it is Muiriel's birthday!,565612.0,Oggi è il 18 di giugno ed è il compleanno di M...


In [6]:
#separates 1-1 translations
def one_to_one_translations(df):
  """Keep the rows whose English sentence occurs exactly once in ``df``.

  Prints the size of the input and of the result, and displays the first
  rows of the input frame.

  Args:
    df: DataFrame with an 'eng_sentence' column.

  Returns:
    An independent DataFrame holding the non-repeated rows; the rows keep
    their original index labels.
  """
  #keep=False flags every member of a duplicate group, not just the repeats
  repeated = df.duplicated(subset='eng_sentence', keep=False)
  df_sing = df.loc[~repeated, :].copy()

  display(df.head())
  orig_rows = df.shape[0]
  sing_rows = df_sing.shape[0]
  print('Orig size: {}'.format(orig_rows))
  print('Singular size: {}'.format(sing_rows))

  return df_sing

#English sentences that have exactly one row (one Italian translation)
df_eng_it_sing = one_to_one_translations(df=df_eng_it)

Unnamed: 0,eng_id,eng_sentence,it_id,it_sentence
0,1276,Let's try something.,565618.0,Proviamo qualcosa!
1,1277,I have to go to sleep.,4369.0,Devo andare a dormire.
2,1277,I have to go to sleep.,2608468.0,Io devo andare a dormire.
3,1280,Today is June 18th and it is Muiriel's birthday!,383739.0,Oggi è il 18 giugno ed è il compleanno di Muir...
4,1280,Today is June 18th and it is Muiriel's birthday!,565612.0,Oggi è il 18 di giugno ed è il compleanno di M...


Orig size: 627909
Singular size: 135813


In [120]:
##tokenize

def tokenize(sentence):
    """Split a sentence into word tokens and single punctuation tokens.

    Runs of word characters become one token each; every other
    non-whitespace character becomes its own token, so an apostrophe
    splits a contraction ("Let's" -> "Let", "'", "s").

    Args:
        sentence: Text to split.

    Returns:
        List of token strings in order of appearance.
    """
    token_pattern = re.compile(r'\b\w+\b|[^\w\s]')
    return token_pattern.findall(sentence)

#apply to df
#df_eng_it_sing is an independent copy made before this cell, and it is the
#frame the vocabulary/encoding steps read the token columns from, so it has
#to be tokenized as well (otherwise a fresh Restart & Run All fails with
#KeyError: 'eng_tokens'); df_eng_it keeps getting its token columns as before
for frame in (df_eng_it, df_eng_it_sing):
  frame['eng_tokens'] = frame['eng_sentence'].apply(tokenize)
  frame['it_tokens'] = frame['it_sentence'].apply(tokenize)

#get vocabulary as dictionary
def get_vocab(df, col, special_tokens=None):
  """Build a token -> integer id mapping from a column of token lists.

  Args:
    df: DataFrame holding the token lists.
    col: Name of the column whose cells are iterables of tokens.
    special_tokens: Optional iterable of reserved tokens (for example a
      padding or unknown token). They receive the first ids, in the order
      given, ahead of the corpus tokens. With the default ``None`` the
      mapping contains corpus tokens only.

  Returns:
    dict mapping every distinct token to a consecutive id starting at 0.
    Corpus tokens are ordered with ``sorted``. An empty column yields an
    empty dict (or only the special tokens).
  """
  #dict.fromkeys drops repeated special tokens while keeping their order
  reserved = list(dict.fromkeys(special_tokens or ()))

  #a plain set avoids building one big fixed-width numpy string array and
  #also copes with an empty column, where np.hstack raises
  corpus_tokens = {token for tokens in df[col] for token in tokens}
  corpus_tokens.difference_update(reserved)

  ordered = reserved + sorted(corpus_tokens)
  vocab = {token: idx for idx, token in enumerate(ordered)}
  return vocab

#one vocabulary per language, built from the one-to-one sentence pairs
eng_vocab = get_vocab(df=df_eng_it_sing, col='eng_tokens')
it_vocab = get_vocab(df=df_eng_it_sing, col='it_tokens')

# #get vocabulary
# eng_token = np.unique(np.hstack(np.array(df_eng_it_sing['eng_tokens'])))
# it_token = np.unique(np.hstack(np.array(df_eng_it_sing['it_tokens'])))

# eng_vocab = {k: v for v, k in enumerate(eng_token)}
# it_vocab = {k: v for v, k in enumerate(it_token)}

#report how many distinct tokens each language has
eng_vocab_size = len(eng_vocab)
it_vocab_size = len(it_vocab)
print("\nEnglish vocabulary size: {}".format(eng_vocab_size))
print("Italian vocabulary size: {}".format(it_vocab_size))

#encode for vocab
def encode(tokens, vocab):
    """Translate a sequence of tokens into their ids.

    Args:
        tokens: Iterable of tokens.
        vocab: Mapping from token to id.

    Returns:
        List with one id per token, in the same order.

    Raises:
        KeyError: If a token is not a key of ``vocab``.
    """
    lookup = vocab.__getitem__
    return list(map(lookup, tokens))

#encode each token column with the vocabulary of its language
encoding_plan = (
    ('eng_tokens', 'eng_tokens_enc', eng_vocab),
    ('it_tokens', 'it_tokens_enc', it_vocab),
)
for tokens_col, encoded_col, lang_vocab in encoding_plan:
  df_eng_it_sing[encoded_col] = df_eng_it_sing[tokens_col].apply(encode, args=(lang_vocab,))

display(df_eng_it_sing.head())

Unnamed: 0,eng_id,eng_sentence,it_id,it_sentence,eng_tokens,it_tokens
0,1276,Let's try something.,565618.0,Proviamo qualcosa!,"[Let, ', s, try, something, .]","[Proviamo, qualcosa, !]"
30,1292,I don't know if I have the time.,383017.0,Non so se ho tempo.,"[I, don, ', t, know, if, I, have, the, time, .]","[Non, so, se, ho, tempo, .]"
39,1296,You are in my way.,568681.0,Sei dalla mia parte.,"[You, are, in, my, way, .]","[Sei, dalla, mia, parte, .]"
49,1301,It's because you don't want to be alone.,568702.0,È perché non vuoi essere solo.,"[It, ', s, because, you, don, ', t, want, to, ...","[È, perché, non, vuoi, essere, solo, .]"
53,1304,I'll do my best not to disturb your studying.,568710.0,Farò del mio meglio per non disturbare il tuo ...,"[I, ', ll, do, my, best, not, to, disturb, you...","[Farò, del, mio, meglio, per, non, disturbare,..."


English vocabulary size: 23654
Italian vocabulary size: 33418


In [None]:
#peek at the first encoded English sentences
#NOTE(review): this cell used df['src_tokenized'], but neither `df` nor that
#column is created anywhere in this notebook (NameError on a fresh kernel);
#the encoded English tokens are assumed to be what was meant - confirm
df_eng_it_sing['eng_tokens_enc'].values[:5]

In [None]:
#pad

In [None]:
#comprehensive function to get all data
def data_preprocessing(pytorch_tensor="pytorch", store=True):
  """Run the English-Italian preprocessing steps of this notebook in one call.

  Steps: load the sentence pairs, drop pairs with a missing sentence, name
  the columns, keep the one-to-one translations, tokenize both languages,
  build one vocabulary per language and encode the tokens as ids.

  Args:
    pytorch_tensor: Currently unused. NOTE(review): presumably meant to
      select the tensor framework for the padded output - confirm.
    store: Currently unused. NOTE(review): presumably meant to control
      whether the result is saved - confirm.

  Returns:
    Tuple ``(df_eng_it_sing, eng_vocab, it_vocab)``: the one-to-one sentence
    pairs with token and encoded-token columns, and the two token -> id
    dictionaries.
  """
  #english to italian translation
  df_eng_it = get_file_and_disp(fileName="Sentence pairs in English-Italian - 2024-07-26.tsv" ,sep='\t', stringName="English to Italian")

  #remove nulls
  df_eng_it = count_remove_null(df_eng_it, index=1, name="English")
  df_eng_it = count_remove_null(df_eng_it, index=3, name="Italian")

  #rename columns
  df_eng_it.columns = ['eng_id', 'eng_sentence', 'it_id', 'it_sentence']

  #get 1-1 translations
  df_eng_it_sing = one_to_one_translations(df_eng_it)

  #tokenize
  df_eng_it_sing['eng_tokens'] = df_eng_it_sing['eng_sentence'].apply(tokenize)
  df_eng_it_sing['it_tokens'] = df_eng_it_sing['it_sentence'].apply(tokenize)

  #get vocabulary as dictionary
  eng_vocab = get_vocab(df_eng_it_sing, 'eng_tokens')
  it_vocab = get_vocab(df_eng_it_sing, 'it_tokens')

  #encode for vocab
  df_eng_it_sing['eng_tokens_enc'] = df_eng_it_sing['eng_tokens'].apply(lambda x: encode(x, eng_vocab))
  df_eng_it_sing['it_tokens_enc'] = df_eng_it_sing['it_tokens'].apply(lambda x: encode(x, it_vocab))

  #TODO: padding, tensor conversion and storage are not implemented yet

  #hand the results back instead of discarding them
  return df_eng_it_sing, eng_vocab, it_vocab



Extra: Checking whether a sentence is an original or a translation of another sentence

In [91]:
# #get file
# df_sb = get_file_and_disp(fileName="sentences_base.csv",sep=None, stringName="Base Sentences")

# #remove nulls
# df_sb = count_remove_null(df_sb, index=1, name="Sentence Base")

# #rename columns
# df_sb.columns = ['id', 'base_sentence']

In [90]:
# #lets look at distribution of base sentences

# # zero: The sentence is original, not a translation of another.
# # greater than zero: The id of the sentence from which it was translated.
# # \N: Unknown (rare).

# #distribution of values
# df_sb_vc = df_sb['base_sentence'].value_counts().reset_index()
# df_sb_vc.head(10)

In [None]:
#merge with
