<a href="https://colab.research.google.com/github/smargetic/Natural_Language_Processing/blob/main/Machine_Translation/Data_Intake_and_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#all translations have been obtained at tatoeba.org on July 26th, 2024

In [2]:
#data storage
import pandas as pd
import numpy as np

#show all rows and columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#visualize
from IPython.display import display

#pytorch
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence

#tensorflow
import tensorflow as tf
#from tensorflow.keras.preprocessing.sequence import pad_sequence as tf_pad_sequence

#word/sentence processing
import re

#store data
import pickle

In [3]:
#import files
def import_file(fileName, sep=None):
  """Read a header-less delimited text file into a DataFrame.

  The first line of the file is treated as data (``header=None``) instead of
  being parsed as column names and pushed back into the frame afterwards.
  Parsing it as a header lets pandas rewrite it (repeated values get a
  ``.1`` suffix, blanks become ``Unnamed: n``) and the transpose round trip
  turns every column into object dtype.

  Args:
    fileName: path or file-like object handed to ``pd.read_csv``.
    sep: field delimiter handed to ``pd.read_csv``; ``None`` lets pandas
      sniff it.

  Returns:
    DataFrame with integer column labels 0..n-1 and a 0..m-1 row index.
    Malformed lines are reported as warnings and skipped
    (``on_bad_lines='warn'``).
  """
  return pd.read_csv(fileName, sep=sep, header=None, on_bad_lines='warn')

#display file with name
def get_file_and_disp(fileName, sep=None, stringName=""):
  """Load a file with ``import_file`` and show a labelled preview.

  Prints ``stringName`` followed by a colon on its own line, loads the file,
  renders the first rows with ``display`` and returns the full frame.

  Args:
    fileName: path forwarded to ``import_file``.
    sep: delimiter forwarded to ``import_file``.
    stringName: label printed above the preview.

  Returns:
    The DataFrame produced by ``import_file``.
  """
  heading = "\n" + stringName + ":"
  print(heading)

  loaded = import_file(fileName, sep=sep)
  preview = loaded.head()
  display(preview)

  return loaded

# #english to italian translation
# df_eng_it = get_file_and_disp(fileName="Sentence pairs in English-Italian - 2024-07-26.tsv" ,sep='\t', stringName="English to Italian")


In [4]:
#double check that there are no null versions for these sentences
def count_remove_null(df, index=1, name=""):
  """Report and drop rows whose value in one column is missing or empty.

  A row counts as null when the cell is NaN/None or the empty string. The
  count is printed as ``Nulls in <name>: <count>``.

  The input frame is never modified: when rows have to be removed a new,
  re-indexed frame is returned; when nothing has to be removed the input
  frame itself is returned.

  Args:
    df: frame to check.
    index: label of the column to inspect.
    name: human-readable column name used in the printed message.

  Returns:
    DataFrame without the null rows, with a fresh 0..n-1 index when any
    rows were dropped.
  """
  #a cell cannot be both NaN and "", so the mask total equals the two counts added
  is_missing = df[index].isnull() | df[index].eq("")
  nulls = int(is_missing.sum())
  print("Nulls in " + name + ": {}".format(nulls))

  #remove nulls without touching the caller's frame
  if nulls > 0:
    df = df.loc[~is_missing].reset_index(drop=True)

  return df


# df_eng_it = count_remove_null(df_eng_it, index=1, name="English")
# df_eng_it = count_remove_null(df_eng_it, index=3, name="Italian")

In [5]:
# #rename columns
# df_eng_it.columns = ['eng_id', 'eng_sentence', 'it_id', 'it_sentence']
# df_eng_it.head()

In [6]:
#separates 1-1 translations
def one_to_one_translations(df):
  """Keep only the sentence pairs that are strictly one-to-one.

  A pair is kept when its English sentence appears exactly once in
  ``df['eng_sentence']`` AND its Italian sentence appears exactly once in
  ``df['it_sentence']``. Both checks run against the full input frame;
  checking Italian only after the English filter would keep a pair whose
  Italian sentence also translates an English sentence that was already
  removed.

  Prints the row count before and after filtering.

  Args:
    df: frame with ``eng_sentence`` and ``it_sentence`` columns.

  Returns:
    A new, re-indexed DataFrame with the one-to-one pairs; ``df`` is left
    untouched.
  """
  eng_repeated = df.duplicated(subset='eng_sentence', keep=False)
  it_repeated = df.duplicated(subset='it_sentence', keep=False)

  #reset_index returns a fresh frame, so adding columns to it later is safe
  df_sing = df.loc[~(eng_repeated | it_repeated)].reset_index(drop=True)

  print('\nOrig size: {}'.format(df.shape[0]))
  print('Singular size: {}'.format(df_sing.shape[0]))

  return df_sing

# df_eng_it_sing = one_to_one_translations(df_eng_it)

In [7]:
##tokenize

#get tokens
def tokenize(sentence):
    """Split a sentence into word tokens and single-character symbol tokens.

    Runs of word characters become one token each; every character that is
    neither a word character nor whitespace becomes its own token.
    Whitespace is discarded.
    """
    token_pattern = r'\b\w+\b|[^\w\s]'
    return [match.group(0) for match in re.finditer(token_pattern, sentence)]

#get vocabulary as dictionary - with values as indices
def get_vocab(df, col):
  """Build a token -> integer id mapping from a column of token lists.

  Ids start at 1 and follow the sorted order of the distinct tokens; 0 is
  left free for padding. Keys are plain Python ``str`` objects, and an
  empty column yields an empty dict.

  Args:
    df: frame holding the token lists.
    col: name of the column whose cells are iterables of tokens.

  Returns:
    dict mapping each distinct token to its id.
  """
  unique_tokens = sorted({token for tokens in df[col] for token in tokens})
  vocab = {token: idx for idx, token in enumerate(unique_tokens, start=1)} #0 will be padding
  return vocab

#encode for vocab
def encode_vocab(tokens, vocab):
    """Translate a sequence of tokens into their vocabulary ids.

    Each token is looked up with ``vocab[token]``, so a token missing from
    ``vocab`` raises ``KeyError``.
    """
    encoded = []
    for token in tokens:
        encoded.append(vocab[token])
    return encoded

#full tokenization - returns modified pandas df, and vocabs
def tokenize_full(df, name=""):
  """Tokenize both sentence columns, build vocabularies and encode tokens.

  Adds four columns to ``df``: ``eng_tokens`` / ``it_tokens`` (token lists)
  and ``eng_tokens_enc`` / ``it_tokens_enc`` (lists of vocabulary ids).
  Prints ``name`` and the two vocabulary sizes.

  Args:
    df: frame with ``eng_sentence`` and ``it_sentence`` columns.
    name: label printed before the vocabulary sizes.

  Returns:
    Tuple ``(df, eng_vocab, it_vocab)``.
  """
  print('\n'+name+":")

  #split words into tokens
  eng_tokens = df['eng_sentence'].apply(tokenize)
  df['eng_tokens'] = eng_tokens
  it_tokens = df['it_sentence'].apply(tokenize)
  df['it_tokens'] = it_tokens

  #get vocab
  eng_vocab = get_vocab(df, 'eng_tokens')
  it_vocab = get_vocab(df, 'it_tokens')

  eng_size = len(eng_vocab)
  it_size = len(it_vocab)
  print("\tEnglish vocabulary size: {}".format(eng_size))
  print("\tItalian vocabulary size: {}".format(it_size))

  #encode for vocab
  df['eng_tokens_enc'] = df['eng_tokens'].apply(encode_vocab, args=(eng_vocab,))
  df['it_tokens_enc'] = df['it_tokens'].apply(encode_vocab, args=(it_vocab,))

  return df, eng_vocab, it_vocab

# #tokenize
# df_eng_it, eng_vocab, it_vocab = tokenize_full(df_eng_it, name="English/Italian Translations")
# df_eng_it_sing, eng_vocab_sing, it_vocab_sing = tokenize_full(df_eng_it_sing, name="English/Italian One-One Translations")


In [8]:
### pytorch ###

#turn column of lists to column of pytorch tensors
def turn_list_col_pytorch(df, col):
  """Add a ``<col>_p`` column holding each row of ``col`` as a torch tensor.

  Args:
    df: frame to extend (modified in place and returned).
    col: name of a column whose cells can be passed to ``torch.tensor``.

  Returns:
    The same frame with the new column added.
  """
  df[col + '_p'] = df[col].apply(torch.tensor)
  return df

#add padding
def pytorch_pad(df, col):
  """Right-pad a column of 1-D tensors with zeros to a common length.

  ``pad_sequence`` is documented to take a list of tensors, so the column
  is converted to an explicit list instead of relying on how a pandas
  Series happens to be coerced.

  Args:
    df: frame holding the tensors (modified in place and returned).
    col: name of the column of tensors; it is overwritten with the padded
      rows.

  Returns:
    Tuple ``(df, vals)`` where ``vals`` is the batch-first padded tensor
    (one row per frame row) and ``df[col]`` holds its individual rows.
  """
  sequences = df[col].tolist()
  vals = pad_sequence(sequences, batch_first=True, padding_value=0)

  #one padded row per frame row, assigned by position
  df[col] = list(vals)

  return df, vals

#separate data into batches
def pytorch_batch(eng_tens, it_tens, batch_size=64):
  """Pair two tensors row by row and wrap them in a shuffling DataLoader.

  Args:
    eng_tens: tensor of English rows.
    it_tens: tensor of Italian rows; paired with ``eng_tens`` through a
      ``TensorDataset``.
    batch_size: number of pairs per batch.

  Returns:
    ``DataLoader`` over the paired dataset with ``shuffle=True``.
  """
  return DataLoader(
      TensorDataset(eng_tens, it_tens),
      batch_size=batch_size,
      shuffle=True,
  )

#all processing to turn list columns to pytorch objects
def pytorch_preproc(df):
  """Run the full PyTorch preparation on the encoded token columns.

  Converts ``eng_tokens_enc`` and ``it_tokens_enc`` to tensor columns
  (``*_p``), pads both, and batches the padded tensors.

  Args:
    df: frame with ``eng_tokens_enc`` and ``it_tokens_enc`` columns.

  Returns:
    Tuple ``(df, dataloader)``.
  """
  #turn column of lists to column of pytorch tensors
  for encoded_col in ('eng_tokens_enc', 'it_tokens_enc'):
    df = turn_list_col_pytorch(df, encoded_col)

  #add padding
  padded = []
  for tensor_col in ('eng_tokens_enc_p', 'it_tokens_enc_p'):
    df, stacked = pytorch_pad(df, tensor_col)
    padded.append(stacked)
  eng_tens, it_tens = padded

  #get batches
  return df, pytorch_batch(eng_tens, it_tens)


# # df_eng_it, eng_tens, it_tens, dataloader = pytorch_preproc(df_eng_it)
# df_eng_it, dataloader_p = pytorch_preproc(df_eng_it)
# df_eng_it.head()

In [9]:
### tensorflow ###

#add padding
def tensorflow_pad(df, col):
  """Add a ``<col>_t`` column with the rows of ``col`` padded at the end.

  Padding is done by ``tf.keras.preprocessing.sequence.pad_sequences`` with
  ``padding='post'``; each row of its result becomes one cell of the new
  column.

  Args:
    df: frame to extend (modified in place and returned).
    col: name of the column of integer sequences.

  Returns:
    The same frame with the new column added.
  """
  sequences = df[col].tolist()
  padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')
  df[col + '_t'] = list(padded)
  return df

#turn list column to list of tensorflow objects
def turn_list_col_tensorflow(df, col1, col2):
  """Build a ``tf.data.Dataset`` of (col1 row, col2 row) pairs.

  Each column holds one sequence per cell, which pandas stores as an
  object-dtype Series. The rows are stacked into a regular 2-D array
  first, because ``from_tensor_slices`` needs something it can convert to
  a single tensor.

  Args:
    df: frame holding both columns.
    col1: column of equal-length sequences used as the first element.
    col2: column of equal-length sequences used as the second element.

  Returns:
    Dataset yielding one ``(col1_row, col2_row)`` pair per frame row.

  Raises:
    ValueError: from ``np.stack`` when a column is empty or its rows do not
      all have the same length (pad the column first).
  """
  first = np.stack(df[col1].tolist())
  second = np.stack(df[col2].tolist())
  dataset = tf.data.Dataset.from_tensor_slices((first, second))
  return dataset

#padding and batch
def tensorflow_pad_batch(dataset, batch_size=64):
  """Group a two-component dataset into zero-padded batches.

  Both components are padded along their single dimension (shape
  ``[None]``) with the int32 value 0.

  Args:
    dataset: dataset whose elements are pairs of 1-D tensors.
    batch_size: number of elements per batch.

  Returns:
    The batched dataset produced by ``padded_batch``.
  """
  pad_value = tf.constant(0, dtype=tf.int32)
  return dataset.padded_batch(
      batch_size,
      padded_shapes=([None], [None]),
      padding_values=(pad_value, pad_value),
  )

#all processing to turn list columns to tensorflow objects
def tensorflow_preproc(df):
  """Run the full TensorFlow preparation on the encoded token columns.

  Pads ``eng_tokens_enc`` and ``it_tokens_enc`` into ``*_t`` columns, turns
  the padded columns into a dataset of pairs, and batches it.

  Args:
    df: frame with ``eng_tokens_enc`` and ``it_tokens_enc`` columns.

  Returns:
    Tuple ``(df, dataset)``.
  """
  #pad both encoded columns into new *_t columns
  for encoded_col in ('eng_tokens_enc', 'it_tokens_enc'):
    df = tensorflow_pad(df, encoded_col)

  #pair the padded columns, then pad and batch
  pairs = turn_list_col_tensorflow(df, 'eng_tokens_enc_t', 'it_tokens_enc_t')
  batched = tensorflow_pad_batch(pairs)

  return df, batched

# df_eng_it, dataloader_t = tensorflow_preproc(df_eng_it)
# df_eng_it.head()

In [10]:
# # Example usage
# for input_batch, target_batch in dataset:
#     print("Input batch:", input_batch)
#     print("Target batch:", target_batch)
#     break

In [11]:
#comprehensive function to get all data
def data_preprocessing(pytorchB=True, tensorflowB=True, store=True,
                       fileName="Sentence pairs in English-Italian - 2024-07-26.tsv",
                       storeName='nn_data_dict.pkl'):
  """Run the whole English-Italian pipeline and collect the results.

  Steps: load the tab-separated sentence pairs, drop rows with a missing
  English or Italian sentence, name the columns, derive the one-to-one
  subset, tokenize/encode both frames, and optionally build PyTorch and
  TensorFlow batches for each.

  Args:
    pytorchB: build the PyTorch dataloaders when True.
    tensorflowB: build the TensorFlow datasets when True.
    store: pickle the results to ``storeName`` when True.
    fileName: path of the tab-separated sentence-pair file.
    storeName: path of the pickle file written when ``store`` is True.

  Returns:
    dict with the keys ``df_eng_it``, ``df_eng_it_sing``, ``dataloader_p``,
    ``dataloader_t``, ``dataloader_p_sing``, ``dataloader_t_sing``,
    ``eng_vocab``, ``it_vocab``, ``eng_vocab_sing`` and ``it_vocab_sing``.
    Loader entries are ``None`` for a framework that was switched off.

  Raises:
    FileNotFoundError: when ``fileName`` does not exist.
  """
  #english to italian translation
  df_eng_it = get_file_and_disp(fileName=fileName, sep='\t', stringName="English to Italian")

  #remove nulls
  print('\n')
  df_eng_it = count_remove_null(df_eng_it, index=1, name="English")
  df_eng_it = count_remove_null(df_eng_it, index=3, name="Italian")

  #rename columns
  df_eng_it.columns = ['eng_id', 'eng_sentence', 'it_id', 'it_sentence']

  #get 1-1 translations
  df_eng_it_sing = one_to_one_translations(df_eng_it)

  #tokenize
  df_eng_it, eng_vocab, it_vocab = tokenize_full(df_eng_it, name="English/Italian Translations")
  df_eng_it_sing, eng_vocab_sing, it_vocab_sing = tokenize_full(df_eng_it_sing, name="English/Italian One-One Translations")

  #turn into pytorch
  dataloader_p, dataloader_p_sing = None, None
  if(pytorchB):
    df_eng_it, dataloader_p = pytorch_preproc(df_eng_it)
    df_eng_it_sing, dataloader_p_sing = pytorch_preproc(df_eng_it_sing)

  #turn into tensorflow
  dataloader_t, dataloader_t_sing = None, None
  if(tensorflowB):
    df_eng_it, dataloader_t = tensorflow_preproc(df_eng_it)
    df_eng_it_sing, dataloader_t_sing = tensorflow_preproc(df_eng_it_sing)

  #make dictionary of values
  data_dict = {"df_eng_it": df_eng_it, "df_eng_it_sing": df_eng_it_sing,
               "dataloader_p": dataloader_p, "dataloader_t": dataloader_t,
               "dataloader_p_sing": dataloader_p_sing, "dataloader_t_sing": dataloader_t_sing,
               "eng_vocab": eng_vocab, "it_vocab": it_vocab,
               "eng_vocab_sing": eng_vocab_sing, "it_vocab_sing": it_vocab_sing}

  #store dictionary values
  if(store):
    #tf.data.Dataset objects cannot be pickled, so the stored copy keeps None
    #under their keys; they can be rebuilt from the stored frames with
    #tensorflow_preproc. The returned dict still carries the live datasets.
    tf_only_keys = ("dataloader_t", "dataloader_t_sing")
    storable = {key: (None if key in tf_only_keys else value)
                for key, value in data_dict.items()}

    with open(storeName, 'wb') as fp:
      pickle.dump(storable, fp)
    #report only after the file has been closed
    print('dictionary saved successfully to file')

  return data_dict

#bind the result so the cell does not echo the whole dictionary (full DataFrames) as its output
data_dict = data_preprocessing()



English to Italian:


FileNotFoundError: [Errno 2] No such file or directory: 'Sentence pairs in English-Italian - 2024-07-26.tsv'