<a href="https://colab.research.google.com/github/smargetic/Natural_Language_Processing/blob/main/Machine_Translation/Data_Intake_and_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#all translations have been obtained at tatoeba.org on July 25th, 2024

In [7]:
!pip install gensim



In [9]:
#data storage
import pandas as pd
import numpy as np

#show all rows and columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#visualize
from IPython.display import display

#pytorch
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset

#tensorflow
import tensorflow as tf

In [5]:
#import files
def import_file(fileName, sep=None):
  df = pd.read_csv(fileName, sep=sep).T.reset_index().T.reset_index(drop=True)
  # df.columns = df.columns.astype(str)
  return df

def get_file_and_disp(fileName, sep=None, stringName=""):
  print("\n" + stringName + ":")
  df = import_file(fileName, sep=sep)
  display(df.head())
  return df


df_it_CC0 = get_file_and_disp(fileName="ita_sentences_CC0.tsv" ,sep='\t', stringName="Italian Sentences CC0")
df_eng_CC0 = get_file_and_disp(fileName="ita_sentences_CC0.tsv" ,sep='\t', stringName="English Sentences CC0")

df_it = get_file_and_disp(fileName="ita_sentences.tsv" ,sep='\t', stringName="Italian Sentences")
df_eng = get_file_and_disp(fileName="eng_sentences.tsv" ,sep='\t', stringName="English Sentences")

df_it_det = get_file_and_disp(fileName="ita_sentences_detailed.tsv" ,sep='\t', stringName="Italian Sentences Detailed")
df_eng_det = get_file_and_disp(fileName="eng_sentences_detailed.tsv" ,sep='\t', stringName="English Sentences Detailed")

#helps to see if a sentence is the original or translation of another
df_sb = get_file_and_disp(fileName="sentences_base.csv",sep=None, stringName="Base Sentences")



Italian Sentences CC0:


Unnamed: 0,0,1,2,3
0,8062028,ita,La litigata tra Tom e Mary è finita a taralluc...,2019-07-19 15:28:26
1,8080974,ita,Tu lo stai prendendo troppo seriamente.,2019-07-28 18:00:22
2,8203018,ita,Scrivimelo in una frase.,2021-06-07 17:19:34
3,8306339,ita,Non mi fare la supercazzola!,2019-11-06 14:08:04
4,8306410,ita,Ieri ho parlato con il mio meccanico a proposi...,2019-11-06 14:39:02



English Sentences CC0:


Unnamed: 0,0,1,2,3
0,8062028,ita,La litigata tra Tom e Mary è finita a taralluc...,2019-07-19 15:28:26
1,8080974,ita,Tu lo stai prendendo troppo seriamente.,2019-07-28 18:00:22
2,8203018,ita,Scrivimelo in una frase.,2021-06-07 17:19:34
3,8306339,ita,Non mi fare la supercazzola!,2019-11-06 14:08:04
4,8306410,ita,Ieri ho parlato con il mio meccanico a proposi...,2019-11-06 14:39:02



Italian Sentences:


FileNotFoundError: [Errno 2] No such file or directory: 'ita_sentences.tsv'

In [None]:
#lets look at distribution of base sentences

# zero: The sentence is original, not a translation of another.
# greater than zero: The id of the sentence from which it was translated.
# \N: Unknown (rare).

#distribution of values
df_sb_vc = df_sb[1].value_counts().reset_index()
df_sb_vc.head(10)

In [13]:
#double check that there are no null versions for these sentences
null_list =[(df[2].isnull().sum()+ df[2].eq("").sum())for df in [df_it_CC0, df_eng_CC0, df_it, df_eng, df_it_det, df_eng_det]]
null_list.append((df_sb[1].isnull().sum()+df_sb[1].eq("").sum()))

print('Total nulls in all dataframes:')
print(null_list)

NameError: name 'df_it' is not defined

In [None]:
#rename columns for merging

def rename_cols(df, it_eng="it",name=""):
  lang_string = it_eng+"_sentence"
  df = df.rename(columns={0:'id', 2:lang_string})
  print('\n'+name+':')
  display(df.head())
  return df


df_it_CC0 = rename_cols(df_it_CC0, 'it', "Italian Sentences CC0")
df_eng_CC0 = rename_cols(df_eng_CC0, 'eng', "English Sentences CC0")

df_it = rename_cols(df_it, 'it', "Italian Sentences")
df_eng = rename_cols(df_eng, 'eng', "English Sentences")

df_it_det = rename_cols(df_it_det, 'it', "Italian Sentences Detailed")
df_eng_det = rename_cols(df_eng_det, 'eng', "English Sentences Detailed")

df_sb = df_sb.rename(columns={0:"id"})

In [None]:
#merge english and italian sentences
def merge_sentence(df1, df2, name=""):
  df = df1.copy().merge(df2.copy(),on="id",how='inner')

  print("\n"+name+":")
  print('Original Italian Length {}'.format(len(df1)))
  #for my own peeksie
  print('compare this to: {}'.format(df.shape[1]))

  print('Original English Length {}'.format(len(df2)))
  print('Merged Length {}'.format(len(df)))
  return df

df_it_eng_CC0 = merge_sentence(df_it_CC0, df_eng_CC0, "Italian to English CC)")
df_it_eng = merge_sentence(df_it, df_eng, "Italian to English")
df_it_eng_det = merge_sentence(df_it_det, df_eng_det, "Italian to English Detailed")

In [None]:
#want english to be original and italian to be translation (from base senteces) (handles 1-many possible error)

In [11]:
#tokenize

In [None]:
#pad

In [None]:
#comprehensive function to get all data
def data_preprocessing(pytorch_tensor="pytorch", store=True):

  ###import data
  df_it_CC0 = get_file_and_disp(fileName="ita_sentences_CC0.tsv" ,sep='\t', stringName="Italian Sentences CC0")
  df_eng_CC0 = get_file_and_disp(fileName="ita_sentences_CC0.tsv" ,sep='\t', stringName="English Sentences CC0")

  df_it = get_file_and_disp(fileName="ita_sentences.tsv" ,sep='\t', stringName="Italian Sentences")
  df_eng = get_file_and_disp(fileName="eng_sentences.tsv" ,sep='\t', stringName="English Sentences")

  df_it_det = get_file_and_disp(fileName="ita_sentences_detailed.tsv" ,sep='\t', stringName="Italian Sentences Detailed")
  df_eng_det = get_file_and_disp(fileName="eng_sentences_detailed.tsv" ,sep='\t', stringName="English Sentences Detailed")

  #helps to see if a sentence is the original or translation of another
  df_sb = get_file_and_disp(fileName="sentences_base.csv",sep=None, stringName="Base Sentences")

  ###lets look at distribution of base sentences
  # zero: The sentence is original, not a translation of another.
  # greater than zero: The id of the sentence from which it was translated.
  # \N: Unknown (rare).

  print('\nDistribution of Base Sentences:')
  df_sb_vc = df_sb[1].value_counts().reset_index()
  display(df_sb_vc.head(10))


  ##rename columns for merging
  df_it_CC0 = rename_cols(df_it_CC0, 'it', "Italian Sentences CC0")
  df_eng_CC0 = rename_cols(df_eng_CC0, 'eng', "English Sentences CC0")

  df_it = rename_cols(df_it, 'it', "Italian Sentences")
  df_eng = rename_cols(df_eng, 'eng', "English Sentences")

  df_it_det = rename_cols(df_it_det, 'it', "Italian Sentences Detailed")
  df_eng_det = rename_cols(df_eng_det, 'eng', "English Sentences Detailed")

  df_sb = df_sb.rename(columns={0:"id"})

  ###merge sentences
  df_it_eng_CC0 = merge_sentence(df_it_CC0, df_eng_CC0, "Italian to English CC)")
  df_it_eng = merge_sentence(df_it, df_eng, "Italian to English")
  df_it_eng_det = merge_sentence(df_it_det, df_eng_det, "Italian to English Detailed")

  ###only get data where english is source and italian is translation (handles 1-many possible error)




