# Imports

In [4]:
%pip install datasets


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [5]:
from collections import Counter
from datasets import load_dataset

# Get Data

In [6]:
translation_dataset = load_dataset("Nicolas-BZRD/Parallel_Global_Voices_English_French", split="train").to_pandas()
translation_dataset.head()

Unnamed: 0,en,fr
0,Jamaica: “I am HIV”,Jamaïque : J’ai le VIH
1,"It's widely acknowledged, in the Caribbean and...","Il est largement reconnu, dans les Caraïbes et..."
2,"For this woman, however, photographed in the s...","Pour cette femme, cependant, photographiée dan..."
3,As Bacon writes on her blog:,Comme Bacon écrit sur son blog:
4,"“When I asked to take her picture, I suggested...",“Quand je lui ai demandé de la prendre en phot...


# Labels

In [7]:
row_labels = translation_dataset.index.values
print(row_labels)

[     0      1      2 ... 342057 342058 342059]


In [8]:
column_labels = translation_dataset.columns
print(column_labels)

Index(['en', 'fr'], dtype='object')


# Count the Data

Amount of examples

In [9]:
row_number = translation_dataset.index.values[-1] + 1
print(row_number)

342060


Total amount of words

In [10]:
english = translation_dataset.values[1][0]
print(english)
french = translation_dataset.values[1][1]
print(french)

It's widely acknowledged, in the Caribbean and elsewhere, that the fear of stigma and discrimination is a major factor preventing people with HIV/AIDS from seeking treatment or from admitting their HIV status publicly.
Il est largement reconnu, dans les Caraïbes et ailleurs, que la crainte de la honte et de la discrimination est un facteur important empêchant les personnes atteintes du VIH/SIDA de chercher du traitement ou d'admettre publiquement leur statut de VIH.


In [11]:
count_english = 0
count_french = 0
for row_label in row_labels:
    english = translation_dataset.values[row_label][0]
    french = translation_dataset.values[row_label][1]

    for e in english:
        if e == " ":
            count_english += 1
    for f in french:
        if f == " ":
            count_french += 1

In [12]:
print(count_english)
print(count_french)

6296101
7042278


Average amount of words per Line

In [13]:
print(count_english / row_number)
print(count_french / row_number)

18.406422849792435
20.58784423785301


# Frequent words

Get the data together

English

In [14]:
english_words = []
french_words = []

for row_label in row_labels:
    english_words.append(translation_dataset.values[row_label][0].split())
    french_words.append(translation_dataset.values[row_label][1].split())

In [15]:
# Get the set of unique words.
unique_english_words = []
for word in english_words:
  if word not in unique_english_words:
    unique_english_words.append(word)

In [17]:
# Make a list of (count, unique) tuples.
count_english_words = []
for unique in unique_english_words:
  count = 0              # Initialize the count to zero.
  for word in english_words:     # Iterate over the words.
    if word == unique:   # Is this word equal to the current unique?
      count += 1         # If so, increment the count
  count_english_words.append((count, unique))

In [18]:
count_english_words.sort()            # Sorting the list puts the lowest counts first.
count_english_words.reverse()         # Reverse it, putting the highest counts first.
# Print the ten words with the highest counts.

In [19]:
for i in range(min(10, len(count_english_words))):
  count, word = count_english_words[i]
  print('%s %d' % (word, count))

['Used', 'with', 'permission.'] 31
['This', 'post', 'is', 'part', 'of', 'our', 'International', 'Relations', '&', 'Security', 'coverage.'] 19
['This', 'post', 'and', 'its', 'translations', 'to', 'Spanish,', 'Arabic', 'and', 'French', 'were', 'commissioned', 'by', 'the', 'International', 'Security', 'Network', '(ISN)', 'as', 'part', 'of', 'a', 'partnership', 'to', 'seek', 'out', 'citizen', 'voices', 'on', 'international', 'relations', 'and', 'security', 'issues', 'worldwide.'] 19
['This', 'post', 'was', 'first', 'published', 'on', 'the', 'ISN', 'blog,', 'see', 'similar', 'stories', 'here.'] 17
['This', 'post', 'is', 'part', 'of', 'our', 'special', 'coverage', 'Tunisia', 'Revolution', '2011.'] 17
['This', 'post', 'is', 'part', 'of', 'our', 'special', 'coverage', 'Libya', 'Uprising', '2011.'] 17
['This', 'post', 'is', 'part', 'of', 'our', 'special', 'coverage', 'Syria', 'Protests', '2011.'] 16
['This', 'post', 'is', 'part', 'of', 'our', 'special', 'coverage', 'of', 'Bahrain', 'Protests', 

French


In [20]:
# Get the set of unique words.
unique_french_words = []
for word in french_words:
  if word not in unique_french_words:
    unique_french_words.append(word)

In [21]:
# Make a list of (count, unique) tuples.
count_french_words = []
for unique in unique_french_words:
  count = 0              # Initialize the count to zero.
  for word in french_words:     # Iterate over the words.
    if word == unique:   # Is this word equal to the current unique?
      count += 1         # If so, increment the count
  count_french_words.append((count, unique))

In [22]:
count_french_words.sort()            # Sorting the list puts the lowest counts first.
count_french_words.reverse()         # Reverse it, putting the highest counts first.

In [23]:
# Print the ten words with the highest counts.
for i in range(min(10, len(count_french_words))):
  count, word = count_french_words[i]
  print('%s %d' % (word, count))

['Ce', 'billet', 'fait', 'partie', 'du', 'dossier', 'de', 'Global', 'Voices', 'sur', "l'Europe", 'en', 'crise.'] 8
["Qu'en", 'pensez-vous', '?'] 6
['Mais', 'ce', "n'est", 'pas', 'tout.'] 6
['Et', "d'ajouter", ':'] 6
['Voici', 'quelques', 'réactions', 'sur', 'Twitter', ':'] 5
['En', 'voici', 'quelques', 'unes', ':'] 5
['[Tous', 'les', 'liens', 'sont', 'en', 'espagnol,', 'sauf', 'mention', 'contraire]'] 4
['[Les', 'liens', 'de', 'ce', 'billet', 'renvoient', 'vers', 'des', 'pages', 'web', 'en', 'espagnol.]'] 4
['Photo', 'utilisée', 'avec', 'permission.'] 4
['Le', 'blogueur', 'poursuit', ':'] 4
