Generative AI (GPT-5 and Gemini) was used as a supporting tool in this project. The project was created solely by Roberto Punzano, without help from any other person.

## 0. Importing Libraries

In [4]:
# Download the small Spanish spaCy pipeline; tokenize_text loads it for language="es".
# NOTE(review): tokenize_text also loads en_core_web_sm, which is not downloaded here --
# presumably it is already installed in this runtime; confirm.
!python -m spacy download es_core_news_sm

Collecting es-core-news-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl (12.9 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
from collections import Counter #for counting word frequencies

import spacy #for tokenization and POS Tagging
from scipy.stats import spearmanr #scipy used to compute Spearman Correlation

## 1. Preparing the Data and Tokenizing

In [6]:
# Opening the .txt for the Spanish Song
# NOTE(review): the absolute /content/ path looks like a Colab upload location -- the file
# must already exist there when this cell runs; confirm before running elsewhere.
with open("/content/gasolina.txt", encoding="utf-8") as f:
    spanish_text = f.read()   #whole file read into a single string, decoded as UTF-8

In [7]:
# Opening the .txt for the English Song
# NOTE(review): the absolute /content/ path looks like a Colab upload location -- the file
# must already exist there when this cell runs; confirm before running elsewhere.
with open("/content/its_too_late.txt", encoding="utf-8") as f:
    english_text = f.read()   #whole file read into a single string, decoded as UTF-8

In [8]:
# Tokenizing the Text by Words with SpaCy
def tokenize_text(text, language="en"):
    """Tokenize ``text`` with spaCy and return its alphabetic tokens in lower case.

    Parameters
    ----------
    text : str
        Raw text to tokenize.
    language : str, default "en"
        Language code selecting the spaCy pipeline: "en" loads
        ``en_core_web_sm`` and "es" loads ``es_core_news_sm``.

    Returns
    -------
    list of str
        Lower-cased text of every token whose ``is_alpha`` flag is true,
        in document order.

    Raises
    ------
    ValueError
        If ``language`` is neither "en" nor "es".
    """
    model_names = {"es": "es_core_news_sm", "en": "en_core_web_sm"}

    # Validate before loading anything: an unsupported code used to fall
    # through both branches and crash later with UnboundLocalError on `nlp`.
    if language not in model_names:
        supported = ", ".join(sorted(model_names))
        raise ValueError(
            f"Unsupported language {language!r}; expected one of: {supported}"
        )

    nlp = spacy.load(model_names[language])
    doc = nlp(text)   #tokenizing the text

    # Keep only purely alphabetic tokens (drops punctuation and numbers).
    return [token.text.lower() for token in doc if token.is_alpha]

In [9]:
# Applying SpaCy Tokenization Function
# Each result is the list of lower-cased alphabetic tokens returned by tokenize_text.
english_tokenized = tokenize_text(english_text, language="en")
spanish_tokenized = tokenize_text(spanish_text, language="es")

## 2. Creating a Dictionary with Words and their Frequency

In [10]:
# Mapping the Words with their Frequencies

def getting_frequencies(tokenized_text):
    """Count how many times each word occurs.

    Parameters
    ----------
    tokenized_text : iterable of str
        The words to count (e.g. the list returned by ``tokenize_text``).

    Returns
    -------
    dict
        Maps each distinct word to its number of occurrences. Keys appear
        in the order the words were first seen; empty input gives ``{}``.
    """
    # Counter does the tallying and preserves first-seen key order;
    # converting back to dict keeps the return type a plain dict.
    return dict(Counter(tokenized_text))

In [11]:
# Applying Frequencies Function
# Each result maps a word to the number of times it occurs in the tokenized song.
english_frequencies = getting_frequencies(english_tokenized)
spanish_frequencies = getting_frequencies(spanish_tokenized)

## 3. Adding Length

In [12]:
# We add Length with another dictionary

def freq_and_length_dict(frequencies):
    """Attach each word's character length to its frequency.

    Parameters
    ----------
    frequencies : dict
        Maps word -> frequency.

    Returns
    -------
    dict
        Maps word -> ``(frequency, length)``, where ``length`` is
        ``len(word)``. Key order follows ``frequencies``.
    """
    return {
        token: (count, len(token))
        for token, count in frequencies.items()
    }

In [13]:
# Applying Length Function
# Each result maps a word to a (frequency, length) tuple.
english_data = freq_and_length_dict(english_frequencies)
spanish_data = freq_and_length_dict(spanish_frequencies)

## 4. Computing Spearman Correlation to Test Zipf's Law of Abbreviation

In [14]:
# Converting the Tuple into 2 Lists (needed for SciPy)

def tuple_into_lists(data_dict):
    """Split the ``(frequency, length)`` values of a dict into two parallel lists.

    Parameters
    ----------
    data_dict : dict
        Maps word -> ``(frequency, length)``.

    Returns
    -------
    tuple of (list, list)
        ``(frequencies, lengths)``, both in the dict's iteration order, so
        position ``i`` in each list refers to the same word.
    """
    pairs = list(data_dict.values())
    frequencies = [freq for freq, _ in pairs]
    lengths = [length for _, length in pairs]
    return frequencies, lengths

In [15]:
# Applying Tuple to List Function
# Each result is a (frequencies, lengths) pair of parallel lists.
english_lists = tuple_into_lists(english_data)
spanish_lists = tuple_into_lists(spanish_data)

In [16]:
# We compute Spearman Correlation with SciPy
def spearman_correlation(frequencies, lengths):
    """Run SciPy's Spearman rank correlation on frequencies versus lengths.

    Parameters
    ----------
    frequencies : sequence of numbers
        Word frequencies.
    lengths : sequence of numbers
        Word lengths, aligned position by position with ``frequencies``.

    Returns
    -------
    tuple
        ``(rho, p_value)`` as produced by ``scipy.stats.spearmanr``.
    """
    result = spearmanr(frequencies, lengths)
    # The result unpacks as (correlation, p-value); hand both back as a plain tuple.
    return result[0], result[1]

In [17]:
# Computing Rho and P-value
# Each *_lists variable is the (frequencies, lengths) tuple returned by
# tuple_into_lists. The * operator unpacks that tuple into the two positional
# arguments that spearman_correlation expects.
english_rho, english_p_value = spearman_correlation(*english_lists)
spanish_rho, spanish_p_value = spearman_correlation(*spanish_lists)

In [18]:
# ENGLISH RESULTS
# rho is the Spearman correlation between word frequency and word length;
# a negative value means more frequent words tend to be shorter.
print("ENGLISH RESULTS:")
print("Spearman rho:", english_rho)
print("p-value:", english_p_value)

ENGLISH RESULTS:
Spearman rho: -0.35618333962927606
p-value: 0.0011839854793753543


In [19]:
# SPANISH RESULTS
# rho is the Spearman correlation between word frequency and word length;
# a negative value means more frequent words tend to be shorter.
print("SPANISH RESULTS:")
print("Spearman rho:", spanish_rho)
print("p-value:", spanish_p_value)

SPANISH RESULTS:
Spearman rho: -0.506177254088844
p-value: 6.7678569425985604e-09


## 5. Qualitative Analysis

In [20]:
# Display the English word -> (frequency, length) mapping for manual inspection
english_data

{'stayed': (1, 6),
 'in': (1, 2),
 'bed': (1, 3),
 'all': (1, 3),
 'morning': (1, 7),
 'just': (5, 4),
 'to': (4, 2),
 'pass': (1, 4),
 'the': (1, 3),
 'time': (1, 4),
 'there': (3, 5),
 'something': (2, 9),
 'wrong': (1, 5),
 'here': (2, 4),
 'can': (1, 3),
 'be': (3, 2),
 'no': (4, 2),
 'denying': (1, 7),
 'one': (1, 3),
 'of': (1, 2),
 'us': (1, 2),
 'is': (1, 2),
 'changing': (1, 8),
 'or': (1, 2),
 'maybe': (1, 5),
 'we': (4, 2),
 'stopped': (1, 7),
 'trying': (1, 6),
 'and': (8, 3),
 'it': (8, 2),
 'too': (6, 3),
 'late': (5, 4),
 'baby': (2, 4),
 'now': (3, 3),
 'though': (1, 6),
 'really': (1, 6),
 'did': (1, 3),
 'try': (1, 3),
 'make': (1, 4),
 'inside': (1, 6),
 'has': (1, 3),
 'died': (1, 4),
 'i': (6, 1),
 'ca': (3, 2),
 'hide': (1, 4),
 'fake': (1, 4),
 'oh': (1, 2),
 'used': (1, 4),
 'so': (2, 2),
 'easy': (1, 4),
 'living': (1, 6),
 'with': (1, 4),
 'you': (6, 3),
 'were': (1, 4),
 'light': (1, 5),
 'breezy': (1, 6),
 'knew': (1, 4),
 'what': (2, 4),
 'do': (2, 2),
 'lo

In [21]:
# Display the Spanish word -> (frequency, length) mapping for manual inspection
spanish_data

{'zúmbale': (1, 7),
 'mambo': (1, 5),
 'para': (9, 4),
 'que': (13, 3),
 'mis': (1, 3),
 'gatas': (2, 5),
 'prendan': (1, 7),
 'los': (6, 3),
 'motores': (2, 7),
 'se': (5, 2),
 'preparen': (1, 8),
 'lo': (3, 2),
 'viene': (1, 5),
 'es': (2, 2),
 'le': (5, 2),
 'den': (1, 3),
 'duro': (4, 4),
 'mamita': (1, 6),
 'yo': (2, 2),
 'sé': (1, 2),
 'tú': (5, 2),
 'no': (7, 2),
 'te': (3, 2),
 'me': (5, 2),
 'vas': (1, 3),
 'a': (4, 1),
 'quitar': (1, 6),
 'gusta': (2, 5),
 'dejas': (1, 5),
 'llevar': (1, 6),
 'todos': (2, 5),
 'fines': (1, 5),
 'de': (6, 2),
 'semana': (1, 6),
 'ella': (4, 4),
 'sale': (1, 4),
 'vacilar': (1, 7),
 'mi': (1, 2),
 'gata': (1, 4),
 'janguear': (1, 8),
 'porque': (1, 6),
 'la': (6, 2),
 'gasolina': (4, 8),
 'dame': (2, 4),
 'más': (3, 3),
 'cómo': (1, 4),
 'encanta': (1, 7),
 'prende': (1, 6),
 'las': (6, 3),
 'turbinas': (1, 8),
 'discrimina': (1, 10),
 'pierde': (2, 6),
 'ni': (1, 2),
 'una': (1, 3),
 'fiesta': (1, 6),
 'marquesina': (1, 10),
 'acicala': (1, 7)