In [1]:
import pandas as pd
import re
from langdetect import detect
from googletrans import Translator
from easynmt import EasyNMT
from tqdm import tqdm
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
from transformers import pipeline

In [None]:
# Load the song dataset; the first CSV column becomes the DataFrame index.
# NOTE(review): absolute, user-specific path — this cell only runs on the author's machine.
data = pd.read_csv('/Users/sergiopicascia/Documents/GitHub/thesis/Data/data.csv', index_col=0)

In [None]:
# Preview the first rows of the loaded dataset
data.head()

In [None]:
# Raw lyrics column of the dataset (one entry per row of `data`)
lyrics_raw = data['lyrics']

## Chorus Retrieval and Translation

In [None]:
# Accumulators filled by the retrieval/translation loop below: one entry appended per song
lyrics_plain = []
choruses = []
# Two translators: the loop below tries EasyNMT ('opus-mt') first and falls back to googletrans
gtranslator = Translator()
translator = EasyNMT('opus-mt')

In [None]:
# Retrieve chorus and translate every non-English song
CHORUS_TAG = '[Chorus]'
ENDING_LINE = 'EmbedShare URLCopyEmbedCopy'


def _to_english(lines, lang):
    """Translate every line from `lang` to English.

    EasyNMT (`translator`) is tried first; if it raises, all lines are
    translated again with googletrans (`gtranslator`). Errors raised by the
    fallback propagate to the caller.
    """
    try:
        return [translator.translate(s, source_lang=lang, target_lang='en') for s in lines]
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C still interrupts this long loop
        return [gtranslator.translate(s, src=lang, dest='en').text for s in lines]


def _extract_chorus(lyr):
    """Return the non-empty lines of the first '[Chorus]' section.

    Returns '' when the lyrics contain no '[Chorus]' tag, and a (possibly
    empty) list of lines otherwise. The section ends at the next '[' or, when
    the chorus is the last section, at the end of the lyrics.
    """
    start = lyr.find(CHORUS_TAG)
    if start == -1:
        return ''
    start += len(CHORUS_TAG)
    end = lyr.find('[', start)
    if end == -1:
        # Chorus is the last section: take everything up to the end of the
        # text, minus the trailing page residue that is also stripped from the lyrics
        section = lyr[start:].replace(ENDING_LINE, '')
    else:
        section = lyr[start:end]
    return [sent for sent in section.split('\n') if sent]


for lyr in tqdm(lyrics_raw):

    # Detect language of text
    lang = detect(lyr)

    # Retrieve chorus if present
    chorus = _extract_chorus(lyr)

    # If not in english, translate; a chorus that cannot be translated is
    # recorded as missing ('') so it can be generated later
    if chorus and lang != 'en':
        try:
            chorus = _to_english(chorus, lang)
        except Exception:
            chorus = ''
    choruses.append(chorus)

    # Remove headings (note: `.*` is greedy, so on a line with several
    # bracketed groups everything between the first '[' and last ']' goes)
    lyr = re.sub(r'\[.*\]', '', lyr)

    # Remove ending line
    lyr = lyr.replace(ENDING_LINE, '')

    # Split in sentences
    lyr = [sent for sent in lyr.split('\n') if sent]

    # If not in english, translate
    if lang != 'en':
        lyr = _to_english(lyr, lang)

    lyrics_plain.append(lyr)

In [None]:
# Persist the processed lyrics as the textual repr of the Python list
# (presumably the file later parsed back with ast.literal_eval — the load path differs, confirm)
with open("lyrics_plain.txt", "w") as output:
    output.write(str(lyrics_plain))

## Extractive Text Summarization

In [None]:
# Split song sentences in words
def song_split(song):
    """Tokenise each sentence of a song into alphabetic words.

    Every character that is not an ASCII letter is treated as a separator.
    Sentences that contain no letters at all are dropped.

    Parameters
    ----------
    song : iterable of str
        The sentences (lines) of one song.

    Returns
    -------
    list of list of str
        One non-empty word list per retained sentence, in input order.
    """
    tokenised = []
    for line in song:
        spaced = re.sub("[^a-zA-Z]", " ", line)
        words = [piece for piece in spaced.split(" ") if piece]
        if words:
            tokenised.append(words)
    return tokenised

# Remove duplicate sentences
def remove_duplicates(sentences):
    """Return the sentences in first-seen order, dropping later repeats.

    Membership is tested with list containment (``==``), so unhashable items
    such as word lists are supported.
    """
    unique = []
    for candidate in sentences:
        if candidate in unique:
            continue
        unique.append(candidate)
    return unique

# Compute cosine similarity between sentences
def sentence_similarity(sent1, sent2):
    """Cosine similarity between the word-count vectors of two sentences.

    Parameters
    ----------
    sent1, sent2 : iterable of str
        The words of each sentence; they are compared case-insensitively.

    Returns
    -------
    float
        ``1 - cosine_distance`` of the two count vectors, or ``0.0`` when
        either sentence has no words.
    """
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]

    # An empty sentence yields an all-zero vector, for which the cosine is
    # undefined; report "no similarity" instead of propagating an invalid value
    if not sent1 or not sent2:
        return 0.0

    # One vector slot per distinct word. The dict gives constant-time lookups
    # where repeated list.index() calls would rescan the vocabulary every time
    position = {w: i for i, w in enumerate(set(sent1 + sent2))}

    vector1 = [0] * len(position)
    vector2 = [0] * len(position)

    # build the vector for the first sentence
    for w in sent1:
        vector1[position[w]] += 1

    # build the vector for the second sentence
    for w in sent2:
        vector2[position[w]] += 1

    return 1 - cosine_distance(vector1, vector2)
 
# Build the similarity matrix
def build_similarity_matrix(sentences):
    """Pairwise sentence-similarity matrix.

    Returns a square NumPy array with one row and column per sentence; entry
    ``[i][j]`` holds ``sentence_similarity(sentences[i], sentences[j])`` and
    the diagonal is left at zero.
    """
    size = len(sentences)
    # Start from zeros so that a sentence compared with itself stays at 0
    similarity_matrix = np.zeros((size, size))

    for row, first in enumerate(sentences):
        for col, second in enumerate(sentences):
            if row != col:
                similarity_matrix[row][col] = sentence_similarity(first, second)

    return similarity_matrix


def generate_summary(song, top_n=4):
    """Extractive summary of a song: its highest-ranked distinct sentences.

    The sentences are tokenised, de-duplicated, linked in a graph weighted by
    pairwise similarity, and ranked with PageRank.

    Parameters
    ----------
    song : iterable of str
        The sentences (lines) of one song.
    top_n : int, optional
        Maximum number of sentences to return (default 4).

    Returns
    -------
    list of str
        Up to `top_n` sentences (words joined by single spaces), best first.
        Songs with fewer distinct sentences return all of them instead of
        raising IndexError.
    """
    # Split song and remove duplicate sentences
    sentences = remove_duplicates(song_split(song))

    # Generate similarity matrix
    sentence_similarity_matrix = build_similarity_matrix(sentences)

    # Rank sentences in similarity matrix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Sort sentences by score, highest first (ties fall back to comparing the word lists)
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    # Slicing, unlike indexing 0..3, is safe when fewer than `top_n` sentences exist
    return [" ".join(words) for _, words in ranked_sentence[:top_n]]

In [None]:
# Generate choruses for the ones missing
for idx, song in enumerate(lyrics_plain):
    if choruses[idx]:
        continue
    try:
        choruses[idx] = generate_summary(song)
    except Exception:
        # Best effort: a song that cannot be summarised keeps its empty chorus.
        # Catching `Exception` rather than everything lets KeyboardInterrupt
        # and SystemExit stop the loop.
        continue

In [221]:
# Persist the choruses (retrieved or generated) as the textual repr of the Python list
with open("choruses.txt", "w") as output:
    output.write(str(choruses))

## Weighted Lyrics

In [214]:
CHORUS_WEIGHT = 5   # weight given to sentences that belong to the chorus
DEFAULT_WEIGHT = 1  # weight given to every other sentence


def _letter_tokens(text):
    """Alphabetic tokens of `text` as a hashable tuple.

    Non-letters act as separators, the same tokenisation `song_split` applies,
    so a lyric line and its chorus counterpart compare equal regardless of
    punctuation or spacing.
    """
    return tuple(re.sub(r"[^a-zA-Z]", " ", text).split())


lyrics_weighted = []

for idx, song in enumerate(lyrics_plain):
    lyr_w = []
    # Retrieve corresponding chorus
    chorus = choruses[idx]
    # If chorus exists, assign a higher weight to sentences composing it
    if chorus:
        # Pre-compute the normalised chorus lines once per song; lines without
        # any letter are ignored so they cannot match each other
        chorus_keys = {_letter_tokens(line) for line in chorus}
        chorus_keys.discard(())
        for sent in song:
            key = _letter_tokens(sent)
            # Stored text is unchanged from before: punctuation stripped, whitespace kept
            sent = re.sub(r"[^a-zA-Z\s]", "", sent)
            # The exact comparison is kept; the token comparison additionally
            # matches lines that differ from the chorus only by punctuation
            # (e.g. "I'm here" vs a chorus line "I'm here" or "I m here")
            if sent in chorus or key in chorus_keys:
                lyr_w.append((sent, CHORUS_WEIGHT))
            else:
                lyr_w.append((sent, DEFAULT_WEIGHT))
    else:
        for sent in song:
            lyr_w.append((sent, DEFAULT_WEIGHT))

    lyrics_weighted.append(lyr_w)

In [220]:
# Persist the (sentence, weight) pairs of every song as the textual repr of the Python list
with open("lyrics_weighted.txt", "w") as output:
    output.write(str(lyrics_weighted))

## Abstractive Text Summarization

In [290]:
# Initialize T5 model
# Hugging Face summarization pipeline with the 't5-base' checkpoint and tokenizer, TensorFlow backend.
# NOTE(review): `pipeline` is rebound to a NimbusML Pipeline object further down the notebook,
# so this cell fails if re-run after that one without restarting the kernel.
summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base", framework="tf")

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [291]:
lyrics_summarized = []

for lyr in tqdm(lyrics_plain):

    # Rebuild a single text per song, sentences separated by '. '
    text = '. '.join(lyr)

    # Texts shorter than 50 characters are kept verbatim; the rest are summarised.
    # NOTE(review): the guard counts characters while min_length/max_length count
    # tokens, which is why the log warns about inputs shorter than max_length.
    if len(text) < 50:
        summary = text
    else:
        summary = summarizer(text, min_length=50, max_length=200)[0]['summary_text']

    lyrics_summarized.append(summary)

  0%|          | 0/2000 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1096 > 512). Running this sequence through the model will result in indexing errors
  0%|          | 2/2000 [01:11<19:40:27, 35.45s/it]Your max_length is set to 200, but you input_length is only 126. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
  1%|          | 21/2000 [10:27<17:36:10, 32.02s/it]Your max_length is set to 200, but you input_length is only 67. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
  2%|▏         | 30/2000 [14:10<13:50:44, 25.30s/it]Your max_length is set to 200, but you input_length is only 188. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
  2%|▏         | 33/2000 [15:27<14:49:16, 27.13s/it]Your max_length is set to 200, but you input_length is only 163. You might consider decreasing max_length man

 11%|█         | 223/2000 [1:43:19<15:43:03, 31.84s/it]Your max_length is set to 200, but you input_length is only 104. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 11%|█▏        | 225/2000 [1:44:15<14:52:41, 30.18s/it]Your max_length is set to 200, but you input_length is only 137. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 12%|█▏        | 235/2000 [1:49:12<16:51:44, 34.39s/it]Your max_length is set to 200, but you input_length is only 137. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 12%|█▏        | 237/2000 [1:50:17<16:48:36, 34.33s/it]Your max_length is set to 200, but you input_length is only 74. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 12%|█▏        | 241/2000 [1:52:08<14:30:47, 29.70s/it]Your max_length is set to 200, but you input_length is only 171. You might consider decreasing max_length manu

 32%|███▏      | 632/2000 [4:52:21<11:24:08, 30.01s/it]Your max_length is set to 200, but you input_length is only 188. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 32%|███▏      | 634/2000 [4:53:41<13:49:22, 36.43s/it]Your max_length is set to 200, but you input_length is only 169. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 32%|███▏      | 641/2000 [4:56:52<11:28:45, 30.41s/it]Your max_length is set to 200, but you input_length is only 161. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 32%|███▏      | 644/2000 [4:58:16<11:08:02, 29.56s/it]Your max_length is set to 200, but you input_length is only 116. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 33%|███▎      | 665/2000 [5:08:48<9:25:39, 25.42s/it] Your max_length is set to 200, but you input_length is only 131. You might consider decreasing max_length man

 40%|███▉      | 796/2000 [6:04:45<8:57:53, 26.81s/it]Your max_length is set to 200, but you input_length is only 190. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 40%|████      | 805/2000 [6:10:10<12:05:43, 36.44s/it]Your max_length is set to 200, but you input_length is only 148. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 40%|████      | 810/2000 [6:12:08<8:58:04, 27.13s/it] Your max_length is set to 200, but you input_length is only 152. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 41%|████      | 821/2000 [6:16:59<8:09:45, 24.92s/it]Your max_length is set to 200, but you input_length is only 167. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 41%|████▏     | 825/2000 [6:19:01<9:29:00, 29.06s/it]Your max_length is set to 200, but you input_length is only 195. You might consider decreasing max_length manual

 51%|█████     | 1013/2000 [7:38:35<7:03:52, 25.77s/it]Your max_length is set to 200, but you input_length is only 115. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 51%|█████     | 1015/2000 [7:39:17<6:31:23, 23.84s/it]Your max_length is set to 200, but you input_length is only 191. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 51%|█████     | 1021/2000 [7:42:46<9:15:05, 34.02s/it]Your max_length is set to 200, but you input_length is only 42. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 51%|█████     | 1022/2000 [7:43:10<8:30:03, 31.29s/it]Your max_length is set to 200, but you input_length is only 163. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 51%|█████     | 1024/2000 [7:44:00<7:47:13, 28.72s/it]Your max_length is set to 200, but you input_length is only 175. You might consider decreasing max_length manu

 73%|███████▎  | 1455/2000 [10:53:57<3:47:49, 25.08s/it]Your max_length is set to 200, but you input_length is only 90. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 73%|███████▎  | 1457/2000 [10:54:44<3:45:48, 24.95s/it]Your max_length is set to 200, but you input_length is only 82. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 73%|███████▎  | 1459/2000 [10:55:32<3:43:48, 24.82s/it]Your max_length is set to 200, but you input_length is only 199. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 73%|███████▎  | 1461/2000 [10:56:25<3:51:20, 25.75s/it]Your max_length is set to 200, but you input_length is only 179. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 73%|███████▎  | 1463/2000 [10:57:16<3:53:58, 26.14s/it]Your max_length is set to 200, but you input_length is only 123. You might consider decreasing max_length 

 90%|█████████ | 1808/2000 [13:29:56<1:18:35, 24.56s/it]Your max_length is set to 200, but you input_length is only 102. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 91%|█████████ | 1823/2000 [13:36:47<1:17:40, 26.33s/it]Your max_length is set to 200, but you input_length is only 101. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 91%|█████████▏| 1825/2000 [13:37:38<1:17:23, 26.53s/it]Your max_length is set to 200, but you input_length is only 99. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 92%|█████████▏| 1833/2000 [13:41:19<1:17:35, 27.88s/it]Your max_length is set to 200, but you input_length is only 133. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 92%|█████████▏| 1835/2000 [13:42:09<1:11:40, 26.06s/it]Your max_length is set to 200, but you input_length is only 139. You might consider decreasing max_length

 99%|█████████▉| 1980/2000 [14:43:52<08:13, 24.68s/it]Your max_length is set to 200, but you input_length is only 154. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 99%|█████████▉| 1981/2000 [14:44:10<07:09, 22.60s/it]Your max_length is set to 200, but you input_length is only 168. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 99%|█████████▉| 1983/2000 [14:45:00<06:46, 23.90s/it]Your max_length is set to 200, but you input_length is only 70. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
 99%|█████████▉| 1985/2000 [14:45:50<06:15, 25.01s/it]Your max_length is set to 200, but you input_length is only 188. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
100%|█████████▉| 1992/2000 [14:49:04<03:45, 28.17s/it]Your max_length is set to 200, but you input_length is only 111. You might consider decreasing max_length manually,

In [292]:
# Persist the abstractive summaries as the textual repr of the Python list
with open("lyrics_summarized.txt", "w") as output:
    output.write(str(lyrics_summarized))

## Embeddings

In [None]:
import ast
import spacy
import contractions
from nimbusml import Pipeline
from nimbusml.feature_extraction.text import NGramFeaturizer, WordEmbedding
from nimbusml.feature_extraction.text.extractor import Ngram
from sentence_transformers import SentenceTransformer

# spaCy English model, used below to lemmatise sentences
nlp = spacy.load('en_core_web_lg')
# Sentence-transformers encoder used for the sBERT embeddings
sbert = SentenceTransformer('all-MiniLM-L6-v2')
# Cap the encoder's maximum sequence length at 256 (presumably tokens; longer inputs get truncated — confirm)
sbert.max_seq_length = 256

In [21]:
# File loading
def load_txt(filepath):
    """Read a text file holding a Python literal and return the parsed object.

    The content is evaluated with ``ast.literal_eval``, so only literals
    (lists, tuples, strings, numbers, ...) are accepted.
    """
    with open(filepath, 'r') as handle:
        raw_text = handle.read()

    return ast.literal_eval(raw_text)

In [207]:
# Pipeline for SSWE
# NimbusML pipeline: an n-gram featurizer reads the 'sentence' column and emits its tokens
# in a column named 'sswe', which the 'SentimentSpecificWordEmbedding' model then embeds.
# NOTE(review): this rebinds `pipeline`, the name imported from `transformers` in the first
# cell and used to build the T5 summarizer — a distinct name would keep both usable.
pipeline = Pipeline([
   NGramFeaturizer(word_feature_extractor=Ngram(),
                   char_feature_extractor=None,
                   keep_numbers=False,
                   output_tokens_column_name='sswe',
                   columns={'ngram': ['sentence']}),

   WordEmbedding(model_kind='SentimentSpecificWordEmbedding', columns='sswe')
])

### lyrics_plain

In [74]:
# Load file
# Re-reads the processed lyrics from disk, replacing any in-memory `lyrics_plain`.
# NOTE(review): absolute, user-specific path.
lyrics_plain = load_txt("/Users/sergiopicascia/Documents/GitHub/thesis/Data/text/lyrics_plain.txt")

In [77]:
# Split by '. ' and flatten lists
# Each song becomes one flat list holding every '. '-separated piece of its sentences, in order
for idx, lyrics in enumerate(lyrics_plain):
    lyrics_plain[idx] = [part for sentence in lyrics for part in sentence.split('. ')]

In [203]:
# Lemmatize text
lyrics_plain_lemma = []

# tqdm wraps the list itself rather than the enumerate object, so it knows the
# total and can show a percentage and ETA instead of a bare iteration counter
for idx, lyrics in enumerate(tqdm(lyrics_plain)):
    for sentence in lyrics:
        # Expand contractions, then replace every token by its lemma
        sentence = contractions.fix(sentence)
        sentence = ' '.join([token.lemma_ for token in nlp(sentence)])

        # Keep the song index so sentence rows can be grouped back per song
        lyrics_plain_lemma.append((sentence, idx))

lyrics_plain_lemma = pd.DataFrame(lyrics_plain_lemma, columns=['sentence', 'lyrics_idx'])

2000it [06:17,  5.30it/s]


In [208]:
# SSWE
# Run the SSWE pipeline on the lemmatised sentence frame (one row per sentence)
lyrics_plain_sswe = pipeline.fit_transform(lyrics_plain_lemma)

In [225]:
# Song-level SSWE embedding: keep the columns from 'lyrics_idx' through 'sswe.149'
# and average the sentence rows belonging to each song
lyrics_plain_emb = lyrics_plain_sswe.loc[:, "lyrics_idx":"sswe.149"].groupby('lyrics_idx').mean()

In [242]:
# sBERT
# Encode every lemmatised sentence with the sentence-transformers model
sent_emb = sbert.encode(list(lyrics_plain_lemma['sentence']))

In [None]:
# Column labels 'sbert.0' ... 'sbert.383'; `col_names` is reused by the later sBERT cells
# (384 is presumably the encoder's output size — confirm)
col_names = ['sbert.'+str(i) for i in range(384)]
sent_emb = pd.DataFrame(sent_emb, columns=col_names)
# Attach each sentence's song index, then average the sentence embeddings per song
# NOTE(review): `sent_emb` goes from raw encodings to a per-song frame in this one cell,
# so the cell is not safe to re-run without first re-running the encoding cell.
sent_emb['lyrics_idx'] = lyrics_plain_lemma['lyrics_idx']
sent_emb = sent_emb.groupby('lyrics_idx').mean()

In [267]:
# Concatenate Embeddings
# Put the per-song SSWE and sBERT averages side by side (both are indexed by lyrics_idx).
# NOTE(review): the result overwrites `lyrics_plain_emb`, so re-running this cell appends the sBERT columns again.
lyrics_plain_emb = pd.concat([lyrics_plain_emb, sent_emb], axis=1)

In [271]:
# Save the per-song embeddings of the plain lyrics (written to the current working directory)
lyrics_plain_emb.to_csv('lyrics_plain_embeddings.csv')

### lyrics_weighted

In [272]:
# Re-read the weighted lyrics from disk. NOTE(review): absolute, user-specific path.
lyrics_weighted = load_txt("/Users/sergiopicascia/Documents/GitHub/thesis/Data/text/lyrics_weighted.txt")

In [282]:
lyrics_weighted_lemma = []

# tqdm wraps the list itself rather than the enumerate object, so it knows the
# total and can show a percentage and ETA instead of a bare iteration counter
for idx, lyrics in enumerate(tqdm(lyrics_weighted)):
    for sentence, weight in lyrics:

        # Expand contractions, then replace every token by its lemma
        sentence = contractions.fix(sentence)
        sentence = ' '.join([token.lemma_ for token in nlp(sentence)])

        # Keep the weight and the song index alongside each lemmatised sentence
        lyrics_weighted_lemma.append((sentence, weight, idx))

2000it [06:17,  5.30it/s]


In [284]:
# Turn the (sentence, weight, song index) tuples into a frame with one row per sentence
lyrics_weighted_lemma = pd.DataFrame(lyrics_weighted_lemma, columns=['sentence', 'weight', 'lyrics_idx'])

In [285]:
# SSWE
# Run the SSWE pipeline on the weighted, lemmatised sentences (one row per sentence)
lyrics_weighted_sswe = pipeline.fit_transform(lyrics_weighted_lemma)

In [289]:
# Keep the columns from 'weight' through 'sswe.149'; whatever precedes 'weight'
# (presumably the sentence text — confirm) is dropped
lyrics_weighted_sswe = lyrics_weighted_sswe.loc[:, "weight":"sswe.149"]

In [290]:
# sBERT
# Encode every weighted, lemmatised sentence with the sentence-transformers model
lyrics_weighted_sbert = sbert.encode(list(lyrics_weighted_lemma['sentence']))



In [294]:
# Wrap the encodings in a frame, reusing the 'sbert.N' labels from `col_names` (defined in the lyrics_plain section)
lyrics_weighted_sbert = pd.DataFrame(lyrics_weighted_sbert, columns=col_names)

In [297]:
# Concatenate Embeddings
# Sentence-level SSWE columns (with weight and song index) and sBERT columns side by side, aligned on the row index
lyrics_weighted_emb = pd.concat([lyrics_weighted_sswe, lyrics_weighted_sbert], axis=1)

In [306]:
# Weighted Average
# For each song (group of rows sharing 'lyrics_idx'), average every column from position 2 onwards
# across the song's sentences, using the 'weight' column as weights. The first two columns are
# skipped (presumably 'weight' and 'lyrics_idx' — confirm against the column order).
# NOTE(review): the Series index is passed as a one-element list wrapping the column Index — check
# that the resulting column labels are what is expected. The result overwrites `lyrics_weighted_emb`,
# so the cell is not safe to re-run on its own.
lyrics_weighted_emb = lyrics_weighted_emb.groupby('lyrics_idx').apply(lambda x: pd.Series(np.average(x[lyrics_weighted_emb.columns[2:]], weights=x["weight"], axis=0), [lyrics_weighted_emb.columns[2:]]))

In [307]:
# Save the per-song weighted embeddings (written to the current working directory)
lyrics_weighted_emb.to_csv('lyrics_weighted_embeddings.csv')

### choruses

In [308]:
# Re-read the choruses from disk. NOTE(review): absolute, user-specific path.
choruses = load_txt('/Users/sergiopicascia/Documents/GitHub/thesis/Data/text/choruses.txt')

In [311]:
choruses_lemma = []

for chorus in tqdm(choruses):
    # Merge the chorus lines into one text, expand contractions, then lemmatise
    joined = ', '.join(chorus)
    expanded = contractions.fix(joined)
    lemmas = [token.lemma_ for token in nlp(expanded)]

    choruses_lemma.append(' '.join(lemmas))

100%|██████████| 2000/2000 [00:19<00:00, 103.20it/s]


In [314]:
# One row per song; the column is named 'sentence' because the SSWE pipeline reads that column
choruses_lemma = pd.DataFrame(choruses_lemma, columns=['sentence'])

In [315]:
# SSWE
# Run the SSWE pipeline on the lemmatised choruses (one row per song)
choruses_sswe = pipeline.fit_transform(choruses_lemma)

In [318]:
# Keep only the embedding columns 'sswe.0' through 'sswe.149'
choruses_sswe = choruses_sswe.loc[:, "sswe.0":"sswe.149"]

In [319]:
# sBERT
# Encode every lemmatised chorus with the sentence-transformers model
choruses_sbert = sbert.encode(list(choruses_lemma['sentence']))



In [321]:
# Wrap the encodings in a frame, reusing the 'sbert.N' labels from `col_names` (defined in the lyrics_plain section)
choruses_sbert = pd.DataFrame(choruses_sbert, columns=col_names)

In [322]:
# Concatenate Embeddings
# SSWE and sBERT columns side by side, one row per chorus
choruses_emb = pd.concat([choruses_sswe, choruses_sbert], axis=1)

In [324]:
# Save the chorus embeddings (written to the current working directory)
choruses_emb.to_csv('choruses_embeddings.csv')

### lyrics_summarized

In [325]:
# Re-read the abstractive summaries from disk. NOTE(review): absolute, user-specific path.
lyrics_summarized = load_txt("/Users/sergiopicascia/Documents/GitHub/thesis/Data/text/lyrics_summarized.txt")

In [329]:
lyrics_summarized_lemma = []

for summary in tqdm(lyrics_summarized):
    # Expand contractions, then replace every token by its lemma
    expanded = contractions.fix(summary)
    lemmas = [token.lemma_ for token in nlp(expanded)]

    lyrics_summarized_lemma.append(' '.join(lemmas))

100%|██████████| 2000/2000 [00:21<00:00, 91.30it/s] 


In [331]:
# One row per song; the column is named 'sentence' because the SSWE pipeline reads that column
lyrics_summarized_lemma = pd.DataFrame(lyrics_summarized_lemma, columns=['sentence'])

In [332]:
# SSWE
# Run the SSWE pipeline on the lemmatised summaries (one row per song)
lyrics_summarized_sswe = pipeline.fit_transform(lyrics_summarized_lemma)

In [333]:
# Keep only the embedding columns 'sswe.0' through 'sswe.149'
lyrics_summarized_sswe = lyrics_summarized_sswe.loc[:, "sswe.0":"sswe.149"]

In [334]:
# sBERT
# Encode every lemmatised summary with the sentence-transformers model
lyrics_summarized_sbert = sbert.encode(list(lyrics_summarized_lemma['sentence']))



In [335]:
# Wrap the encodings in a frame, reusing the 'sbert.N' labels from `col_names` (defined in the lyrics_plain section)
lyrics_summarized_sbert = pd.DataFrame(lyrics_summarized_sbert, columns=col_names)

In [336]:
# Concatenate Embeddings
# SSWE and sBERT columns side by side, one row per summary
lyrics_summarized_emb = pd.concat([lyrics_summarized_sswe, lyrics_summarized_sbert], axis=1)

In [339]:
# Save the summary embeddings (written to the current working directory)
lyrics_summarized_emb.to_csv('lyrics_summarized_embeddings.csv')