In [1]:
import os

In [2]:
import shutil

In [3]:
def process_and_copy_files(source_dir, target_dir):
     
    #Processes and copies text files from the source directory to the target directory.

    #This function performs the following steps:
   # 1. Checks if the target directory exists; if not, it creates it.
   # 2. Iterates through all files in the source directory and its subdirectories.
   # 3. Identifies files with a ".txt" extension.
   # 4. Reads the content of each text file, removes the first line (assumed to be metadata),
   #    and writes the remaining content to a new file in the target directory.

   # Args:
   #     source_dir (str): The path to the source directory containing the text files.
   #     target_dir (str): The path to the target directory where processed files will be saved.

   # Returns:
   #     None
    
    if not os.path.exists(target_dir):
        print(f"Creating target directory: {target_dir}")
        os.makedirs(target_dir)
    else:
        print(f"Target directory already exists: {target_dir}")
    
    for root, _, files in os.walk(source_dir):
        for file in files:
            if file.endswith(".txt"):
                source_path_raw = os.path.join(root, file)
                target_path_raw = os.path.join(target_dir, file)
                with open(source_path_raw, "r", encoding="utf-8") as source_file:
                    lines = source_file.readlines()
                # Remove the first line (metadata) and save the rest
                processed_lines = lines[1:]
                with open(target_path_raw, "w", encoding="utf-8") as target_file:
                    target_file.writelines(processed_lines)

In [4]:
import random

In [5]:
from sentence_transformers import SentenceTransformer

In [6]:
def extract_words_from_files(target_dir, num_files=100, num_words=200):
    
  #  Extracts a limited number of words from a random selection of text files in the target directory.

 #   This function performs the following steps:
 #   1. Collects all text files in the specified target directory.
 #   2. Randomly selects a specified number of files (`num_files`) or fewer if there are not enough files.
 #   3. For each selected file:
 #       - Reads the content of the file.
 #       - Skips the file if it is empty.
 #       - Extracts the first `num_words` words from the file's content.
 #       - Stores the extracted words along with the file's title (derived from the filename) in a list.
 #   4. Returns the list of tuples, where each tuple contains the file title and the extracted text.

 #   Args:
 #       target_dir (str): The path to the directory containing the text files.
 #       num_files (int, optional): The maximum number of files to process. Defaults to 200.
 #       num_words (int, optional): The maximum number of words to extract from each file. Defaults to 200.

 #   Returns:
 #       list: A list of tuples, where each tuple contains:
 #             - The title of the file (str).
 #             - The extracted text (str) limited to `num_words` words.
    
    
    # Define words to be deleted
    words_for_delete = {"[Pre-Chorus]", "[Verse 2]", "[Verse 1]", "[Chorus]", "[Post-Chorus]", "[Breakdown]", "[Outro]", "[Bridge]"}
    
    # Collect all text files in the target directory
    all_files = [os.path.join(target_dir, file) for file in os.listdir(target_dir) if file.endswith(".txt")]
    
    # Randomly select files
    selected_files = random.sample(all_files, min(num_files, len(all_files)))
    
    extracted_data = [] 
    
    for file_path in selected_files:
        song_title = os.path.splitext(os.path.basename(file_path))[0]
        
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()
            
            # Skip empty files
            if not content.strip():
                continue
            
            # Extract words, excluding specified words
            words = [word for word in content.split() if word not in words_for_delete]
            
           # print(words)
            
            # Limit to the first `num_words`
            extracted_text = " ".join(words[:num_words])
           # print(extracted_text)
            # Append the result to the list
            extracted_data.append((song_title, extracted_text))
            #print(extracted_data)
            
    print(f"Processed {len(extracted_data)} files.")
    for title, text in extracted_data[:5]:  # Print only the first 5 entries
        print(f"Title: {title}, Extracted Text: {text[:50]}...")  # Print a snippet of the text
         
    
    return extracted_data

In [7]:
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [8]:
def generate_vectors(text_data, model):
    #Generates vector representations for a collection of song texts using a given model.

   # This function performs the following steps:
   # 1. Extracts song texts and their corresponding titles from the input data.
   # 2. Uses the provided model to encode the song texts into vector representations.
   # 3. Combines the song titles with their corresponding vectors into a list of tuples.
   # 4. Returns the list of tuples, where each tuple contains a song title and its vector.

    #Args:
   #     text_data (list): A list of tuples, where each tuple contains:
    #                      - The title of the song (str).
     #                     - The text of the song (str).
   #     model (object): A model with an `encode` method that generates vector representations
    #                    for a list of texts. The `encode` method should support the following parameters:
    #                    - `batch_size` (int): The number of texts to process in a single batch.
     #                   - `show_progress_bar` (bool): Whether to display a progress bar during encoding.

    #Returns:
    #    list: A list of tuples, where each tuple contains:
    #          - The title of the song (str).
    #          - The vector representation of the song text (numpy array or similar format).
    # 
    song_texts = [text for _, text in text_data]
    song_titles = [title for title, _ in text_data]
    
    # Generate vectors for the song texts
    vectors = model.encode(song_texts, batch_size=16, show_progress_bar=True)
    
    # Combine song titles with their corresponding vectors
    song_vectors = [(title, vector) for title, vector in zip(song_titles, vectors)]
    
    return song_vectors

In [9]:
import faiss

In [10]:
import numpy as np

In [11]:
def create_faiss_index_with_titles(song_vectors):
    
   
    titles = [item[0] for item in song_vectors]
    vectors = np.array([item[1] for item in song_vectors]).astype("float32")  # Ensure correct dtype

    # Normalize the vectors
    vectors = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)

    # Create the FAISS index
    dimension = vectors.shape[1]  # Vector dimension
    index = faiss.IndexFlatL2(dimension)  # L2 distance index
    index.add(vectors)  # Add normalized vectors to the index

    return {"index": index, "titles": titles}

In [13]:
def search_songs_by_word_or_phrase(query, extracted_data, faiss_index, model, top_k=10, similarity_threshold=0.1):
    
   # Search for the top K most similar songs to the query using FAISS.

  #  Args:
   #     query (str): The word or phrase to search for.
   #     extracted_data (list): The original dataset of song titles and texts (not used here but kept for reference).
    #    faiss_index (dict): A dictionary containing the FAISS index and song titles.
   #     model (SentenceTransformer): The model used to encode the query into a vector.
    #    top_k (int): The number of top results to return.
    #    similarity_threshold (float): The minimum similarity score to consider a result valid.

   # Returns:
   #     list: A list of tuples containing the song title and similarity score for valid matches.
   
    # Generate a vector for the query
    query_vector = model.encode([query], show_progress_bar=False)[0]

    # Normalize the query vector
    query_vector = query_vector / np.linalg.norm(query_vector)

    # Search for the top K most similar songs in the FAISS index
    distances, indices = faiss_index["index"].search(np.array([query_vector]).astype("float32"), top_k)

    

    # Map the FAISS indices back to the song titles and filter by similarity threshold
    results = []
    words_for_delete = {""}
    
    for i in range(len(indices[0])):
        faiss_title = faiss_index["titles"][indices[0][i]]
        
        similarity = 1 - distances[0][i]  # Convert L2 distance to similarity (1 - distance)
       

       # if similarity >= similarity_threshold:  # Only include results above the threshold
       #     results.append((faiss_title, similarity))
            
        results.append((faiss_title, similarity))
    return results

In [14]:
source_dir = "../raw_data/Taylor-Swift-Lyrics/data/Albums"

In [15]:
target_dir = "../data"

In [16]:
process_and_copy_files(source_dir, target_dir)

Target directory already exists: ../data


In [18]:
extracted_data = extract_words_from_files(target_dir)


Processed 99 files.
Title: SayDontGo_TaylorsVersion__FromTheVault_, Extracted Text: I've known it from the very start We’re a shot in ...
Title: Question____, Extracted Text: I remember [Verse 1] Good girl, sad boy Big city, ...
Title: TheLuckyOne_TaylorsVersion_, Extracted Text: New to town with a made-up name In the angel's cit...
Title: Breathe_TaylorsVersion_, Extracted Text: I see your face in my mind as I drive away 'Cause ...
Title: AllTooWell, Extracted Text: I walked through the door with you, the air was co...


In [None]:
song_vectors = generate_vectors(extracted_data, model)


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [41]:
faiss_index = create_faiss_index_with_titles(song_vectors)

In [42]:
query = """Because I'm proud of it as a song"""

In [43]:
similarity_threshold = 0.1

In [44]:
top_k = 10

In [45]:
results = search_songs_by_word_or_phrase(query, extracted_data, faiss_index, model, top_k, similarity_threshold)


In [46]:
if results:
    print(f"Top {top_k} songs containing the word/phrase '{query}':")
    for title, similarity in results:
        print(f"Title: {title}, Similarity: {similarity:.4f}")
else:
    print(f"No songs found containing the word/phrase: '{query}'")

Top 10 songs containing the word/phrase 'Because I'm proud of it as a song':
Title: WenchRench, Similarity: -0.4162
Title: TheGods, Similarity: -0.4316
Title: hoax, Similarity: -0.4330
Title: SnowOnTheBeach, Similarity: -0.4385
Title: NewRomantics_TaylorsVersion_, Similarity: -0.4517
Title: IKnewYouWereTrouble_TaylorsVersion_, Similarity: -0.4533
Title: CallItWhatYouWant, Similarity: -0.4534
Title: WeWereHappy_TaylorsVersion__FromtheVault_, Similarity: -0.4564
Title: 22, Similarity: -0.4590
Title: Change, Similarity: -0.4705
