In [1]:
# Root folder of the raw lyrics; it is walked recursively for ".txt" files
# by process_and_copy_files().
source_dir = "../raw_data/Taylor-Swift-Lyrics/data/Albums"

In [2]:
# Flat output folder: receives the processed ".txt" files and is later read
# back by extract_words_from_files().
target_dir = "../data"

In [3]:
import os

In [4]:
import shutil

In [5]:
def process_and_copy_files(source_dir, target_dir):
    """Copy every ".txt" file under ``source_dir`` into ``target_dir`` without its first line.

    ``source_dir`` is walked recursively. Each matching file is written to
    ``target_dir`` under its bare file name (the sub-folder structure is not
    reproduced), with the first line removed.

    Args:
        source_dir: Directory tree to scan for ".txt" files.
        target_dir: Directory that receives the processed copies. It is
            created (including parents) when it does not exist yet.

    Returns:
        None. The function only writes files.

    Note:
        Because the output folder is flat, two source files that share a file
        name in different sub-folders are written to the same target path and
        the one visited last wins.
    """
    # Opening a file for writing does not create missing parent folders, so
    # make sure the destination exists before the first write.
    os.makedirs(target_dir, exist_ok=True)

    for root, _, files in os.walk(source_dir):
        for file in files:
            if not file.endswith(".txt"):
                continue

            source_path_raw = os.path.join(root, file)
            target_path_raw = os.path.join(target_dir, file)

            with open(source_path_raw, "r", encoding="utf-8") as source_file:
                lines = source_file.readlines()

            # Drop the first line of every file; an empty file stays empty.
            processed_lines = lines[1:]

            with open(target_path_raw, "w", encoding="utf-8") as target_file:
                target_file.writelines(processed_lines)
            


        

In [6]:
# Copy every ".txt" file found under source_dir into target_dir, dropping
# each file's first line on the way.
process_and_copy_files(source_dir, target_dir)

In [7]:
import random


In [8]:
def extract_words_from_files(target_dir, num_files=100, num_words=200, seed=None):
    """Sample ".txt" files from ``target_dir`` and return the start of each one.

    Args:
        target_dir: Directory whose ".txt" files are candidates (not recursive).
        num_files: Maximum number of files to sample. When the directory holds
            fewer files, all of them are used.
        num_words: Number of leading whitespace-separated words kept per file.
        seed: Optional seed for the sampling. With ``None`` (the default) the
            module-level ``random`` state is used, so the selection follows any
            ``random.seed(...)`` call made by the caller. With a value, a
            private generator is used and the same files are picked on every
            call.

    Returns:
        A list of ``(song_title, extracted_text)`` tuples in sampling order.
        ``song_title`` is the file name without its extension and
        ``extracted_text`` is the kept words joined by single spaces.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the
    # candidate list, and therefore a seeded sample, stable across runs.
    all_files = sorted(
        os.path.join(target_dir, name)
        for name in os.listdir(target_dir)
        if name.endswith(".txt")
    )

    sampler = random if seed is None else random.Random(seed)
    selected_files = sampler.sample(all_files, min(num_files, len(all_files)))

    extracted_data = []
    for file_path in selected_files:
        song_title = os.path.splitext(os.path.basename(file_path))[0]

        # Read inside the context manager, process after the file is closed.
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()

        # split() with no argument collapses any run of whitespace/newlines.
        words = content.split()
        extracted_data.append((song_title, " ".join(words[:num_words])))

    return extracted_data
    
    

In [9]:
from sentence_transformers import SentenceTransformer

In [10]:
def generate_vectors(text_data, model_name="all-MiniLM-L6-v2"):
    """Encode song texts with a SentenceTransformer model.

    Args:
        text_data: Either a list of ``(title, text)`` tuples or a list of plain
            text strings. The form is decided by looking at the first element
            only. For plain strings, generic titles ``"Song_0"``, ``"Song_1"``,
            ... are generated.
        model_name: Name passed to ``SentenceTransformer(...)``.

    Returns:
        A list of ``(title, vector)`` tuples, one per input item and in input
        order. An empty input yields an empty list.
    """
    # Nothing to encode: return early instead of failing on text_data[0],
    # and avoid loading the model for no work.
    if len(text_data) == 0:
        return []

    if isinstance(text_data[0], tuple):
        # (title, text) pairs: split them into two parallel lists.
        song_titles = [title for title, _ in text_data]
        song_texts = [text for _, text in text_data]
    else:
        # Plain strings: no titles were supplied, so generate generic ones.
        song_texts = list(text_data)
        song_titles = [f"Song_{i}" for i in range(len(song_texts))]

    model = SentenceTransformer(model_name)
    vectors = model.encode(song_texts)

    # Pair each title with the vector produced for its text.
    return list(zip(song_titles, vectors))

In [11]:
import faiss

In [12]:
import numpy as np

In [13]:
def create_faiss_index_with_titles(song_vectors):
    """Build a flat L2 FAISS index from ``(title, vector)`` pairs.

    Args:
        song_vectors: Non-empty sequence of ``(title, vector)`` pairs whose
            vectors all have the same length.

    Returns:
        A dict with two entries: ``"index"`` is the ``faiss.IndexFlatL2``
        holding the vectors, and ``"titles"`` is the list of titles in the
        same order as the rows added to the index, so a row number returned
        by a search can be mapped back to its title.

    Raises:
        ValueError: If ``song_vectors`` is empty or the vectors do not form a
            2-D matrix.
    """
    titles = [item[0] for item in song_vectors]

    # FAISS expects a C-contiguous float32 matrix; coerce so that float64
    # arrays or plain Python lists are accepted as well.
    vectors = np.ascontiguousarray(
        np.asarray([item[1] for item in song_vectors], dtype=np.float32)
    )

    if vectors.ndim != 2 or vectors.shape[0] == 0:
        raise ValueError(
            "song_vectors must be a non-empty sequence of (title, vector) "
            "pairs with equal-length vectors"
        )

    dimension = vectors.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)

    return {"index": index, "titles": titles}






    

In [14]:
def search_songs_by_word_or_phrase(query, extracted_data, faiss_index, top_k=10,
                                   model_name="all-MiniLM-L6-v2"):
    """Find songs containing ``query`` and look up their nearest neighbours.

    The songs in ``extracted_data`` whose text contains ``query``
    (case-insensitive substring match) are encoded, and each of them is used
    as a search vector against the FAISS index.

    Args:
        query: Word or phrase to look for.
        extracted_data: Iterable of ``(title, text)`` tuples to filter.
        faiss_index: Dict with ``"index"`` (a FAISS index) and ``"titles"``
            (titles in index-row order), as built by
            ``create_faiss_index_with_titles``.
        top_k: Maximum number of neighbours per matching song, and maximum
            number of results returned overall.
        model_name: Name passed to ``SentenceTransformer(...)``. It should be
            the model the index vectors were produced with.

    Returns:
        When no text contains the query, a message *string*. Otherwise a list
        of at most ``top_k`` tuples ``(title, distance, matched_text)`` sorted
        by ascending L2 distance. ``title`` is the neighbour found in the
        index, while ``matched_text`` is the text of the query-matching song
        that was used as the search vector. The same title can appear more
        than once when several matching songs share a neighbour.
    """
    needle = query.lower()
    filtered_songs = [
        (title, text) for title, text in extracted_data if needle in text.lower()
    ]

    if not filtered_songs:
        return f"No songs found containing the word/phrase: '{query}'"

    index = faiss_index["index"]
    titles = faiss_index["titles"]

    # FAISS pads its result with label -1 when asked for more neighbours than
    # the index holds; titles[-1] would then silently return the last title.
    # Never ask for more neighbours than exist.
    k = min(top_k, index.ntotal, len(titles))
    if k <= 0:
        return []

    filtered_texts = [text for _, text in filtered_songs]

    model = SentenceTransformer(model_name)
    query_vectors = np.ascontiguousarray(model.encode(filtered_texts), dtype=np.float32)

    distances, indices = index.search(query_vectors, k)

    results = []
    for row, matched_text in enumerate(filtered_texts):
        for col in range(k):
            hit = indices[row][col]
            if hit < 0:
                # Defensive: skip padding entries that carry no real neighbour.
                continue
            results.append((titles[hit], distances[row][col], matched_text))

    # Lower L2 distance means more similar.
    results.sort(key=lambda item: item[1])

    return results[:top_k]


In [15]:
# Sample files from target_dir using the function defaults (up to 100 files,
# first 200 words each). The sample is random and no seed is set here, so the
# selection can differ between runs.
extracted_data = extract_words_from_files(target_dir)

In [16]:
# Encode each extracted snippet; the result is a list of (title, vector) pairs.
song_vectors = generate_vectors(extracted_data)

In [17]:
# Build the L2 index over the vectors; titles are kept in the same order so
# index rows can be mapped back to songs.
faiss_index = create_faiss_index_with_titles(song_vectors)

In [18]:
# Maximum number of results requested from the search call.
top_k = 10


In [19]:
# Word or phrase to look for; it is matched case-insensitively as a substring
# of each extracted snippet.
query = "smile"

In [None]:
# Run the search. Note: the function returns a message string instead of a
# list when no snippet contains the query, so check the type before iterating.
results = search_songs_by_word_or_phrase(query, extracted_data, faiss_index, top_k)