In [2]:
from kaggle.api.kaggle_api_extended import KaggleApi
# Create the Kaggle client shared by the cells below.
api = KaggleApi()
# NOTE(review): presumably reads credentials from kaggle.json or KAGGLE_* environment
# variables — confirm; no credentials are set anywhere in this notebook.
api.authenticate()

In [9]:
# Download the archive of the third entry of a dataset listing.
# NOTE(review): `list_datasets` is not assigned at module level in any cell visible here
# (only as a local inside get_kaggle_dataset); this presumably depends on a variable left
# over from an earlier or deleted cell and would raise NameError on a fresh kernel — confirm.
api.dataset_download_files(list_datasets[2]["ref"])

Dataset URL: https://www.kaggle.com/datasets/bricevergnou/spotify-recommendation


In [12]:
import os
import zipfile

import pandas as pd


def get_kaggle_dataset(keyword):
    """Download the first Kaggle dataset matching ``keyword`` and load a CSV from it.

    The archive is downloaded into the current working directory as
    ``<dataset-name>.zip`` and extracted into a directory named ``<dataset-name>``.

    Args:
        keyword: Search term passed to ``api.datasets_list``.

    Returns:
        A pandas DataFrame read from ``data.csv`` when the archive contains one at
        its top level, otherwise from the first CSV file (sorted by path) found
        anywhere in the extracted directory.

    Raises:
        ValueError: If the search returns no datasets.
        FileNotFoundError: If the extracted archive contains no CSV file.
    """
    # List datasets based on the keyword
    list_datasets = api.datasets_list(search=keyword)

    if not list_datasets:
        raise ValueError(f"No datasets found for keyword: {keyword}")

    # Take the first dataset; its ref has the form "<owner>/<dataset-name>"
    dataset_ref = list_datasets[0]["ref"]
    dataset_name = dataset_ref.split("/")[-1]

    # Download the dataset archive
    api.dataset_download_files(dataset_ref)

    # Extract into a directory with the same name as the dataset
    os.makedirs(dataset_name, exist_ok=True)
    with zipfile.ZipFile(f"{dataset_name}.zip", "r") as zip_ref:
        zip_ref.extractall(dataset_name)

    # Keep the historical behaviour when data.csv exists ...
    preferred_csv = os.path.join(dataset_name, "data.csv")
    if os.path.isfile(preferred_csv):
        return pd.read_csv(preferred_csv)

    # ... but most datasets name their CSV differently, so fall back to the
    # first CSV found (sorted so the choice is deterministic).
    csv_paths = sorted(
        os.path.join(root, file_name)
        for root, _dirs, file_names in os.walk(dataset_name)
        for file_name in file_names
        if file_name.lower().endswith(".csv")
    )
    if not csv_paths:
        raise FileNotFoundError(f"No CSV file found in dataset '{dataset_ref}'")

    return pd.read_csv(csv_paths[0])

# Example usage:
# df = get_kaggle_dataset("spotify")
# print(df.head())



spotify-recommendation


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,liked
0,0.803,0.624,7,-6.764,0,0.0477,0.451,0.000734,0.1,0.628,95.968,304524,4,0
1,0.762,0.703,10,-7.951,0,0.306,0.206,0.0,0.0912,0.519,151.329,247178,4,1
2,0.261,0.0149,1,-27.528,1,0.0419,0.992,0.897,0.102,0.0382,75.296,286987,4,0
3,0.722,0.736,3,-6.994,0,0.0585,0.431,1e-06,0.123,0.582,89.86,208920,4,1
4,0.787,0.572,1,-7.516,1,0.222,0.145,0.0,0.0753,0.647,155.117,179413,4,1


In [5]:
import openai
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def generate_dataset_embedding(dataset_info):
    """Return an embedding for a dataset's title, subtitle, description and tags.

    The four fields are flattened into one string and sent to
    ``openai.Embedding.create`` with the ``text-embedding-ada-002`` model.

    NOTE(review): a later cell defines a function with this same name; whichever
    cell ran last is the one in effect.

    Args:
        dataset_info: Mapping with ``title``, ``subtitle``, ``description`` and
            ``tags`` (an iterable of mappings each holding a ``name``).

    Returns:
        The ``embedding`` of the first item in the API response.
    """
    # Flatten the descriptive fields first, then append the tag names.
    summary = f"{dataset_info['title']} {dataset_info['subtitle']} {dataset_info['description']} "
    tag_names = [tag['name'] for tag in dataset_info['tags']]
    text = summary + f"Tags: {', '.join(tag_names)}"

    response = openai.Embedding.create(input=text, model="text-embedding-ada-002")
    first_item = response['data'][0]
    return first_item['embedding']

def build_dataset_embeddings(api, num_datasets=100):
    """Embed up to ``num_datasets`` Kaggle datasets, walking the listing page by page.

    Args:
        api: Object exposing ``datasets_list(page=...)`` that returns a sequence
            of dataset mappings.
        num_datasets: Maximum number of datasets to embed. Values ``<= 0`` return
            empty results without calling the listing or the embedding API.

    Returns:
        A tuple ``(embeddings, dataset_info)`` of two parallel lists:
        the embedding of each dataset and a dict with its ``ref``, ``title``,
        ``subtitle``, ``url``, ``creatorName`` and tag names.
    """
    embeddings = []
    dataset_info = []

    # Nothing requested: avoid a listing call and a paid embedding call.
    if num_datasets <= 0:
        return embeddings, dataset_info

    for page in range(1, 501):  # Iterate through at most 500 pages
        datasets = api.datasets_list(page=page)

        # An empty page means the listing is exhausted; requesting the
        # remaining pages would only burn API calls.
        if not datasets:
            break

        for dataset in datasets:
            embedding = generate_dataset_embedding(dataset)
            embeddings.append(embedding)
            dataset_info.append({
                'ref': dataset['ref'],
                'title': dataset['title'],
                'subtitle': dataset['subtitle'],
                'url': dataset['url'],
                'creatorName': dataset['creatorName'],
                'tags': [tag['name'] for tag in dataset['tags']]
            })

            if len(embeddings) >= num_datasets:
                return embeddings, dataset_info

    return embeddings, dataset_info

def find_similar_dataset(query, embeddings, dataset_info):
    """Return the ``dataset_info`` entry whose embedding is closest to ``query``.

    Args:
        query: Text to embed with the ``text-embedding-ada-002`` model.
        embeddings: Embeddings to compare against, positionally aligned with
            ``dataset_info``.
        dataset_info: Entries to pick the result from.

    Returns:
        The element of ``dataset_info`` at the position of the highest cosine
        similarity.
    """
    # Embed the query text
    response = openai.Embedding.create(input=query, model="text-embedding-ada-002")
    query_vector = response['data'][0]['embedding']

    # One row of scores: the query against every stored embedding
    scores = cosine_similarity([query_vector], embeddings)[0]

    best_position = np.argmax(scores)
    return dataset_info[best_position]



## Saving the embeddings




In [6]:
from openai import OpenAI
from typing import Dict, List
import faiss
import pickle
from kaggle.api.kaggle_api_extended import KaggleApi
import os
import numpy as np
# Kaggle client used to list and download datasets in the cells below.
api = KaggleApi()
api.authenticate()

# OpenAI client used by get_embedding.
# NOTE(review): no API key is passed here; presumably it is read from the
# OPENAI_API_KEY environment variable — confirm.
client = OpenAI()


def get_embedding(text: str) -> List[float]:
    """
    Embed ``text`` with the ``text-embedding-3-small`` model.

    Sends a single request through the module-level ``client`` and returns the
    ``embedding`` of the first item in the response.
    """
    result = client.embeddings.create(model="text-embedding-3-small", input=text)
    first_item = result.data[0]
    return first_item.embedding

def generate_dataset_embedding(dataset_info):
    """Embed a dataset's title, subtitle, description and tag names as one string.

    Args:
        dataset_info: Mapping with ``title``, ``subtitle``, ``description`` and
            ``tags`` (an iterable of mappings each holding a ``name``).

    Returns:
        Whatever ``get_embedding`` returns for the combined text.
    """
    # Descriptive fields first, tag names appended after a "Tags: " label.
    summary = f"{dataset_info['title']} {dataset_info['subtitle']} {dataset_info['description']} "
    tag_names = [tag['name'] for tag in dataset_info['tags']]
    return get_embedding(summary + f"Tags: {', '.join(tag_names)}")

def build_and_save_embeddings(api: KaggleApi, save_path: str = 'kaggle_embeddings', start_page: int = 1, max_page: int = 500):
    """Embed Kaggle dataset listings page by page and persist them after every page.

    Two files are written: ``<save_path>.index`` (a FAISS index of the embeddings)
    and ``<save_path>_info.pkl`` (a pickled list of dataset dicts in the same order).
    If both files already exist they are loaded and extended.

    When resuming against existing files, pass a ``start_page`` beyond the last
    processed page; pages that were already processed are appended again otherwise.

    Args:
        api: Kaggle client exposing ``datasets_list(page=...)``.
        save_path: Path prefix for the two output files.
        start_page: First listing page to process.
        max_page: Last listing page to process (inclusive).
    """
    dataset_info = []
    total_datasets = 0
    index = None  # created lazily from the first batch when nothing is on disk

    index_file = f'{save_path}.index'
    info_file = f'{save_path}_info.pkl'

    print("Starting to build and save embeddings...")

    # Load existing index and dataset info if they exist
    if os.path.exists(index_file) and os.path.exists(info_file):
        print("Loading existing index and dataset info...")
        index = faiss.read_index(index_file)
        # NOTE: pickle.load can execute arbitrary code; only load files that this
        # notebook wrote itself.
        with open(info_file, 'rb') as f:
            dataset_info = pickle.load(f)
        total_datasets = len(dataset_info)
        print(f"Loaded {total_datasets} existing datasets.")

    for page in range(start_page, max_page + 1):
        print(f"Processing page {page} of {max_page}...")
        datasets = api.datasets_list(page=page)

        # An empty page means the listing is exhausted. It would also produce a
        # 1-D array that cannot be added to the index, so stop here.
        if not datasets:
            print(f"Page {page} returned no datasets; stopping.")
            break

        page_embeddings = []
        page_info = []

        for dataset in datasets:
            embedding = generate_dataset_embedding(dataset)
            page_embeddings.append(embedding)
            page_info.append({
                'ref': dataset['ref'],
                'title': dataset['title'],
                'subtitle': dataset['subtitle'],
                'url': dataset['url'],
                'description': dataset['description'],
                'tags': [tag['name'] for tag in dataset['tags']]
            })

        embeddings_array = np.array(page_embeddings).astype('float32')

        # Size a new index from the first real batch instead of spending an
        # extra embedding request on a throw-away sample.
        if index is None:
            dimension = embeddings_array.shape[1]
            index = faiss.IndexFlatL2(dimension)
            print(f"Created new FAISS index with dimension {dimension}")

        # Add embeddings to the index
        index.add(embeddings_array)

        # Update dataset info (kept in the same order as the index rows)
        dataset_info.extend(page_info)
        total_datasets += len(page_info)

        print(f"Added {len(page_info)} embeddings to the index. Total datasets processed: {total_datasets}")

        # Save progress after each page so an interrupted run can be resumed
        print("Saving progress...")
        faiss.write_index(index, index_file)
        with open(info_file, 'wb') as f:
            pickle.dump(dataset_info, f)

    print(f"Finished processing {total_datasets} datasets.")

# Usage: build the index starting from page 1. This sends one embedding request
# per listed dataset, so a full run is slow.
build_and_save_embeddings(api)
# To resume from a specific page:
# build_and_save_embeddings(api, start_page=last_processed_page + 1)

Starting to build and save embeddings...
Created new FAISS index with dimension 1536
Processing page 1 of 500...
Added 20 embeddings to the index. Total datasets processed: 20
Saving progress...
Processing page 2 of 500...
Added 20 embeddings to the index. Total datasets processed: 40
Saving progress...
Processing page 3 of 500...
Added 20 embeddings to the index. Total datasets processed: 60
Saving progress...
Processing page 4 of 500...
Added 20 embeddings to the index. Total datasets processed: 80
Saving progress...
Processing page 5 of 500...
Added 20 embeddings to the index. Total datasets processed: 100
Saving progress...
Processing page 6 of 500...
Added 19 embeddings to the index. Total datasets processed: 119
Saving progress...
Processing page 7 of 500...
Added 19 embeddings to the index. Total datasets processed: 138
Saving progress...
Processing page 8 of 500...
Added 19 embeddings to the index. Total datasets processed: 157
Saving progress...
Processing page 9 of 500...
Add

In [8]:
# Load the FAISS index from disk. The file name matches what build_and_save_embeddings
# writes with its default save_path ('kaggle_embeddings').
index_path = 'kaggle_embeddings.index'  # Adjust this path if necessary
loaded_index = faiss.read_index(index_path)  # reused by the query test cell below

# Number of vectors stored in the index
num_datapoints = loaded_index.ntotal

print(f"The FAISS index contains {num_datapoints} datapoints.")


The FAISS index contains 9978 datapoints.


## Testing a query

In [12]:
import pandas as pd
import zipfile

def find_similar_datasets(query: str, index: faiss.Index, dataset_info: List[Dict], k: int = 5) -> List[Dict]:
    """Return up to ``k`` entries of ``dataset_info`` nearest to ``query`` in ``index``.

    Args:
        query: Text to embed with ``get_embedding``.
        index: FAISS index whose rows are positionally aligned with ``dataset_info``.
        dataset_info: Dataset dicts, one per index row.
        k: Number of neighbours to request.

    Returns:
        The matching ``dataset_info`` entries in the order FAISS returned them.
        Fewer than ``k`` entries are returned when the index holds fewer vectors.

    Raises:
        IndexError: If the index returns a row position beyond ``dataset_info``,
            which means the two are out of sync.
    """
    query_embedding = get_embedding(query)

    # FAISS expects a 2-D float32 array: one row per query
    query_array = np.array([query_embedding]).astype('float32')

    _distances, indices = index.search(query_array, k)

    # FAISS pads missing neighbours with -1; without this filter
    # dataset_info[-1] would silently return the last entry as a "match".
    return [dataset_info[i] for i in indices[0] if i >= 0]


def _find_csv_file(directory: str) -> str:
    """Return ``<directory>/data.csv`` if it exists, else the first CSV (sorted by path) under ``directory``."""
    preferred_csv = os.path.join(directory, "data.csv")
    if os.path.isfile(preferred_csv):
        return preferred_csv

    csv_paths = sorted(
        os.path.join(root, file_name)
        for root, _dirs, file_names in os.walk(directory)
        for file_name in file_names
        if file_name.lower().endswith(".csv")
    )
    if not csv_paths:
        raise FileNotFoundError(f"No CSV file found in '{directory}'")
    return csv_paths[0]


def find_and_prepare_kaggle_dataset(treatment: str, outcome: str, dag_variables: List[str], index: faiss.Index, dataset_info: List[Dict]) -> pd.DataFrame:
    """Search for a dataset, let the user pick one, download it and map its columns.

    The search query is built from ``treatment``, ``outcome`` and ``dag_variables``.
    The user chooses among the hits via ``input()``. The chosen archive is downloaded
    to ``<dataset-name>.zip`` in the working directory and extracted into
    ``<dataset-name>/``.

    Args:
        treatment: Treatment variable name (used in the search query).
        outcome: Outcome variable name (used in the search query).
        dag_variables: Variable names passed to ``map_variables`` for column matching.
        index: FAISS index to search.
        dataset_info: Dataset dicts aligned with ``index``.

    Returns:
        A DataFrame restricted to the mapped columns and renamed to the DAG variable
        names, or ``None`` when the search yields nothing or the user enters 0.

    Raises:
        FileNotFoundError: If the downloaded archive contains no CSV file.
    """
    query = f"Dataset for analyzing the effect of {treatment} on {outcome}, considering variables: {', '.join(dag_variables)}"

    similar_datasets = find_similar_datasets(query, index, dataset_info)

    if not similar_datasets:
        print("No similar datasets found.")
        return None

    # The search may return fewer hits than requested, so derive the menu
    # bounds from the actual result instead of assuming five.
    num_choices = len(similar_datasets)

    print(f"Top {num_choices} similar datasets:")
    for i, dataset in enumerate(similar_datasets, 1):
        print(f"{i}. {dataset['ref']}: {dataset['title']}")
        # Descriptions may be empty or missing; never slice None
        description = dataset.get('description') or ''
        print(f"   Description: {description[:100]}...")  # Show first 100 characters of description

    while True:
        try:
            choice = int(input(f"\nChoose a dataset (1-{num_choices}) or 0 to cancel: "))
            if 0 <= choice <= num_choices:
                break
            else:
                print(f"Please enter a number between 0 and {num_choices}.")
        except ValueError:
            print("Please enter a valid number.")

    if choice == 0:
        print("Operation cancelled by the user.")
        return None

    selected_dataset = similar_datasets[choice - 1]
    dataset_ref = selected_dataset['ref']

    print(f"\nSelected dataset: {dataset_ref}")
    print(f"Dataset description: {selected_dataset.get('description') or ''}")

    print("Proceeding with dataset download and processing...")

    api.dataset_download_files(dataset_ref)

    # The ref has the form "<owner>/<dataset-name>"; the archive is named after the latter
    dataset_name = dataset_ref.split("/")[-1]

    with zipfile.ZipFile(f"{dataset_name}.zip", "r") as zip_ref:
        zip_ref.extractall(dataset_name)

    # Datasets rarely ship a file literally called data.csv, so locate one
    df = pd.read_csv(_find_csv_file(dataset_name))

    # NOTE(review): map_variables is not defined in the cells visible here; from its use
    # below it presumably returns {dag_variable: column_name or 'None'} — confirm.
    # NOTE(review): treatment and outcome are not passed to map_variables, so they only
    # appear in the result if the caller also lists them in dag_variables — confirm intent.
    variable_mapping = map_variables(dag_variables, df.columns.tolist())

    relevant_columns = [col for col in variable_mapping.values() if col != 'None']
    df_filtered = df[relevant_columns]

    # Invert the mapping so dataset column names become DAG variable names
    rename_dict = {v: k for k, v in variable_mapping.items() if v != 'None'}
    df_renamed = df_filtered.rename(columns=rename_dict)

    return df_renamed


# Test the find_and_prepare_kaggle_dataset function

# Load the dataset info.
# NOTE: pickle.load can execute arbitrary code; only load a file this notebook wrote itself.
with open('kaggle_embeddings_info.pkl', 'rb') as f:
    dataset_info = pickle.load(f)

# Define test parameters
treatment = "danceability"
outcome = "liked"
dag_variables = ["energy", "loudness", "tempo", "valence"]

# Call the function (prompts for a choice via input())
df_result = find_and_prepare_kaggle_dataset(treatment, outcome, dag_variables, loaded_index, dataset_info)

if df_result is None:
    # The function returns None when the user cancels; there is nothing to inspect
    print("No dataset was selected; nothing to display.")
else:
    # Display the first few rows and basic information about the resulting dataframe
    print(df_result.head())
    print("\nDataframe info:")
    df_result.info()  # info() prints its report itself and returns None

    # Check if all specified variables are present in the result
    all_variables = [treatment, outcome] + dag_variables
    missing_variables = [var for var in all_variables if var not in df_result.columns]

    if missing_variables:
        print(f"\nWarning: The following variables are missing from the result: {', '.join(missing_variables)}")
    else:
        print("\nAll specified variables are present in the result.")


Top 5 similar datasets:
1. yasaminjafarian/tiktokdataset: TikTokDataset
   Description: ...
2. imsparsh/deam-mediaeval-dataset-emotional-analysis-in-music: DEAM Dataset - Emotional Analysis in Music
   Description: ...
3. meeraajayakumar/spotify-user-behavior-dataset: Spotify User Behavior Dataset
   Description: ...
4. mahdiehhajian/women-clothes: women clothes👗🥻
   Description: ...
5. uldisvalainis/audio-emotions: Audio emotions
   Description: ...

Selected dataset: meeraajayakumar/spotify-user-behavior-dataset
Dataset description: 
Proceeding with dataset download and processing...
Dataset URL: https://www.kaggle.com/datasets/meeraajayakumar/spotify-user-behavior-dataset


NameError: name 'zipfile' is not defined