# The Sound of Emotions
## Features extraction

In [14]:
import json
import os
import re
import urllib
import urllib.parse
import urllib.request
from urllib.error import HTTPError

import numpy as np
import pandas as pd
import requests

In [7]:
# Path of the input CSV (tip: right-click the file in the notebook UI and copy-paste its path here)
file_name = "/data/notebook_files/Sample_Dataset.csv"
# Read the CSV and discard the user e-mail and user-name columns in one chained step
dataset = pd.read_csv(file_name).drop(columns=['User_Email', 'User_name'])

In [8]:
# Print the (rows, columns) shape, then display five randomly sampled rows
print(dataset.shape)
dataset.sample(5)

## YouTube Title and ID Extraction
Although the video title can be mined from the URL itself, I found it is better to extract the ID first and then the title.

### Extract the video ID:

In [9]:
# Compiled once for the whole notebook: an 11-character video ID that follows
# either "/" or "v=" and is terminated by the end of the string or by one of
# "&", "?" or "#" (the latter covers links carrying a "#t=..." fragment).
_YOUTUBE_ID_PATTERN = re.compile(r"(?:/|v=)([-\w]{11})(?:$|&|\?|#)")


def youtube_id_extract(url):
    '''
    Extract the 11-character video ID from a YouTube URL.

    Parameters
    ----------
    url : str
        A YouTube link, e.g. ``https://youtu.be/<id>`` or
        ``https://www.youtube.com/watch?v=<id>&t=10s``. Surrounding
        whitespace is ignored.

    Returns
    -------
    str or None
        The video ID, or ``None`` when ``url`` is not a string (for example a
        NaN cell) or contains no recognisable ID.
    '''
    # Missing cells arrive as float NaN / None; there is nothing to search in them
    if not isinstance(url, str):
        return None

    match = _YOUTUBE_ID_PATTERN.search(url.strip())
    return match.group(1) if match else None

In [10]:
# Quick check of the extractor on a short youtu.be link
youtube_id_extract('https://youtu.be/RgKAFK5djSk')

In [11]:
# Build the YouTube_ID column by running the extractor over every Youtube_link value
dataset['YouTube_ID'] = dataset.Youtube_link.apply(youtube_id_extract)

In [13]:
# Count the rows for which no video ID could be extracted ...
missing_youtube_ids = dataset['YouTube_ID'].isna().sum()
print(f'The number of songs with no YouTube ID is: {missing_youtube_ids}')
# ... and remove those rows from the dataset.
dataset.dropna(subset=['YouTube_ID'], inplace=True)

### Extract the video title from the YouTube ID:

In [15]:
def youtube_title_extract(video_id, timeout=10):
    '''
    Look up the title of a YouTube video through YouTube's oEmbed endpoint.

    Parameters
    ----------
    video_id : str
        The YouTube video ID to look up.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up (default 10).

    Returns
    -------
    str or None
        The ``title`` field of the JSON response, or ``None`` when the request
        fails (HTTP error, network error, timeout) or the response is not
        JSON containing a ``title``.
    '''
    params = {"format": "json", "url": "https://www.youtube.com/watch?v=%s" % video_id}
    url = "https://www.youtube.com/oembed?" + urllib.parse.urlencode(params)
    try:
        with urllib.request.urlopen(url, timeout=timeout) as response:
            data = json.loads(response.read().decode())
        return data['title']
    except (OSError, ValueError, KeyError, TypeError):
        # OSError covers HTTPError, URLError and socket timeouts; ValueError
        # covers undecodable bytes and malformed JSON; KeyError/TypeError cover
        # a payload without a 'title'. One bad video must not abort a
        # DataFrame-wide .apply().
        return None

In [16]:
# Quick check of the title lookup on a single video ID
youtube_title_extract('VOgFZfRVaww')

In [17]:
# Build the YouTube_Title column: one title lookup per YouTube_ID value
dataset['YouTube_Title'] = dataset.YouTube_ID.apply(youtube_title_extract)

In [18]:
# Report how many rows have no title (the title lookup returned None for them)
print(f'The number of songs with no YouTube Title is: {dataset.YouTube_Title.isna().sum()}')
# Drop the rows with no YouTube title.
dataset.dropna(subset=['YouTube_Title'], inplace = True)

### Get Spotify's ID:

In [21]:
# Connect to Spotify's API using the client-credentials grant.
# The credentials are read from the environment so that real secrets never have
# to be saved inside the notebook; the 'XXXXX' placeholders remain as fallback.
CLIENT_ID = os.environ.get('SPOTIFY_CLIENT_ID', 'XXXXX')
CLIENT_SECRET = os.environ.get('SPOTIFY_CLIENT_SECRET', 'XXXXX')

AUTH_URL = 'https://accounts.spotify.com/api/token'

# POST the credentials as form data
auth_response = requests.post(AUTH_URL, data={
    'grant_type': 'client_credentials',
    'client_id': CLIENT_ID,
    'client_secret': CLIENT_SECRET,
}, timeout=20)

# Stop here with a clear HTTP error if the credentials were rejected, instead of
# failing below with an opaque KeyError on 'access_token'
auth_response.raise_for_status()

# convert the response to JSON
auth_response_data = auth_response.json()

# save the access token used by every later Spotify request
access_token = auth_response_data['access_token']

In [22]:
def get_spotify_id(song_title, access_token, timeout=20):
    '''
    Search Spotify for a track and return the ID of the first hit.

    Parameters
    ----------
    song_title : str
        Free-text search query (here: the YouTube video title).
    access_token : str
        Bearer token for the Spotify Web API.
    timeout : float, optional
        Seconds to wait for the HTTP response (default 20).

    Returns
    -------
    str or None
        The ID of the first track returned by the search, or ``None`` when the
        query is empty / not a string, the search has no results, or the
        request fails (network error, HTTP error status, unexpected payload).
    '''
    # Nothing to search for (e.g. a NaN cell): skip the API call altogether
    if not isinstance(song_title, str) or not song_title.strip():
        return None

    # Set up API endpoint and parameters
    endpoint = "https://api.spotify.com/v1/search"
    params = {
        "q": song_title,
        "type": "track",
        "limit": 1
    }

    # Set up HTTP headers with access token
    headers = {
        "Authorization": "Bearer " + access_token
    }

    try:
        response = requests.get(endpoint, params=params, headers=headers, timeout=timeout)
        # Error responses carry an "error" object instead of "tracks"
        response.raise_for_status()
        items = response.json()["tracks"]["items"]
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Best effort: a single failed lookup must not abort a DataFrame-wide .apply()
        return None

    return items[0]["id"] if items else None

In [23]:
# Quick check of the search with a non-Latin (Hebrew) title
get_spotify_id('האמיתי', access_token)

In [24]:
# get_spotify_id returns one value (a track ID or None) per title, so apply it
# directly: wrapping each scalar in pd.Series would make .apply() build a
# DataFrame, which cannot be assigned to a single column when no lookup succeeds.
dataset['Spotify_ID'] = dataset['YouTube_Title'].apply(lambda title: get_spotify_id(title, access_token))

In [25]:
# Display five random rows to inspect the new Spotify_ID column
dataset.sample(5)

## Get Spotify's Features:

* **Danceability:** Danceability describes how suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity. A value of 0.0 is least danceable and 1.0 is most danceable.  
  
* **Acousticness:** A measure from 0.0 to 1.0 of whether the track is acoustic.  

* **Energy:** Energy is a measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity. Typically, energetic tracks feel fast, loud, and noisy.  

* **Instrumentalness:** Predicts whether a track contains no vocals. The closer the instrumentalness value is to 1.0, the greater likelihood the track contains no vocal content.  
  
* **Liveness:** Detects the presence of an audience in the recording. Higher liveness values represent an increased probability that the track was performed live.  
  
* **Loudness:** The overall loudness of a track in decibels (dB). Loudness values are averaged across the entire track. Values typically range between -60 and 0 dB.  
  
* **Speechiness:** Speechiness detects the presence of spoken words in a track. The more exclusively speech-like the recording (e.g. talk show, audio book, poetry), the closer to 1.0 the attribute value.  
  
* **Tempo:** The overall estimated tempo of a track in beats per minute (BPM). In musical terminology, tempo is the speed or pace of a given piece and derives directly from the average beat duration.  
  
* **Valence:** A measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track. Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry).

* **Key:** The key the track is in, given as an integer in standard pitch-class notation (0 = C, 1 = C♯/D♭, 2 = D, and so on); -1 means no key was detected.  

* **Mode:** The modality of the track (major or minor), encoded as 1 for major and 0 for minor. 

In [26]:
def get_audio_features(spotify_id, access_token, timeout=20):
    '''
    Fetch the audio features of one track from the Spotify Web API.

    Parameters
    ----------
    spotify_id : str
        Spotify track ID.
    access_token : str
        Bearer token for the Spotify Web API.
    timeout : float, optional
        Seconds to wait for the HTTP response (default 20).

    Returns
    -------
    dict or None
        A dict with the keys danceability, acousticness, energy,
        instrumentalness, liveness, loudness, speechiness, tempo, valence,
        key and mode (in that order), or ``None`` when ``spotify_id`` is
        missing, the request fails, or the response lacks any of those keys.
    '''
    # The fields copied from the response, in the order they appear in the result
    feature_names = (
        'danceability', 'acousticness', 'energy', 'instrumentalness',
        'liveness', 'loudness', 'speechiness', 'tempo', 'valence',
        'key', 'mode',
    )

    # No ID (None / NaN / empty string): skip the request, there is nothing to look up
    if not isinstance(spotify_id, str) or not spotify_id:
        return None

    # Set up API endpoint and HTTP headers with access token
    endpoint = f"https://api.spotify.com/v1/audio-features/{spotify_id}"
    headers = {
        "Authorization": "Bearer " + access_token
    }

    try:
        response = requests.get(endpoint, headers=headers, timeout=timeout)
        data = response.json()
        # An error payload has none of these keys and ends up in the KeyError branch
        return {name: data[name] for name in feature_names}
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Only expected failures are swallowed; KeyboardInterrupt and
        # programming errors still surface (the previous bare except hid them).
        return None

In [27]:
# Quick check of the audio-features lookup on a single track ID
get_audio_features('2ZBNclC5wm4GtiWaeh0DMx', access_token)

In [28]:
# Names of the Spotify audio-feature columns (same names as the keys returned by
# get_audio_features). NOTE(review): not referenced by any other visible cell.
spotify_features = ['danceability', 'acousticness', 'energy', 'instrumentalness', 'liveness',
                    'loudness', 'speechiness', 'tempo', 'valence', 'key', 'mode']

In [29]:
# One row of audio features per track; a failed lookup (None) expands to an all-missing row
feature_rows = dataset['Spotify_ID'].apply(
    lambda track_id: pd.Series(get_audio_features(track_id, access_token))
)
# Normalise missing entries to NaN, column by column
audio_features = feature_rows.apply(lambda column: column.fillna(np.nan))
# Attach the feature columns next to the existing ones (rows are aligned on the index)
dataset = pd.concat([dataset, audio_features], axis=1)

In [30]:
# Display five random rows to inspect the added audio-feature columns
dataset.sample(5)

## Retrieve Artist and Song Name Using Spotify's API

In [34]:
def get_track_info(spotify_id, access_token, timeout=20):
    '''
    Get the track name and first-listed artist for a Spotify track ID.

    Parameters
    ----------
    spotify_id : str
        Spotify track ID.
    access_token : str
        Bearer token for the Spotify Web API.
    timeout : float, optional
        Seconds to wait for the HTTP response (default 20).

    Returns
    -------
    dict or None
        ``{'artist': <name of the first artist>, 'song': <track name>}``, or
        ``None`` when ``spotify_id`` is missing or the lookup fails. A failed
        lookup also prints 'Error retrieving track information'.
    '''
    # No ID (None / NaN / empty string): nothing to look up, skip the request
    if not isinstance(spotify_id, str) or not spotify_id:
        return None

    try:
        response = requests.get(
            f'https://api.spotify.com/v1/tracks/{spotify_id}',
            headers={'Authorization': f'Bearer {access_token}'},
            timeout=timeout,
        )
        if response.status_code != 200:
            print('Error retrieving track information')
            return None

        # Parse the body once and read both fields from the same object
        track = response.json()
        return {'artist': track['artists'][0]['name'], 'song': track['name']}
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Network failure, non-JSON body, or a payload without name/artists
        print('Error retrieving track information')
        return None

In [None]:
# Look up artist and song name for every row and store them in two new columns.
# The ID column of this dataset is named 'Spotify_ID' (capitalised); indexing
# with 'spotify_id' raises KeyError.
# Giving each per-row Series an explicit ['artist', 'song'] index keeps the
# result two columns wide even for rows whose lookup returned None (those rows
# become NaN in both columns).
track_info = dataset['Spotify_ID'].apply(
    lambda x: pd.Series(get_track_info(x, access_token) or {}, index=['artist', 'song'], dtype=object)
)
dataset[['artist', 'song']] = track_info

In [32]:
# Display five random rows to inspect the artist and song columns
dataset.sample(5)

### Export the results

In [31]:
# Output file name (a relative path, so it is resolved against the current working directory)
FILENAME = 'Full_Dataset_sample.csv'
# Write the enriched dataset as a UTF-8 encoded CSV
dataset.to_csv(FILENAME, encoding="utf-8")