In [132]:
# Import required libraries
import pandas as pd
import numpy as np
import datetime
import re
import unicodedata
import warnings
pd.options.display.max_columns = None
pd.set_option('max_rows', 5000)
pd.options.display.float_format = '{:,.2f}'.format 
warnings.filterwarnings('ignore')

# **Upload Spotify Dataset**

In [133]:
# Read spotify dataset
spotify = pd.read_csv('spotify_practica.csv')
print(spotify.shape)
spotify.head(5)

(170349, 20)


Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,zip Code,genero
0,0.76,['Shel Silverstein'],0.67,210387,0.17,,1r3N7MoEGPHJyCjBlWCsx3,0.0,2.0,0.13,-13.07,1,The Unicorn,12.0,1962-10-13 00:00:00,0.33,97.08,0.33,38ee029,
1,0.92,['Mose Allison'],0.69,149547,0.24,,2RL8EkOOu0QhAikhUUvIRi,0.0,,0.08,-14.61,0,If You Live,19.0,1963-04-14 00:00:00,0.04,120.89,0.71,9p2304,
2,0.93,['水柳仙'],0.18,207173,0.09,0.0,3kKKEghp2EMh8992pWFylr,0.7,7.0,0.11,-26.6,1,鴛鴦夢,23.0,1963-11-14 00:00:00,0.04,101.48,0.13,2166ee6,
3,0.98,['Dean Martin'],0.47,182653,0.03,,0j3ideT0tRuIyRhOOUQjYa,0.0,7.0,0.15,-18.01,1,Fools Rush In,21.0,1964-06-07 00:00:00,0.04,135.56,0.19,67t06,
4,0.89,['Doris Day'],0.24,171400,0.34,,0od9zbD6Bcc8b7dCMRFWBZ,0.0,2.0,0.13,-9.81,0,The Christmas Waltz,22.0,1964-08-07 00:00:00,0.03,89.16,0.24,804p53,


In [134]:
# Identify type of variables in "spotify DataFrame"
spotify.dtypes

acousticness        float64
artists              object
danceability        float64
duration_ms           int64
energy              float64
explicit            float64
id                   object
instrumentalness    float64
key                 float64
liveness            float64
loudness            float64
mode                  int64
name                 object
popularity          float64
release_date         object
speechiness         float64
tempo               float64
valence             float64
zip Code             object
genero               object
dtype: object

## **(a)** **Tagging Variables By Type**

In [135]:
# Prefixes for variable types
# 'c_' --> Numeric Variables: Discrete & Continous
# 'v_' --> Categorical Variables
# 'd_' --> Date Type Variables
# 't_' --> Text Type Variables

c_feats = ['acousticness','danceability','duration_ms','energy',
           'instrumentalness','liveness','loudness',
           'speechiness','tempo','valence']
v_feats = ['mode','key','explicit','popularity','genero']
t_feats = ['artists','id','name','zip Code']
d_feats = ['release_date']

#cambiar genero a categorica

In [136]:
c_feats_new = ['c_' + x for x in c_feats]
v_feats_new = ['v_' + x for x in v_feats]
d_feats_new = ['d_' + x for x in d_feats]
t_feats_new = ['t_' + x for x in t_feats]

In [137]:
print(list(c_feats))
print(list(c_feats_new))

['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
['c_acousticness', 'c_danceability', 'c_duration_ms', 'c_energy', 'c_instrumentalness', 'c_liveness', 'c_loudness', 'c_speechiness', 'c_tempo', 'c_valence']


In [138]:
# Rename columns according to the type of variable
spotify.rename(columns=dict(zip(d_feats,d_feats_new)),inplace=True)
spotify.rename(columns=dict(zip(v_feats,v_feats_new)),inplace=True)
spotify.rename(columns=dict(zip(t_feats,t_feats_new)),inplace=True)
spotify.rename(columns=dict(zip(c_feats,c_feats_new)),inplace=True)
print(spotify.shape)
spotify.head(3)

(170349, 20)


Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,v_explicit,t_id,c_instrumentalness,v_key,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero
0,0.76,['Shel Silverstein'],0.67,210387,0.17,,1r3N7MoEGPHJyCjBlWCsx3,0.0,2.0,0.13,-13.07,1,The Unicorn,12.0,1962-10-13 00:00:00,0.33,97.08,0.33,38ee029,
1,0.92,['Mose Allison'],0.69,149547,0.24,,2RL8EkOOu0QhAikhUUvIRi,0.0,,0.08,-14.61,0,If You Live,19.0,1963-04-14 00:00:00,0.04,120.89,0.71,9p2304,
2,0.93,['水柳仙'],0.18,207173,0.09,0.0,3kKKEghp2EMh8992pWFylr,0.7,7.0,0.11,-26.6,1,鴛鴦夢,23.0,1963-11-14 00:00:00,0.04,101.48,0.13,2166ee6,


# **Delete Duplicates**

## **(b)** **Approach: Delete Duplicates Using: "duplicated()"**

In [139]:
spotify.duplicated()

0         False
1         False
2         False
3         False
4         False
          ...  
170344    False
170345    False
170346    False
170347    False
170348    False
Length: 170349, dtype: bool

In [140]:
spotify[spotify.duplicated()]

Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,v_explicit,t_id,c_instrumentalness,v_key,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero
1668,1.0,"['Alexander Scriabin', 'Vladimir Horowitz']",0.32,326067,0.15,0.0,6T0mZB7p3qzOifdXZ3Su9P,0.92,4.0,0.13,-18.73,1,"Sonata No. 3, Op. 23 in F-Sharp Minor: IV. Pre...",0.0,1928-07-08 00:00:00,0.04,66.95,0.05,12935,
1669,0.73,['Seweryn Goszczyński'],0.72,100700,0.29,0.0,6qxoAGlWUrVYbvq3x6S08C,0.0,,0.29,-15.86,1,Chapter 2.23 - Zamek kaniowski,0.0,1928-04-01 00:00:00,0.94,114.06,0.69,7058,
1670,0.78,['The Merlons'],0.43,128707,0.53,,41wmjlc9ChMBHZ8fB0btdM,0.98,,0.14,-16.68,1,Loda,0.0,1930-05-06 00:00:00,0.03,110.99,0.35,38879,
1681,1.0,"['Alexander Scriabin', 'Vladimir Horowitz']",0.32,326067,0.15,0.0,6T0mZB7p3qzOifdXZ3Su9P,0.92,4.0,0.13,-18.73,1,"Sonata No. 3, Op. 23 in F-Sharp Minor: IV. Pre...",0.0,1928-07-08 00:00:00,0.04,66.95,0.05,12935,
1683,0.73,['Seweryn Goszczyński'],0.72,100700,0.29,0.0,6qxoAGlWUrVYbvq3x6S08C,0.0,,0.29,-15.86,1,Chapter 2.23 - Zamek kaniowski,0.0,1928-04-01 00:00:00,0.94,114.06,0.69,7058,
1686,0.78,['The Merlons'],0.43,128707,0.53,,41wmjlc9ChMBHZ8fB0btdM,0.98,,0.14,-16.68,1,Loda,0.0,1930-05-06 00:00:00,0.03,110.99,0.35,38879,
167836,0.99,['Carl Woitschach'],0.71,158648,0.2,,6KbQ3uYMLKb5jDxLF7wYDD,0.56,10.0,0.15,-12.43,1,Singende Bataillone 1. Teil,0.0,1928-05-05 00:00:00,0.05,118.47,0.78,98638,20.0
167837,0.99,"['Robert Schumann', 'Vladimir Horowitz']",0.38,282133,0.01,,6KuQTIu1KoTTkLXKrwlLPV,0.9,8.0,0.08,-28.45,1,"Fantasiestücke, Op. 111: Più tosto lento",0.0,1928-11-13 00:00:00,0.05,83.97,0.08,81655,48.0
167838,0.6,['Seweryn Goszczyński'],0.75,104300,0.22,0.0,6L63VW0PibdM1HDSBoqnoM,0.0,5.0,0.12,-19.92,0,Chapter 1.18 - Zamek kaniowski,0.0,1928-07-26 00:00:00,0.93,107.18,0.88,47232,16.0
167839,0.99,['Francisco Canaro'],0.78,180760,0.13,0.0,6M94FkXd15sOAOQYRnWPN8,0.89,1.0,0.11,-14.73,0,Bebamos Juntos - Instrumental (Remasterizado),0.0,1928-04-13 00:00:00,0.09,108.0,0.72,75605,20.0


In [141]:
spotify.drop_duplicates(inplace = True)

In [142]:
# Verify if duplicate records were deleted
spotify.duplicated().sum()

0

In [143]:
# Create a new index and delete the previous one
spotify.reset_index(drop = True, inplace = True)

In [144]:
# Verify dataframe shape after remove duplicates
print(spotify.shape)
spotify.head(5)

(169985, 20)


Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,v_explicit,t_id,c_instrumentalness,v_key,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero
0,0.76,['Shel Silverstein'],0.67,210387,0.17,,1r3N7MoEGPHJyCjBlWCsx3,0.0,2.0,0.13,-13.07,1,The Unicorn,12.0,1962-10-13 00:00:00,0.33,97.08,0.33,38ee029,
1,0.92,['Mose Allison'],0.69,149547,0.24,,2RL8EkOOu0QhAikhUUvIRi,0.0,,0.08,-14.61,0,If You Live,19.0,1963-04-14 00:00:00,0.04,120.89,0.71,9p2304,
2,0.93,['水柳仙'],0.18,207173,0.09,0.0,3kKKEghp2EMh8992pWFylr,0.7,7.0,0.11,-26.6,1,鴛鴦夢,23.0,1963-11-14 00:00:00,0.04,101.48,0.13,2166ee6,
3,0.98,['Dean Martin'],0.47,182653,0.03,,0j3ideT0tRuIyRhOOUQjYa,0.0,7.0,0.15,-18.01,1,Fools Rush In,21.0,1964-06-07 00:00:00,0.04,135.56,0.19,67t06,
4,0.89,['Doris Day'],0.24,171400,0.34,,0od9zbD6Bcc8b7dCMRFWBZ,0.0,2.0,0.13,-9.81,0,The Christmas Waltz,22.0,1964-08-07 00:00:00,0.03,89.16,0.24,804p53,


## **(c)** **Data Completeness**

In [145]:
# Function used to get completeness values
# The input/argument is --> spotify
def completeness(dataframe):
    comp = pd.DataFrame(dataframe.isnull().sum())
    comp.reset_index(inplace = True)
    comp = comp.rename(columns = {'index':'column', 0:'total'})
    comp['completeness'] = (1 - comp['total']/dataframe.shape[0])*100
    comp = comp.sort_values(by = 'completeness', ascending = True)
    comp.reset_index(drop = True, inplace = True)
    return comp

In [146]:
# Apply "completeness function" to spotify dataframe
completeness(spotify)

Unnamed: 0,column,total,completeness
0,v_explicit,62905,62.99
1,v_key,35759,78.96
2,v_popularity,3501,97.94
3,v_genero,1698,99.0
4,c_valence,0,100.0
5,c_tempo,0,100.0
6,c_speechiness,0,100.0
7,d_release_date,0,100.0
8,t_name,0,100.0
9,v_mode,0,100.0


## **(d)** **Delete Variables With >=20% of Missing Value**

In [147]:
# Drop columns with 20% or more missing values
spotify.drop(columns = ['v_explicit', 'v_key'], inplace = True)
spotify.reset_index(drop = True, inplace = True)
print(spotify.shape)
spotify.head(5)

(169985, 18)


Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,t_id,c_instrumentalness,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero
0,0.76,['Shel Silverstein'],0.67,210387,0.17,1r3N7MoEGPHJyCjBlWCsx3,0.0,0.13,-13.07,1,The Unicorn,12.0,1962-10-13 00:00:00,0.33,97.08,0.33,38ee029,
1,0.92,['Mose Allison'],0.69,149547,0.24,2RL8EkOOu0QhAikhUUvIRi,0.0,0.08,-14.61,0,If You Live,19.0,1963-04-14 00:00:00,0.04,120.89,0.71,9p2304,
2,0.93,['水柳仙'],0.18,207173,0.09,3kKKEghp2EMh8992pWFylr,0.7,0.11,-26.6,1,鴛鴦夢,23.0,1963-11-14 00:00:00,0.04,101.48,0.13,2166ee6,
3,0.98,['Dean Martin'],0.47,182653,0.03,0j3ideT0tRuIyRhOOUQjYa,0.0,0.15,-18.01,1,Fools Rush In,21.0,1964-06-07 00:00:00,0.04,135.56,0.19,67t06,
4,0.89,['Doris Day'],0.24,171400,0.34,0od9zbD6Bcc8b7dCMRFWBZ,0.0,0.13,-9.81,0,The Christmas Waltz,22.0,1964-08-07 00:00:00,0.03,89.16,0.24,804p53,


## **(e)** **How many records in the variable "zip Code" are invalid values? That is, they contain letters**

In [148]:
# Using regex to identify invalid records in 't_zip Code' variable (it means that contains letters)
invalid_zipcode = spotify['t_zip Code'].str.contains(r'[a-zA-Z]').sum()
print(f'Total Invalid Zip Code Records: {invalid_zipcode}')
print(f'Total Records: {spotify.shape[0]}')

Total Invalid Zip Code Records: 4341
Total Records: 169985


## **(f)** **Delete records that don't have a valid "zip Code", that is, it contains letters in the values**

In [149]:
spotify['t_zip Code'] = spotify['t_zip Code'].astype('unicode')
spotify = spotify[spotify['t_zip Code'].map(lambda x: x.isnumeric())]
spotify.reset_index(drop = True, inplace = True)
print(spotify.shape)
spotify.head(3)

(165644, 18)


Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,t_id,c_instrumentalness,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero
0,0.0,['Slayer'],0.3,400093,0.92,0dt3XQL7LjTNyizTXY00yD,0.18,0.25,-8.36,1,Crypts of Eternity,33.0,1985-05-26 00:00:00,0.07,101.92,0.14,13126,
1,0.33,['Miguel Mateos - Zas'],0.55,290640,0.97,096vJncZZrwi4lLUoggD9y,0.17,0.08,-7.04,0,Mi Sombra En la Pared,45.0,1986-10-02 00:00:00,0.08,174.59,0.79,14223,
2,0.0,['Paula Abdul'],0.73,231921,0.78,7xHYQboEmdZXWuXpJf9h30,0.0,0.05,-9.13,0,Knocked Out,32.0,1988-08-15 00:00:00,0.05,116.23,0.95,84620,


## **(g)** **How many records in the variable "gender" are invalid values? , that is, they contain letters**

In [150]:
# Using regex to identify letters in 'v_genero'
invalid_gender = spotify['v_genero'].str.contains(r'[a-zA-Z-ZéüöêåøЧастьХемиуэйЧасть]').sum()
print(f'Total Records With Invalid Gender: {invalid_gender}')
print(f'Total Records: {spotify.shape[0]}')

Total Records With Invalid Gender: 10
Total Records: 165644


## **(h)** **Delete the records that don't have a valid "gender", that is, that contain letters in the values**

In [151]:
spotify.head(3)

Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,t_id,c_instrumentalness,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero
0,0.0,['Slayer'],0.3,400093,0.92,0dt3XQL7LjTNyizTXY00yD,0.18,0.25,-8.36,1,Crypts of Eternity,33.0,1985-05-26 00:00:00,0.07,101.92,0.14,13126,
1,0.33,['Miguel Mateos - Zas'],0.55,290640,0.97,096vJncZZrwi4lLUoggD9y,0.17,0.08,-7.04,0,Mi Sombra En la Pared,45.0,1986-10-02 00:00:00,0.08,174.59,0.79,14223,
2,0.0,['Paula Abdul'],0.73,231921,0.78,7xHYQboEmdZXWuXpJf9h30,0.0,0.05,-9.13,0,Knocked Out,32.0,1988-08-15 00:00:00,0.05,116.23,0.95,84620,


In [152]:
spotify = spotify[~spotify['v_genero'].str.contains(r'[a-zA-Z-ZéüöêåøЧастьХемиуэйЧасть]', na=False)]
print(spotify.shape)
spotify.head(3)

(165634, 18)


Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,t_id,c_instrumentalness,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero
0,0.0,['Slayer'],0.3,400093,0.92,0dt3XQL7LjTNyizTXY00yD,0.18,0.25,-8.36,1,Crypts of Eternity,33.0,1985-05-26 00:00:00,0.07,101.92,0.14,13126,
1,0.33,['Miguel Mateos - Zas'],0.55,290640,0.97,096vJncZZrwi4lLUoggD9y,0.17,0.08,-7.04,0,Mi Sombra En la Pared,45.0,1986-10-02 00:00:00,0.08,174.59,0.79,14223,
2,0.0,['Paula Abdul'],0.73,231921,0.78,7xHYQboEmdZXWuXpJf9h30,0.0,0.05,-9.13,0,Knocked Out,32.0,1988-08-15 00:00:00,0.05,116.23,0.95,84620,


## **(i)** **Clean the variable "name", remove special characters and everything must be in lowercase**

In [153]:
# This functions just map the column 'name' to keep names even if they have asian characters
import re
def cjk_detect(texts):
    # Korean
    if re.search("[\uac00-\ud7a3]", texts):
        return texts
    # Japanese
    if re.search("[\u3040-\u30ff]", texts):
        return texts
    # Chinese
    if re.search("[\u4e00-\u9FFF]", texts):
        return texts
    else:
        return texts

In [154]:
# This functions is used to remove special marks/characters
def remove_punct(text):
    try:
        text=text.replace(".",' ').replace(";",' ').replace(":",' ').replace(",",' ')
        text=text.replace("(",' ').replace(")",' ').replace("|",' ').replace('"',' ')
        text=text.replace("%",' ').replace("$",' ').replace("/",' ').replace('\'',' ')
        text=text.replace("-",' ').replace("_",' ').replace("*",' ').replace('+',' ')
        text=text.replace("#",' ').replace("@",' ').replace("!",' ').replace('?',' ')
        text=text.replace("[",' ').replace("]",' ').replace("'",' ').replace('¡',' ')
    except:
        pass
    return text 

In [155]:
# This functions is used to clean 'name' and convert to lowercase
def clean_text(text):
    text=text.lower()
    text=remove_punct(text)
    return text

In [156]:
# Apply 'cjk_detect' function just to return text in its native format
spotify['t_name'] = spotify['t_name'].apply(lambda row: cjk_detect(row))
spotify.reset_index(drop = True, inplace = True)
spotify.head(5)

Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,t_id,c_instrumentalness,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero
0,0.0,['Slayer'],0.3,400093,0.92,0dt3XQL7LjTNyizTXY00yD,0.18,0.25,-8.36,1,Crypts of Eternity,33.0,1985-05-26 00:00:00,0.07,101.92,0.14,13126,
1,0.33,['Miguel Mateos - Zas'],0.55,290640,0.97,096vJncZZrwi4lLUoggD9y,0.17,0.08,-7.04,0,Mi Sombra En la Pared,45.0,1986-10-02 00:00:00,0.08,174.59,0.79,14223,
2,0.0,['Paula Abdul'],0.73,231921,0.78,7xHYQboEmdZXWuXpJf9h30,0.0,0.05,-9.13,0,Knocked Out,32.0,1988-08-15 00:00:00,0.05,116.23,0.95,84620,
3,0.74,['Henry Fiol'],0.71,305058,0.63,6M5z2Pca6OuN4l5n5kId3E,0.0,0.09,-10.36,0,Zumbale,46.0,1991-04-16 00:00:00,0.04,95.45,0.92,22824,
4,0.74,['Eric Clapton'],0.6,216800,0.22,5tdEWfBGNX7a7zD78tUwLZ,0.06,0.88,-15.05,0,Walkin' Blues - Acoustic; Live at MTV Unplugge...,45.0,1992-08-15 00:00:00,0.05,85.33,0.41,27519,


In [157]:
# Apply 'clean_text' function to convert it to lowercase without affect (Japanese, Chinese & Korean) names
spotify['t_name'] = spotify['t_name'].apply(lambda row: clean_text(row))
spotify.reset_index(drop = True, inplace = True)
spotify.head(5)

Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,t_id,c_instrumentalness,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero
0,0.0,['Slayer'],0.3,400093,0.92,0dt3XQL7LjTNyizTXY00yD,0.18,0.25,-8.36,1,crypts of eternity,33.0,1985-05-26 00:00:00,0.07,101.92,0.14,13126,
1,0.33,['Miguel Mateos - Zas'],0.55,290640,0.97,096vJncZZrwi4lLUoggD9y,0.17,0.08,-7.04,0,mi sombra en la pared,45.0,1986-10-02 00:00:00,0.08,174.59,0.79,14223,
2,0.0,['Paula Abdul'],0.73,231921,0.78,7xHYQboEmdZXWuXpJf9h30,0.0,0.05,-9.13,0,knocked out,32.0,1988-08-15 00:00:00,0.05,116.23,0.95,84620,
3,0.74,['Henry Fiol'],0.71,305058,0.63,6M5z2Pca6OuN4l5n5kId3E,0.0,0.09,-10.36,0,zumbale,46.0,1991-04-16 00:00:00,0.04,95.45,0.92,22824,
4,0.74,['Eric Clapton'],0.6,216800,0.22,5tdEWfBGNX7a7zD78tUwLZ,0.06,0.88,-15.05,0,walkin blues acoustic live at mtv unplugge...,45.0,1992-08-15 00:00:00,0.05,85.33,0.41,27519,


## **(j)** **From the variable "artist" select only the first one that appears in the list in addition eliminate special characters**

In [158]:
first_artist = spotify['t_artists'].iloc[0]
clean_first_artist = clean_text(first_artist)
clean_first_artist = remove_punct(first_artist)
print(f'No Clean First Artist: {first_artist}')
print(f'Clean First Artist: {clean_first_artist}')

No Clean First Artist: ['Slayer']
Clean First Artist:   Slayer  


## **(k)** **Normalize the variable "gender" in such a way that you get only 8 categories**

In [159]:
print(spotify.shape)

(165634, 18)


In [160]:
# Checking for null values in 'v_genero' variable
spotify['v_genero'].isnull().sum()

1642

In [161]:
spotify['v_genero'].value_counts()

47.0    27168
20.0    24872
50.0    20154
12.0    18329
16.0    18067
21.0    17834
47.0     5812
20.0     5398
50.0     4247
21.0     3899
16.0     3843
12.0     3774
30.0     2653
49.0     2627
48.0     2566
30.0      579
49.0      566
48.0      551
3.0       448
2.0       286
1.0       147
3.0        84
2.0        65
1.0        23
Name: v_genero, dtype: int64

In [162]:
spotify['v_genero'].value_counts(normalize=True)

47.0   0.17
20.0   0.15
50.0   0.12
12.0   0.11
16.0   0.11
21.0   0.11
47.0   0.04
20.0   0.03
50.0   0.03
21.0   0.02
16.0   0.02
12.0   0.02
30.0   0.02
49.0   0.02
48.0   0.02
30.0   0.00
49.0   0.00
48.0   0.00
3.0    0.00
2.0    0.00
1.0    0.00
3.0    0.00
2.0    0.00
1.0    0.00
Name: v_genero, dtype: float64

In [163]:
spotify['v_genero'].value_counts(1)[-4].sum()

0.0008963851895214401

In [164]:
spotify['v_genero'].value_counts(1)[-3:].sum()

0.001048831650324406

In [165]:
spotify_genero_norm = dict(zip(list(spotify['v_genero'].value_counts(1)[-17:].index),['Others']*90))
spotify_genero_norm

{'20.0': 'Others',
 '50.0': 'Others',
 '21.0': 'Others',
 '16.0': 'Others',
 '12.0': 'Others',
 30.0: 'Others',
 49.0: 'Others',
 48.0: 'Others',
 '30.0': 'Others',
 '49.0': 'Others',
 '48.0': 'Others',
 3.0: 'Others',
 2.0: 'Others',
 1.0: 'Others',
 '3.0': 'Others',
 '2.0': 'Others',
 '1.0': 'Others'}

In [166]:
spotify['v_genero'].replace(spotify_genero_norm).value_counts()

Others    31756
47.0      27168
20.0      24872
50.0      20154
12.0      18329
16.0      18067
21.0      17834
47.0       5812
Name: v_genero, dtype: int64

## **(l)** **Add the following columns to your dataset: zip, lat, lng, city, state name using the zips table**

In [167]:
# Read 'zips_practica.csv' data table
zips = pd.read_csv('zips_practica.csv')
print(zips.shape)
zips.head(4)

(33099, 19)


Unnamed: 0,zip,lat,lng,city,state_id,state_name,zcta,parent_zcta,population,density,county_fips,county_name,county_weights,county_names_all,county_fips_all,imprecise,military,timezone,c_lng
0,601,18.18,-66.75,Adjunta28s,PR,Puerto 345Rico,True,,17242,111.4,72001,Adjuntas,"{'72001':99.43,'72141':0.57}",Adjuntas|Utuado,72001|72141,False,False,America/Puerto_Rico,
1,602,18.36,-67.18,A543guada,PR,P733uerto Rico,True,,38442,523.5,72003,Aguada,{'72003':100},Aguada,72003,False,False,America/Puerto_Rico,
2,603,18.45,-67.12,Ag188uadilla,PR,Pue888rto Rico,True,,48814,667.9,72005,Aguadilla,{'72005':100},Aguadilla,72005,False,False,America/Puerto_Rico,
3,606,18.17,-66.94,Ma297ricao,PR,Puerto Ric683o,True,,6437,60.4,72093,Maricao,"{'72093':94.88,'72121':1.35,'72153':3.78}",Maricao|Yauco|Sabana Grande,72093|72153|72121,False,False,America/Puerto_Rico,


In [168]:
zips.dtypes

zip                   int64
lat                 float64
lng                 float64
city                 object
state_id             object
state_name           object
zcta                   bool
parent_zcta         float64
population            int64
density             float64
county_fips           int64
county_name          object
county_weights       object
county_names_all     object
county_fips_all      object
imprecise              bool
military               bool
timezone             object
c_lng               float64
dtype: object

In [169]:
spotify.dtypes

c_acousticness        float64
t_artists              object
c_danceability        float64
c_duration_ms           int64
c_energy              float64
t_id                   object
c_instrumentalness    float64
c_liveness            float64
c_loudness            float64
v_mode                  int64
t_name                 object
v_popularity          float64
d_release_date         object
c_speechiness         float64
c_tempo               float64
c_valence             float64
t_zip Code             object
v_genero               object
dtype: object

In [170]:
zips_subset = zips[['zip','lat','lng','city','state_name']]
print(zips_subset.shape)
zips_subset.head(3)

(33099, 5)


Unnamed: 0,zip,lat,lng,city,state_name
0,601,18.18,-66.75,Adjunta28s,Puerto 345Rico
1,602,18.36,-67.18,A543guada,P733uerto Rico
2,603,18.45,-67.12,Ag188uadilla,Pue888rto Rico


In [171]:
spotify.head(3)

Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,t_id,c_instrumentalness,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero
0,0.0,['Slayer'],0.3,400093,0.92,0dt3XQL7LjTNyizTXY00yD,0.18,0.25,-8.36,1,crypts of eternity,33.0,1985-05-26 00:00:00,0.07,101.92,0.14,13126,
1,0.33,['Miguel Mateos - Zas'],0.55,290640,0.97,096vJncZZrwi4lLUoggD9y,0.17,0.08,-7.04,0,mi sombra en la pared,45.0,1986-10-02 00:00:00,0.08,174.59,0.79,14223,
2,0.0,['Paula Abdul'],0.73,231921,0.78,7xHYQboEmdZXWuXpJf9h30,0.0,0.05,-9.13,0,knocked out,32.0,1988-08-15 00:00:00,0.05,116.23,0.95,84620,


In [172]:
spotify['t_zip Code'] = spotify['t_zip Code'].astype(str).astype(int)

In [173]:
# Merge spotify & zips_subset dataframes
from pandas.core.reshape.merge import merge
spotify = spotify.merge(zips_subset,right_on = 'zip',left_on = 't_zip Code', how = 'left')
print(spotify.shape)
spotify.head(5)

(165634, 23)


Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,t_id,c_instrumentalness,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero,zip,lat,lng,city,state_name
0,0.0,['Slayer'],0.3,400093,0.92,0dt3XQL7LjTNyizTXY00yD,0.18,0.25,-8.36,1,crypts of eternity,33.0,1985-05-26 00:00:00,0.07,101.92,0.14,13126,,13126,43.43,-76.46,Osweg970o,N18ew York
1,0.33,['Miguel Mateos - Zas'],0.55,290640,0.97,096vJncZZrwi4lLUoggD9y,0.17,0.08,-7.04,0,mi sombra en la pared,45.0,1986-10-02 00:00:00,0.08,174.59,0.79,14223,,14223,42.97,-78.85,Bu542ffalo,New747 York
2,0.0,['Paula Abdul'],0.73,231921,0.78,7xHYQboEmdZXWuXpJf9h30,0.0,0.05,-9.13,0,knocked out,32.0,1988-08-15 00:00:00,0.05,116.23,0.95,84620,,84620,38.91,-111.93,Au287rora,277Utah
3,0.74,['Henry Fiol'],0.71,305058,0.63,6M5z2Pca6OuN4l5n5kId3E,0.0,0.09,-10.36,0,zumbale,46.0,1991-04-16 00:00:00,0.04,95.45,0.92,22824,,22824,38.84,-78.63,Edinbur844g,630Virginia
4,0.74,['Eric Clapton'],0.6,216800,0.22,5tdEWfBGNX7a7zD78tUwLZ,0.06,0.88,-15.05,0,walkin blues acoustic live at mtv unplugge...,45.0,1992-08-15 00:00:00,0.05,85.33,0.41,27519,,27519,35.81,-78.89,C492ary,N29orth Carolina


## **(m)** **Convert the variables "lat" and "lng" into a float type and validate data consistency**

In [174]:
# Convert 'lat' & 'lng' to float type variables
spotify['lat'] = spotify['lat'].astype(str).astype(float)
spotify['lng'] = spotify['lng'].astype(str).astype(float)

In [175]:
# Verify that 'lat' and 'lng are float type variables' --> consistency
spotify.dtypes

c_acousticness        float64
t_artists              object
c_danceability        float64
c_duration_ms           int64
c_energy              float64
t_id                   object
c_instrumentalness    float64
c_liveness            float64
c_loudness            float64
v_mode                  int64
t_name                 object
v_popularity          float64
d_release_date         object
c_speechiness         float64
c_tempo               float64
c_valence             float64
t_zip Code              int64
v_genero               object
zip                     int64
lat                   float64
lng                   float64
city                   object
state_name             object
dtype: object

In [176]:
spotify['lat'].value_counts()

39.98    20
45.28    19
31.43    19
39.85    19
41.67    19
         ..
39.16     1
37.31     1
34.89     1
34.89     1
41.57     1
Name: lat, Length: 32554, dtype: int64

In [177]:
# Validate 'lat' & 'lng' values
    ## Latitude must be a number between -90 and 90
    ## Longitude must a number between -180 and 180

def lat_val(value):
    if -90<=value<=+90:
        return 'Correct'
    else:
        return 'Incorrect'
    
def lng_val(value):
    if -180<=value<=+180:
        return 'Correct'
    else:
        return 'Incorrect'

In [178]:
# This temporal variable is created to validate 'lat' values are correct
spotify['lat_validation'] = spotify['lat'].apply(lambda row: lat_val(row))
spotify.head(5)

Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,t_id,c_instrumentalness,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero,zip,lat,lng,city,state_name,lat_validation
0,0.0,['Slayer'],0.3,400093,0.92,0dt3XQL7LjTNyizTXY00yD,0.18,0.25,-8.36,1,crypts of eternity,33.0,1985-05-26 00:00:00,0.07,101.92,0.14,13126,,13126,43.43,-76.46,Osweg970o,N18ew York,Correct
1,0.33,['Miguel Mateos - Zas'],0.55,290640,0.97,096vJncZZrwi4lLUoggD9y,0.17,0.08,-7.04,0,mi sombra en la pared,45.0,1986-10-02 00:00:00,0.08,174.59,0.79,14223,,14223,42.97,-78.85,Bu542ffalo,New747 York,Correct
2,0.0,['Paula Abdul'],0.73,231921,0.78,7xHYQboEmdZXWuXpJf9h30,0.0,0.05,-9.13,0,knocked out,32.0,1988-08-15 00:00:00,0.05,116.23,0.95,84620,,84620,38.91,-111.93,Au287rora,277Utah,Correct
3,0.74,['Henry Fiol'],0.71,305058,0.63,6M5z2Pca6OuN4l5n5kId3E,0.0,0.09,-10.36,0,zumbale,46.0,1991-04-16 00:00:00,0.04,95.45,0.92,22824,,22824,38.84,-78.63,Edinbur844g,630Virginia,Correct
4,0.74,['Eric Clapton'],0.6,216800,0.22,5tdEWfBGNX7a7zD78tUwLZ,0.06,0.88,-15.05,0,walkin blues acoustic live at mtv unplugge...,45.0,1992-08-15 00:00:00,0.05,85.33,0.41,27519,,27519,35.81,-78.89,C492ary,N29orth Carolina,Correct


In [179]:
# This temporal variable is created to validate 'lng' values are correct
spotify['lng_validation'] = spotify['lng'].apply(lambda row: lat_val(row))
spotify.head(10)

Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,t_id,c_instrumentalness,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero,zip,lat,lng,city,state_name,lat_validation,lng_validation
0,0.0,['Slayer'],0.3,400093,0.92,0dt3XQL7LjTNyizTXY00yD,0.18,0.25,-8.36,1,crypts of eternity,33.0,1985-05-26 00:00:00,0.07,101.92,0.14,13126,,13126,43.43,-76.46,Osweg970o,N18ew York,Correct,Correct
1,0.33,['Miguel Mateos - Zas'],0.55,290640,0.97,096vJncZZrwi4lLUoggD9y,0.17,0.08,-7.04,0,mi sombra en la pared,45.0,1986-10-02 00:00:00,0.08,174.59,0.79,14223,,14223,42.97,-78.85,Bu542ffalo,New747 York,Correct,Correct
2,0.0,['Paula Abdul'],0.73,231921,0.78,7xHYQboEmdZXWuXpJf9h30,0.0,0.05,-9.13,0,knocked out,32.0,1988-08-15 00:00:00,0.05,116.23,0.95,84620,,84620,38.91,-111.93,Au287rora,277Utah,Correct,Incorrect
3,0.74,['Henry Fiol'],0.71,305058,0.63,6M5z2Pca6OuN4l5n5kId3E,0.0,0.09,-10.36,0,zumbale,46.0,1991-04-16 00:00:00,0.04,95.45,0.92,22824,,22824,38.84,-78.63,Edinbur844g,630Virginia,Correct,Correct
4,0.74,['Eric Clapton'],0.6,216800,0.22,5tdEWfBGNX7a7zD78tUwLZ,0.06,0.88,-15.05,0,walkin blues acoustic live at mtv unplugge...,45.0,1992-08-15 00:00:00,0.05,85.33,0.41,27519,,27519,35.81,-78.89,C492ary,N29orth Carolina,Correct,Correct
5,0.04,['Tracy Lawrence'],0.61,173667,0.62,5eSPQr9YN3vxkpuOM30wks,0.0,0.27,-13.76,1,can t break it to my heart,34.0,1993-01-11 00:00:00,0.03,136.45,0.74,66527,,66527,39.75,-95.68,P696owhattan,Kans810as,Correct,Incorrect
6,0.0,['311'],0.62,257827,0.97,1JRxOhK2heeRsBsB2Dt4Vs,0.0,0.11,-6.31,1,do you right,34.0,1993-11-08 00:00:00,0.1,110.03,0.48,24060,,24060,37.26,-80.42,Blacksb286urg,Vir629ginia,Correct,Correct
7,0.03,['Bone Thugs-N-Harmony'],0.88,292147,0.47,27LvxLKUfzL6b6WMdOQAVU,0.0,0.17,-10.65,1,no shorts no losses,38.0,1995-04-23 00:00:00,0.24,141.38,0.87,58062,,58062,46.64,-97.83,Nom402e,North Dako638ta,Correct,Incorrect
8,0.13,['H.O.T.'],0.74,217227,0.98,0K25zmumCzn2kFmh9zcLWy,0.0,0.14,-2.32,1,candy,47.0,1996-02-27 00:00:00,0.04,119.99,0.95,72347,,72347,35.39,-91.0,Hickory 539Ridge,Ar160kansas,Correct,Incorrect
9,0.5,['Lee Ann Womack'],0.52,212920,0.28,1rVZ7EqoVJfrciRW80Uljh,0.0,0.13,-10.6,1,the fool,39.0,1997-06-25 00:00:00,0.03,80.1,0.29,44827,,44827,40.82,-82.76,Crestli544ne,686Ohio,Correct,Correct


In [180]:
# Checking 'lat' & 'lng' variables are 'Correct' for all records
spotify['lng_validation'].value_counts()

Correct      90980
Incorrect    74654
Name: lng_validation, dtype: int64

In [181]:
# Checking 'lat' & 'lng' variables are 'Correct' for all records
spotify['lat_validation'].value_counts()

Correct    165634
Name: lat_validation, dtype: int64

## **(n)** **From the variable "city" and "state" remove the digits found within the text strings**

In [182]:
spotify['city'] = spotify['city'].str.replace(r'[0-9]','')
spotify['state_name'] = spotify['state_name'].str.replace(r'[0-9]','')
print(spotify.shape)
spotify.head(5)

(165634, 25)


Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,t_id,c_instrumentalness,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero,zip,lat,lng,city,state_name,lat_validation,lng_validation
0,0.0,['Slayer'],0.3,400093,0.92,0dt3XQL7LjTNyizTXY00yD,0.18,0.25,-8.36,1,crypts of eternity,33.0,1985-05-26 00:00:00,0.07,101.92,0.14,13126,,13126,43.43,-76.46,Oswego,New York,Correct,Correct
1,0.33,['Miguel Mateos - Zas'],0.55,290640,0.97,096vJncZZrwi4lLUoggD9y,0.17,0.08,-7.04,0,mi sombra en la pared,45.0,1986-10-02 00:00:00,0.08,174.59,0.79,14223,,14223,42.97,-78.85,Buffalo,New York,Correct,Correct
2,0.0,['Paula Abdul'],0.73,231921,0.78,7xHYQboEmdZXWuXpJf9h30,0.0,0.05,-9.13,0,knocked out,32.0,1988-08-15 00:00:00,0.05,116.23,0.95,84620,,84620,38.91,-111.93,Aurora,Utah,Correct,Incorrect
3,0.74,['Henry Fiol'],0.71,305058,0.63,6M5z2Pca6OuN4l5n5kId3E,0.0,0.09,-10.36,0,zumbale,46.0,1991-04-16 00:00:00,0.04,95.45,0.92,22824,,22824,38.84,-78.63,Edinburg,Virginia,Correct,Correct
4,0.74,['Eric Clapton'],0.6,216800,0.22,5tdEWfBGNX7a7zD78tUwLZ,0.06,0.88,-15.05,0,walkin blues acoustic live at mtv unplugge...,45.0,1992-08-15 00:00:00,0.05,85.33,0.41,27519,,27519,35.81,-78.89,Cary,North Carolina,Correct,Correct


## **(o)** **Create a new variable called "state" that is made up of "city" & "state name"**

In [183]:
spotify['state'] = spotify[['city', 'state_name']].agg(', '.join, axis=1)
print(spotify.shape)
spotify.head(3)

(165634, 26)


Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,t_id,c_instrumentalness,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero,zip,lat,lng,city,state_name,lat_validation,lng_validation,state
0,0.0,['Slayer'],0.3,400093,0.92,0dt3XQL7LjTNyizTXY00yD,0.18,0.25,-8.36,1,crypts of eternity,33.0,1985-05-26 00:00:00,0.07,101.92,0.14,13126,,13126,43.43,-76.46,Oswego,New York,Correct,Correct,"Oswego, New York"
1,0.33,['Miguel Mateos - Zas'],0.55,290640,0.97,096vJncZZrwi4lLUoggD9y,0.17,0.08,-7.04,0,mi sombra en la pared,45.0,1986-10-02 00:00:00,0.08,174.59,0.79,14223,,14223,42.97,-78.85,Buffalo,New York,Correct,Correct,"Buffalo, New York"
2,0.0,['Paula Abdul'],0.73,231921,0.78,7xHYQboEmdZXWuXpJf9h30,0.0,0.05,-9.13,0,knocked out,32.0,1988-08-15 00:00:00,0.05,116.23,0.95,84620,,84620,38.91,-111.93,Aurora,Utah,Correct,Incorrect,"Aurora, Utah"


## **(p)** **The values of the new variable "state", modify them in a certain way that all o them must be in lowercase and without accents**

In [184]:
# Convert 'state' column to lowercase
spotify['state'] = spotify['state'].apply(lambda x:x.lower())
spotify.head(5)

Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,t_id,c_instrumentalness,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero,zip,lat,lng,city,state_name,lat_validation,lng_validation,state
0,0.0,['Slayer'],0.3,400093,0.92,0dt3XQL7LjTNyizTXY00yD,0.18,0.25,-8.36,1,crypts of eternity,33.0,1985-05-26 00:00:00,0.07,101.92,0.14,13126,,13126,43.43,-76.46,Oswego,New York,Correct,Correct,"oswego, new york"
1,0.33,['Miguel Mateos - Zas'],0.55,290640,0.97,096vJncZZrwi4lLUoggD9y,0.17,0.08,-7.04,0,mi sombra en la pared,45.0,1986-10-02 00:00:00,0.08,174.59,0.79,14223,,14223,42.97,-78.85,Buffalo,New York,Correct,Correct,"buffalo, new york"
2,0.0,['Paula Abdul'],0.73,231921,0.78,7xHYQboEmdZXWuXpJf9h30,0.0,0.05,-9.13,0,knocked out,32.0,1988-08-15 00:00:00,0.05,116.23,0.95,84620,,84620,38.91,-111.93,Aurora,Utah,Correct,Incorrect,"aurora, utah"
3,0.74,['Henry Fiol'],0.71,305058,0.63,6M5z2Pca6OuN4l5n5kId3E,0.0,0.09,-10.36,0,zumbale,46.0,1991-04-16 00:00:00,0.04,95.45,0.92,22824,,22824,38.84,-78.63,Edinburg,Virginia,Correct,Correct,"edinburg, virginia"
4,0.74,['Eric Clapton'],0.6,216800,0.22,5tdEWfBGNX7a7zD78tUwLZ,0.06,0.88,-15.05,0,walkin blues acoustic live at mtv unplugge...,45.0,1992-08-15 00:00:00,0.05,85.33,0.41,27519,,27519,35.81,-78.89,Cary,North Carolina,Correct,Correct,"cary, north carolina"


In [185]:
# This function is used to delete 'accents' with the 'unicodedata' library
def delete_accents(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                  if unicodedata.category(c) != 'Mn')

In [186]:
spotify['state'] = spotify['state'].apply(lambda row: delete_accents(row))
spotify.head(5)

Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,t_id,c_instrumentalness,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero,zip,lat,lng,city,state_name,lat_validation,lng_validation,state
0,0.0,['Slayer'],0.3,400093,0.92,0dt3XQL7LjTNyizTXY00yD,0.18,0.25,-8.36,1,crypts of eternity,33.0,1985-05-26 00:00:00,0.07,101.92,0.14,13126,,13126,43.43,-76.46,Oswego,New York,Correct,Correct,"oswego, new york"
1,0.33,['Miguel Mateos - Zas'],0.55,290640,0.97,096vJncZZrwi4lLUoggD9y,0.17,0.08,-7.04,0,mi sombra en la pared,45.0,1986-10-02 00:00:00,0.08,174.59,0.79,14223,,14223,42.97,-78.85,Buffalo,New York,Correct,Correct,"buffalo, new york"
2,0.0,['Paula Abdul'],0.73,231921,0.78,7xHYQboEmdZXWuXpJf9h30,0.0,0.05,-9.13,0,knocked out,32.0,1988-08-15 00:00:00,0.05,116.23,0.95,84620,,84620,38.91,-111.93,Aurora,Utah,Correct,Incorrect,"aurora, utah"
3,0.74,['Henry Fiol'],0.71,305058,0.63,6M5z2Pca6OuN4l5n5kId3E,0.0,0.09,-10.36,0,zumbale,46.0,1991-04-16 00:00:00,0.04,95.45,0.92,22824,,22824,38.84,-78.63,Edinburg,Virginia,Correct,Correct,"edinburg, virginia"
4,0.74,['Eric Clapton'],0.6,216800,0.22,5tdEWfBGNX7a7zD78tUwLZ,0.06,0.88,-15.05,0,walkin blues acoustic live at mtv unplugge...,45.0,1992-08-15 00:00:00,0.05,85.33,0.41,27519,,27519,35.81,-78.89,Cary,North Carolina,Correct,Correct,"cary, north carolina"


In [187]:
spotify['d_release_date'].unique()

array(['1985-05-26 00:00:00', '1986-10-02 00:00:00',
       '1988-08-15 00:00:00', ..., '1928-11-08 00:00:00',
       '1928-10-24 00:00:00', '1930-01-04 00:00:00'], dtype=object)

## **(q)** **Convert the values in the variable "release date" to type datetime, also count those that do not have the necessary structure to be converted into datetime and delete those records**

In [188]:
# (q)-I Verify invalid records in 'release_date' (don't have the correct datetime format)
invalid_date = spotify['d_release_date'].str.contains(r'[a-zA-Z]').sum()
print(f'Total Records With Incorrect Structure: {invalid_date}')
print(f'Total Records: {spotify.shape[0]}')

Total Records With Incorrect Structure: 0
Total Records: 165634


### (q)-II Due to 0 incorrect structure records in 'release_date' --> delete not required

In [189]:
# (q)-III Convert 'release_date' records to datetime type
spotify['d_release_date'] = pd.to_datetime(spotify['d_release_date'], format = '%Y-%m-%d %H:%M:%S')
spotify.head(3)

Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,t_id,c_instrumentalness,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero,zip,lat,lng,city,state_name,lat_validation,lng_validation,state
0,0.0,['Slayer'],0.3,400093,0.92,0dt3XQL7LjTNyizTXY00yD,0.18,0.25,-8.36,1,crypts of eternity,33.0,1985-05-26,0.07,101.92,0.14,13126,,13126,43.43,-76.46,Oswego,New York,Correct,Correct,"oswego, new york"
1,0.33,['Miguel Mateos - Zas'],0.55,290640,0.97,096vJncZZrwi4lLUoggD9y,0.17,0.08,-7.04,0,mi sombra en la pared,45.0,1986-10-02,0.08,174.59,0.79,14223,,14223,42.97,-78.85,Buffalo,New York,Correct,Correct,"buffalo, new york"
2,0.0,['Paula Abdul'],0.73,231921,0.78,7xHYQboEmdZXWuXpJf9h30,0.0,0.05,-9.13,0,knocked out,32.0,1988-08-15,0.05,116.23,0.95,84620,,84620,38.91,-111.93,Aurora,Utah,Correct,Incorrect,"aurora, utah"


In [190]:
spotify.dtypes

c_acousticness               float64
t_artists                     object
c_danceability               float64
c_duration_ms                  int64
c_energy                     float64
t_id                          object
c_instrumentalness           float64
c_liveness                   float64
c_loudness                   float64
v_mode                         int64
t_name                        object
v_popularity                 float64
d_release_date        datetime64[ns]
c_speechiness                float64
c_tempo                      float64
c_valence                    float64
t_zip Code                     int64
v_genero                      object
zip                            int64
lat                          float64
lng                          float64
city                          object
state_name                    object
lat_validation                object
lng_validation                object
state                         object
dtype: object

# 2. **From the treated dataset obtain the following data:** 

## **(a)** **Get a DataFrame showing the record count by gender**

In [191]:
gender_count = spotify['v_genero'].value_counts()
gender_count = pd.DataFrame(gender_count)
gender_count = gender_count.rename(columns = {0:'v_genero','v_genero':'total'})
gender_count

Unnamed: 0,total
47.0,27168
20.0,24872
50.0,20154
12.0,18329
16.0,18067
21.0,17834
47.0,5812
20.0,5398
50.0,4247
21.0,3899


## **(b)** **Create a new variable called "duration_minutos" that is the value of the variable "duration_ms" in minutes**

In [192]:
spotify.head(3)

Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,t_id,c_instrumentalness,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero,zip,lat,lng,city,state_name,lat_validation,lng_validation,state
0,0.0,['Slayer'],0.3,400093,0.92,0dt3XQL7LjTNyizTXY00yD,0.18,0.25,-8.36,1,crypts of eternity,33.0,1985-05-26,0.07,101.92,0.14,13126,,13126,43.43,-76.46,Oswego,New York,Correct,Correct,"oswego, new york"
1,0.33,['Miguel Mateos - Zas'],0.55,290640,0.97,096vJncZZrwi4lLUoggD9y,0.17,0.08,-7.04,0,mi sombra en la pared,45.0,1986-10-02,0.08,174.59,0.79,14223,,14223,42.97,-78.85,Buffalo,New York,Correct,Correct,"buffalo, new york"
2,0.0,['Paula Abdul'],0.73,231921,0.78,7xHYQboEmdZXWuXpJf9h30,0.0,0.05,-9.13,0,knocked out,32.0,1988-08-15,0.05,116.23,0.95,84620,,84620,38.91,-111.93,Aurora,Utah,Correct,Incorrect,"aurora, utah"


In [193]:
spotify['duration_minutos'] = (spotify['c_duration_ms']/60)
print(spotify.shape)
spotify.head(3)

(165634, 27)


Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,t_id,c_instrumentalness,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero,zip,lat,lng,city,state_name,lat_validation,lng_validation,state,duration_minutos
0,0.0,['Slayer'],0.3,400093,0.92,0dt3XQL7LjTNyizTXY00yD,0.18,0.25,-8.36,1,crypts of eternity,33.0,1985-05-26,0.07,101.92,0.14,13126,,13126,43.43,-76.46,Oswego,New York,Correct,Correct,"oswego, new york",6668.22
1,0.33,['Miguel Mateos - Zas'],0.55,290640,0.97,096vJncZZrwi4lLUoggD9y,0.17,0.08,-7.04,0,mi sombra en la pared,45.0,1986-10-02,0.08,174.59,0.79,14223,,14223,42.97,-78.85,Buffalo,New York,Correct,Correct,"buffalo, new york",4844.0
2,0.0,['Paula Abdul'],0.73,231921,0.78,7xHYQboEmdZXWuXpJf9h30,0.0,0.05,-9.13,0,knocked out,32.0,1988-08-15,0.05,116.23,0.95,84620,,84620,38.91,-111.93,Aurora,Utah,Correct,Incorrect,"aurora, utah",3865.35


## **(c)** **What are the 10 most popular songs?**

In [194]:
# Create a 'popularity' variable to identify most popular songs
popularity = spotify.sort_values(by = 'v_popularity', ascending = False)
popularity.reset_index(drop = True, inplace = True)
popularity.head(3)

Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,t_id,c_instrumentalness,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero,zip,lat,lng,city,state_name,lat_validation,lng_validation,state,duration_minutos
0,0.0,['The Weeknd'],0.51,200040,0.73,0VjIjW4GlUZAMYd2vXMi3b,0.0,0.09,-5.93,1,blinding lights,100.0,2020-03-11,0.06,171.0,0.33,78663,12.0,78663,30.42,-98.34,Round Mountain,Texas,Correct,Incorrect,"round mountain, texas",3334.0
1,0.25,"['DaBaby', 'Roddy Ricch']",0.75,181733,0.69,7ytR5pFWmSjzHJIeQkgog4,0.0,0.1,-7.96,1,rockstar feat roddy ricch,99.0,2020-06-20,0.16,89.98,0.5,46404,12.0,46404,41.58,-87.37,Gary,Indiana,Correct,Correct,"gary, indiana",3028.88
2,0.73,"['Powfu', 'beabadoobee']",0.73,173333,0.43,7eJMfftS33KTjuF7lTsMCx,0.0,0.7,-8.77,0,death bed coffee for your head feat beabad...,97.0,2020-04-27,0.14,144.03,0.35,38725,50.0,38725,33.65,-91.03,Benoit,Mississippi,Correct,Incorrect,"benoit, mississippi",2888.88


In [195]:
# 10 most popular songs base in 'v_popularity' value are:
most_popular = popularity[['t_artists','v_popularity']]
most_popular.reset_index(drop = True, inplace = True)
most_popular.head(10)

Unnamed: 0,t_artists,v_popularity
0,['The Weeknd'],100.0
1,"['DaBaby', 'Roddy Ricch']",99.0
2,"['Powfu', 'beabadoobee']",97.0
3,"['THE SCOTTS', 'Travis Scott', 'Kid Cudi']",96.0
4,['Drake'],95.0
5,['Roddy Ricch'],95.0
6,"['BENEE', 'Gus Dapperton']",95.0
7,"['Surf Mesa', 'Emilee']",95.0
8,['Tones And I'],94.0
9,"['Lady Gaga', 'Ariana Grande']",94.0


## **(d)** **What is the average duration in minutes and milliseconds?**

In [196]:
# Get the average duration in 'minutes'
average_minutes = spotify['duration_minutos'].mean()
print(f'Average Duration In Minutes: {average_minutes}')

Average Duration In Minutes: 3856.467707113221


In [197]:
# Get the average duration in 'miliseconds'
average_miliseconds = spotify['c_duration_ms'].mean()
print(f'Average Duration In Miliseconds: {average_miliseconds}')

Average Duration In Miliseconds: 231388.06242679644


## **(e)** **What is the 'average' and 'count' of "energy" by "gender"?**

In [198]:
spotify.head(3)

Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,t_id,c_instrumentalness,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero,zip,lat,lng,city,state_name,lat_validation,lng_validation,state,duration_minutos
0,0.0,['Slayer'],0.3,400093,0.92,0dt3XQL7LjTNyizTXY00yD,0.18,0.25,-8.36,1,crypts of eternity,33.0,1985-05-26,0.07,101.92,0.14,13126,,13126,43.43,-76.46,Oswego,New York,Correct,Correct,"oswego, new york",6668.22
1,0.33,['Miguel Mateos - Zas'],0.55,290640,0.97,096vJncZZrwi4lLUoggD9y,0.17,0.08,-7.04,0,mi sombra en la pared,45.0,1986-10-02,0.08,174.59,0.79,14223,,14223,42.97,-78.85,Buffalo,New York,Correct,Correct,"buffalo, new york",4844.0
2,0.0,['Paula Abdul'],0.73,231921,0.78,7xHYQboEmdZXWuXpJf9h30,0.0,0.05,-9.13,0,knocked out,32.0,1988-08-15,0.05,116.23,0.95,84620,,84620,38.91,-111.93,Aurora,Utah,Correct,Incorrect,"aurora, utah",3865.35


In [199]:
spotify.dtypes

c_acousticness               float64
t_artists                     object
c_danceability               float64
c_duration_ms                  int64
c_energy                     float64
t_id                          object
c_instrumentalness           float64
c_liveness                   float64
c_loudness                   float64
v_mode                         int64
t_name                        object
v_popularity                 float64
d_release_date        datetime64[ns]
c_speechiness                float64
c_tempo                      float64
c_valence                    float64
t_zip Code                     int64
v_genero                      object
zip                            int64
lat                          float64
lng                          float64
city                          object
state_name                    object
lat_validation                object
lng_validation                object
state                         object
duration_minutos             float64
d

In [200]:
spotify['v_genero'] = spotify['v_genero'].astype(str).astype(float)

In [201]:
# This is the average of 'energy' by 'gender'
energyByGenderMean = spotify.groupby('v_genero')['c_energy'].mean()
pd.DataFrame(energyByGenderMean)

Unnamed: 0_level_0,c_energy
v_genero,Unnamed: 1_level_1
1.0,0.49
2.0,0.48
3.0,0.47
12.0,0.49
16.0,0.49
20.0,0.49
21.0,0.49
30.0,0.49
47.0,0.49
48.0,0.49


In [202]:
# This is the count of 'energy' by 'gender'
energyByGenderCount = spotify.groupby('v_genero')['c_energy'].count()
pd.DataFrame(energyByGenderCount)

Unnamed: 0_level_0,c_energy
v_genero,Unnamed: 1_level_1
1.0,170
2.0,351
3.0,532
12.0,22103
16.0,21910
20.0,30270
21.0,21733
30.0,3232
47.0,32980
48.0,3117


## **(f)** **Which song has the lowest "loudness" and which song has the highest one**

In [203]:
# Get song with lowest 'loedness'
    ## As we can see, we have 9 songs with the same 'loudness' (minimum value = -60)
min_loadness_song = spotify[spotify['c_loudness'] == spotify['c_loudness'].min()]
min_loadness_song.reset_index(drop = True, inplace = True)
min_loadness_song

Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,t_id,c_instrumentalness,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero,zip,lat,lng,city,state_name,lat_validation,lng_validation,state,duration_minutos
0,0.0,['Connie Francis'],0.0,179466,0.0,0zr7DJGTPUfAUmjM7crmt2,0.0,0.0,-60.0,0,hava nagilah,12.0,1960-11-03,0.0,0.0,0.0,27344,47.0,27344,35.73,-79.43,Siler City,North Carolina,Correct,Correct,"siler city, north carolina",2991.1
1,0.0,['Sarah Vaughan'],0.0,6467,0.0,3lRVIn6D6EUbvkOgPZAU1H,0.0,0.0,-60.0,0,pause track,0.0,1949-05-24,0.0,0.0,0.0,77619,47.0,77619,29.95,-93.92,Groves,Texas,Correct,Incorrect,"groves, texas",107.78
2,0.0,['Robert Earl Keen'],0.0,60372,0.0,0o12mLSQuXFgsh4e2Kc4e5,0.0,0.0,-60.0,0,silent track,35.0,1998-09-21,0.0,0.0,0.0,93541,20.0,93541,37.98,-119.12,Lee Vining,California,Correct,Incorrect,"lee vining, california",1006.2
3,0.0,['Atlas Fret'],0.0,132827,0.0,7cctPQS83y620UQtMd1ilL,0.0,0.0,-60.0,0,silent track,40.0,2007-08-24,0.0,0.0,0.0,47585,12.0,47585,38.27,-87.15,Stendal,Indiana,Correct,Correct,"stendal, indiana",2213.78
4,0.0,['Connie Francis'],0.0,253719,0.0,5KAJv7Bceihn1frqElloIb,0.0,0.0,-60.0,0,my yiddishe momme,12.0,1960-03-27,0.0,0.0,0.0,6357,47.0,6357,41.33,-72.22,Niantic,Connecticut,Correct,Correct,"niantic, connecticut",4228.65
5,0.0,['Benny Goodman'],0.0,5991,0.0,3IcXTeq9O2dpsSXsDj9naH,0.0,0.0,-60.0,0,pause track live,0.0,1938-01-05,0.0,0.0,0.0,14548,16.0,14548,42.98,-77.24,Shortsville,New York,Correct,Correct,"shortsville, new york",99.85
6,0.0,['Benny Goodman'],0.0,6362,0.0,523qs4UcGlQ6ycdha1VGqs,0.0,0.0,-60.0,0,pause track live,0.0,1938-02-13,0.0,0.0,0.0,7470,20.0,7470,40.95,-74.25,Wayne,New Jersey,Correct,Correct,"wayne, new jersey",106.03
7,0.0,['Future Rapper'],0.0,420000,0.0,0Rd7eiAZGayLT8TmrVpQzG,0.0,0.0,-60.0,0,staggerlee has his day at the beach,0.0,1949-01-10,0.0,0.0,0.0,15208,12.0,15208,40.45,-79.9,Pittsburgh,Pennsylvania,Correct,Correct,"pittsburgh, pennsylvania",7000.0
8,0.0,['Sarah Vaughan'],0.0,5108,0.0,0hr9kRUi2X4MXc72A4VxG4,0.0,0.0,-60.0,0,pause track,0.0,1949-02-24,0.0,0.0,0.0,95823,20.0,95823,38.47,-121.44,Sacramento,California,Correct,Incorrect,"sacramento, california",85.13


In [204]:
# Get song with highest 'loudness'
    ## As we can see, in this case we have just 1 song with maximum 'loudness'(maximum value = 3.85)
max_loadness_song = spotify[spotify['c_loudness'] == spotify['c_loudness'].max()]
max_loadness_song.reset_index(drop = True, inplace = True)
max_loadness_song

Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,t_id,c_instrumentalness,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero,zip,lat,lng,city,state_name,lat_validation,lng_validation,state,duration_minutos
0,0.0,['Apocolothoth'],0.0,152476,0.11,050FXMyCrQJG01AT55Jvk1,1.0,0.58,3.85,0,sold,0.0,1936-10-06,0.0,0.0,0.0,37766,16.0,37766,36.4,-84.09,La Follette,Tennessee,Correct,Correct,"la follette, tennessee",2541.27


In [205]:
# To validate results in the gotten number of songs:
    ## Lets see records based in 'c_loudness' for the lowest value
aux = spotify['c_loudness'].sort_values(ascending = True)
aux.reset_index(drop = True, inplace = True)
aux = pd.DataFrame(aux)
aux.head(10)

Unnamed: 0,c_loudness
0,-60.0
1,-60.0
2,-60.0
3,-60.0
4,-60.0
5,-60.0
6,-60.0
7,-60.0
8,-60.0
9,-55.0


In [206]:
# To validate results in the gotten number of songs above:
    ## Lets see records based in 'c_loudness' for the highest value
aux.tail(3)

Unnamed: 0,c_loudness
165631,2.8
165632,3.74
165633,3.85


## **(g)** **Obtain the 10 percentiles of the continuous variables**

In [207]:
spotify.head(3)

Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,t_id,c_instrumentalness,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero,zip,lat,lng,city,state_name,lat_validation,lng_validation,state,duration_minutos
0,0.0,['Slayer'],0.3,400093,0.92,0dt3XQL7LjTNyizTXY00yD,0.18,0.25,-8.36,1,crypts of eternity,33.0,1985-05-26,0.07,101.92,0.14,13126,,13126,43.43,-76.46,Oswego,New York,Correct,Correct,"oswego, new york",6668.22
1,0.33,['Miguel Mateos - Zas'],0.55,290640,0.97,096vJncZZrwi4lLUoggD9y,0.17,0.08,-7.04,0,mi sombra en la pared,45.0,1986-10-02,0.08,174.59,0.79,14223,,14223,42.97,-78.85,Buffalo,New York,Correct,Correct,"buffalo, new york",4844.0
2,0.0,['Paula Abdul'],0.73,231921,0.78,7xHYQboEmdZXWuXpJf9h30,0.0,0.05,-9.13,0,knocked out,32.0,1988-08-15,0.05,116.23,0.95,84620,,84620,38.91,-111.93,Aurora,Utah,Correct,Incorrect,"aurora, utah",3865.35


In [208]:
# Get 10 percentiles for 'c_acousticness'
spotify['c_acousticness'].describe(percentiles=[.1,.2,.3,.4,.5,.6,.7,.8,.9,1])

count   165,634.00
mean          0.49
std           0.38
min           0.00
10%           0.01
20%           0.05
30%           0.14
40%           0.29
50%           0.48
60%           0.68
70%           0.83
80%           0.93
90%           0.98
100%          1.00
max           1.00
Name: c_acousticness, dtype: float64

In [209]:
# Get 10 percentiles for 'c_danceability'
spotify['c_danceability'].describe(percentiles=[.1,.2,.3,.4,.5,.6,.7,.8,.9,1])

count   165,634.00
mean          0.54
std           0.18
min           0.00
10%           0.30
20%           0.39
30%           0.45
40%           0.50
50%           0.55
60%           0.60
70%           0.64
80%           0.70
90%           0.76
100%          0.99
max           0.99
Name: c_danceability, dtype: float64

In [210]:
# Get 10 percentiles for 'c_duration_ms'
spotify['c_duration_ms'].describe(percentiles=[.1,.2,.3,.4,.5,.6,.7,.8,.9,1])

count     165,634.00
mean      231,388.06
std       120,842.92
min         5,108.00
10%       137,640.00
20%       162,560.00
30%       179,293.00
40%       193,620.20
50%       208,853.50
60%       227,533.00
70%       249,840.00
80%       279,765.20
90%       335,342.80
100%    5,403,500.00
max     5,403,500.00
Name: c_duration_ms, dtype: float64

In [211]:
# Get 10 percentiles for 'c_energy'
spotify['c_energy'].describe(percentiles=[.1,.2,.3,.4,.5,.6,.7,.8,.9,1])

count   165,634.00
mean          0.49
std           0.27
min           0.00
10%           0.13
20%           0.22
30%           0.31
40%           0.40
50%           0.48
60%           0.57
70%           0.67
80%           0.76
90%           0.87
100%          1.00
max           1.00
Name: c_energy, dtype: float64

In [212]:
# Get 10 percentiles for 'c_instrumentalness'
spotify['c_instrumentalness'].describe(percentiles=[.1,.2,.3,.4,.5,.6,.7,.8,.9,1])

count   165,634.00
mean          0.16
std           0.31
min           0.00
10%           0.00
20%           0.00
30%           0.00
40%           0.00
50%           0.00
60%           0.00
70%           0.02
80%           0.28
90%           0.83
100%          1.00
max           1.00
Name: c_instrumentalness, dtype: float64

In [213]:
# Get 10 percentiles for 'c_liveness'
spotify['c_liveness'].describe(percentiles=[.1,.2,.3,.4,.5,.6,.7,.8,.9,1])

count   165,634.00
mean          0.21
std           0.18
min           0.00
10%           0.07
20%           0.09
30%           0.10
40%           0.12
50%           0.14
60%           0.17
70%           0.22
80%           0.30
90%           0.41
100%          1.00
max           1.00
Name: c_liveness, dtype: float64

In [214]:
# Get 10 percentiles for 'c_loudness'
spotify['c_loudness'].describe(percentiles=[.1,.2,.3,.4,.5,.6,.7,.8,.9,1])

count   165,634.00
mean        -11.32
std           5.67
min         -60.00
10%         -18.87
20%         -15.57
30%         -13.45
40%         -11.86
50%         -10.42
60%          -9.07
70%          -7.73
80%          -6.41
90%          -4.99
100%          3.85
max           3.85
Name: c_loudness, dtype: float64

In [215]:
print(spotify.shape)
spotify.head(5)

(165634, 27)


Unnamed: 0,c_acousticness,t_artists,c_danceability,c_duration_ms,c_energy,t_id,c_instrumentalness,c_liveness,c_loudness,v_mode,t_name,v_popularity,d_release_date,c_speechiness,c_tempo,c_valence,t_zip Code,v_genero,zip,lat,lng,city,state_name,lat_validation,lng_validation,state,duration_minutos
0,0.0,['Slayer'],0.3,400093,0.92,0dt3XQL7LjTNyizTXY00yD,0.18,0.25,-8.36,1,crypts of eternity,33.0,1985-05-26,0.07,101.92,0.14,13126,,13126,43.43,-76.46,Oswego,New York,Correct,Correct,"oswego, new york",6668.22
1,0.33,['Miguel Mateos - Zas'],0.55,290640,0.97,096vJncZZrwi4lLUoggD9y,0.17,0.08,-7.04,0,mi sombra en la pared,45.0,1986-10-02,0.08,174.59,0.79,14223,,14223,42.97,-78.85,Buffalo,New York,Correct,Correct,"buffalo, new york",4844.0
2,0.0,['Paula Abdul'],0.73,231921,0.78,7xHYQboEmdZXWuXpJf9h30,0.0,0.05,-9.13,0,knocked out,32.0,1988-08-15,0.05,116.23,0.95,84620,,84620,38.91,-111.93,Aurora,Utah,Correct,Incorrect,"aurora, utah",3865.35
3,0.74,['Henry Fiol'],0.71,305058,0.63,6M5z2Pca6OuN4l5n5kId3E,0.0,0.09,-10.36,0,zumbale,46.0,1991-04-16,0.04,95.45,0.92,22824,,22824,38.84,-78.63,Edinburg,Virginia,Correct,Correct,"edinburg, virginia",5084.3
4,0.74,['Eric Clapton'],0.6,216800,0.22,5tdEWfBGNX7a7zD78tUwLZ,0.06,0.88,-15.05,0,walkin blues acoustic live at mtv unplugge...,45.0,1992-08-15,0.05,85.33,0.41,27519,,27519,35.81,-78.89,Cary,North Carolina,Correct,Correct,"cary, north carolina",3613.33
