In [1]:
import pandas as pd
import sqlite3

In [2]:
# Load the tab-separated genre annotation file; column names are supplied
# explicitly, so the first line of the file is read as data.
tid_genre = pd.read_csv(
    'msd_tagtraum_cd1.cls',
    sep='\t',
    names=['tid', 'majority_genre', 'minority_genre'],
)

In [5]:
#Add table to database
_DBName = '../../MillionSongSubset/lastfm_tags.db'
_TableName = u'track_genre_clean'

In [6]:
#Function taken from http://yznotes.com/write-pandas-dataframe-to-sqlite/
def df2sqlite(dataframe, db_name, tbl_name):
    """Write a DataFrame to a SQLite table, replacing the table if it exists.

    The table is dropped and recreated with one untyped column per DataFrame
    column, then every row is inserted. The DataFrame index is not written.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to store.
    db_name : str
        Database path handed to ``sqlite3.connect``.
    tbl_name : str
        Name of the table to (re)create. It is interpolated into the SQL text
        unchanged, so it must be a trusted, valid SQLite table name.

    Raises
    ------
    sqlite3.Error
        If any statement fails. The connection is closed in every case.
    """
    def quote_identifier(name):
        # '%s' formatting accepts non-string column labels (e.g. ints) and, on
        # Python 2, keeps unicode labels as unicode. Embedded double quotes
        # are doubled, which is how SQL escapes them inside a quoted name.
        return '"' + ('%s' % (name,)).replace('"', '""') + '"'

    wildcards = ','.join(['?'] * len(dataframe.columns))
    col_str = ','.join(quote_identifier(col) for col in dataframe.columns)

    # .tolist() converts NumPy scalars into native Python values; sqlite3 does
    # not bind NumPy integer/float scalars as numbers.
    data = [tuple(row) for row in dataframe.values.tolist()]

    conn = sqlite3.connect(db_name)
    try:
        cur = conn.cursor()
        cur.execute("drop table if exists %s" % tbl_name)
        cur.execute("create table %s (%s)" % (tbl_name, col_str))
        cur.executemany("insert into %s values(%s)" % (tbl_name, wildcards), data)
        conn.commit()
    finally:
        # Release the database file even when one of the statements fails.
        conn.close()

In [8]:
# Store the genre DataFrame in the database under the configured table name
df2sqlite(tid_genre, _DBName, _TableName)

In [9]:
# List every table in the database to confirm the new one is present.
# The connection stays open: the query cells that follow reuse `conn`.
conn = sqlite3.connect(_DBName)
cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cursor.fetchall())

[(u'tags',), (u'tids',), (u'tid_tag',), (u'track_year',), (u'track_acoustic',), (u'track_genre_clean',)]


In [12]:
# Distinct majority genres stored in the table, in alphabetical order
query = (
    "SELECT DISTINCT majority_genre "
    "FROM track_genre_clean "
    "ORDER BY majority_genre"
)
df = pd.read_sql_query(sql=query, con=conn)
df

Unnamed: 0,majority_genre
0,Blues
1,Country
2,Electronic
3,Folk
4,International
5,Jazz
6,Latin
7,New Age
8,Pop_Rock
9,Rap


In [16]:
# How many genre-labelled tracks also have a year, plus the earliest and latest year
query = (
    "SELECT COUNT(*), MIN(Year), MAX(Year) "
    "FROM track_genre_clean JOIN track_year USING (tid)"
)
df = pd.read_sql_query(sql=query, con=conn)
df

Unnamed: 0,COUNT(*),MIN(Year),MAX(Year)
0,112321,1922,2010


In [15]:
#Pivot table number of songs Majority-Minority OVERALL
# One row per majority genre; each SUM(CASE ...) column counts the tracks of
# that majority genre that carry the given minority genre.
query_pivot = (
    "SELECT LOWER(majority_genre) AS Genre, "
    "SUM(CASE LOWER(minority_genre) WHEN 'blues' THEN 1 ELSE 0 END) AS Blues, "
    "SUM(CASE LOWER(minority_genre) WHEN 'country' THEN 1 ELSE 0 END) AS Country, "
    "SUM(CASE LOWER(minority_genre) WHEN 'electronic' THEN 1 ELSE 0 END) AS Electronic, "
    "SUM(CASE LOWER(minority_genre) WHEN 'folk' THEN 1 ELSE 0 END) AS Folk, "
    "SUM(CASE LOWER(minority_genre) WHEN 'international' THEN 1 ELSE 0 END) AS International, "
    "SUM(CASE LOWER(minority_genre) WHEN 'jazz' THEN 1 ELSE 0 END) AS Jazz, "
    "SUM(CASE LOWER(minority_genre) WHEN 'latin' THEN 1 ELSE 0 END) AS Latin, "
    "SUM(CASE LOWER(minority_genre) WHEN 'new age' THEN 1 ELSE 0 END) AS NewAge, "
    "SUM(CASE LOWER(minority_genre) WHEN 'pop_rock' THEN 1 ELSE 0 END) AS PopRock, "
    "SUM(CASE LOWER(minority_genre) WHEN 'rap' THEN 1 ELSE 0 END) AS Rap, "
    "SUM(CASE LOWER(minority_genre) WHEN 'reggae' THEN 1 ELSE 0 END) AS Reggae, "
    "SUM(CASE LOWER(minority_genre) WHEN 'rnb' THEN 1 ELSE 0 END) AS RnB, "
    "SUM(CASE LOWER(minority_genre) WHEN 'vocal' THEN 1 ELSE 0 END) AS Vocal "
    "FROM ({0}) "
    "GROUP BY Genre ORDER BY Genre;"
).format('track_genre_clean')
df = pd.read_sql_query(sql=query_pivot, con=conn)
df

Unnamed: 0,Genre,Blues,Country,Electronic,Folk,International,Jazz,Latin,NewAge,PopRock,Rap,Reggae,RnB,Vocal
0,blues,0,16,7,23,6,84,1,1,722,4,5,69,1
1,country,17,0,2,205,3,21,1,0,1017,3,3,5,2
2,electronic,30,3,0,20,141,496,29,140,4890,397,117,51,15
3,folk,52,180,5,0,439,17,6,5,1283,2,2,0,17
4,international,4,6,165,281,0,64,43,44,223,4,29,6,14
5,jazz,48,5,242,6,68,0,114,78,807,27,6,148,874
6,latin,9,6,66,41,128,130,0,4,1014,28,34,6,16
7,new age,3,1,213,43,52,262,12,0,166,2,0,0,2
8,pop_rock,1173,1169,4590,2848,651,983,982,114,0,476,646,950,303
9,rap,7,5,327,1,43,81,37,1,613,0,32,98,0


In [18]:
#Pivot table number of songs Majority-Minority By YEAR
minYear = 1940
maxYear = 2010
pivot_dict_newData = {}

# (lower-cased minority_genre value, output column alias), in output column order
_MinorityColumns = [
    ('blues', 'Blues'),
    ('country', 'Country'),
    ('electronic', 'Electronic'),
    ('folk', 'Folk'),
    ('international', 'International'),
    ('jazz', 'Jazz'),
    ('latin', 'Latin'),
    ('new age', 'NewAge'),
    ('pop_rock', 'PopRock'),
    ('rap', 'Rap'),
    ('reggae', 'Reggae'),
    ('rnb', 'RnB'),
    ('vocal', 'Vocal'),
]

# One SUM(CASE ...) counter per minority genre, generated instead of hand-copied.
_CaseColumns = ", ".join(
    "SUM(CASE LOWER(minority_genre) WHEN '{0}' THEN 1 ELSE 0 END) AS {1}".format(value, alias)
    for value, alias in _MinorityColumns
)

# The SQL text is built once, outside the loop. The year is a bound parameter
# (the `?` placeholder) rather than text spliced into the statement.
query_pivot = (
    "SELECT LOWER(majority_genre) AS Genre, {0} "
    "FROM track_genre_clean JOIN track_year USING (tid) "
    "WHERE Year = ? "
    "GROUP BY Genre ORDER BY Genre;"
).format(_CaseColumns)

# pivot_dict_newData maps each year to its majority-by-minority count table.
for year in range(minYear, maxYear + 1):
    print(year)
    pivot_dict_newData[year] = pd.read_sql_query(query_pivot, conn, params=(year,))

1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010


In [21]:
# Spot-check a single entry: the pivot DataFrame stored under the key 1990
pivot_dict_newData[1990]

Unnamed: 0,Genre,Blues,Country,Electronic,Folk,International,Jazz,Latin,NewAge,PopRock,Rap,Reggae,RnB,Vocal
0,blues,0,0,0,0,0,0,0,0,24,0,0,0,0
1,country,2,0,0,2,0,0,0,0,6,0,0,0,0
2,electronic,0,0,0,0,0,0,0,5,29,2,2,1,0
3,folk,0,1,0,0,4,0,0,0,3,0,0,0,0
4,international,0,0,0,8,0,0,0,0,0,0,0,0,2
5,jazz,0,0,0,0,0,0,0,0,8,0,0,7,17
6,latin,0,0,0,0,0,1,0,0,5,0,0,0,0
7,new age,0,0,2,0,1,4,0,0,0,0,0,0,0
8,pop_rock,29,12,30,15,0,9,9,0,0,5,2,14,6
9,rap,0,1,0,0,0,1,0,0,3,0,0,0,0


In [22]:
#Pickle the pivot dictionary
import pickle

# The context manager closes the file even if pickle.dump raises, so a failed
# dump does not leave an open handle behind.
with open('pivot_dict_newData.dat', 'wb') as output_file:
    pickle.dump(pivot_dict_newData, output_file)