In [None]:
#==============================================================================
#
# IMPORT LIBRARIES
# 
#==============================================================================
import time
import pandas as pd
#pd.options.display.max_columns = None # Shows all columns
# NOTE(review): this alias rebinds the name `str` to the `string` module for the
# rest of the notebook, so the built-in `str` type (e.g. `str(x)`) is no longer
# reachable under that name in later cells.
import string as str
import numpy as np
import matplotlib.pyplot as plt
import dill
import dask
import dask.dataframe as dd
import re
from pprint import pprint

In [None]:
#==============================================================================
# 
# DUMP/LOAD SESSIONS
# 
#==============================================================================
# ---- FULL SESSIONS ----
# Dump
#dill.dump_session('./working.db')

# Load
#dill.load_session('working.db')

# ---- OBJECTS ----
# Dump
#dill.dump(df_raw, open("df_raw.pkl", "wb"))

# Load
#dill_file = open("df_raw.pkl", "rb")
#df_raw = dill.load(dill_file)
#dill_file.close()

In [None]:
#==============================================================================
# 
# READ/EXTRACT RAW DATA
# 
#==============================================================================

file_train = 'data/dataset/yes_complete/train.txt'
print(pd.read_table(file_train, nrows=5, header=None))

In [None]:
with open(file_train) as f:
    train_raw = f.readlines()

In [None]:
playlists_train_raw = train_raw[2:]

In [None]:
# How many unique playlists?
print('There are ', len(playlists_train_raw), ' playlists.', sep='')

In [None]:
# Split and convert to integers
playlists_train_list = []
for i in range(0, len(playlists_train_raw)):
    playlists_train_list.append(list(map(int, playlists_train_raw[i].split())))

In [None]:
# Flatmap the playlists list
df_playlists_train = []
for i in range(0, len(playlists_train_list)):
    for j in range(0, len(playlists_train_list[i])):
        df_playlists_train.append([i, playlists_train_list[i][j]])

In [None]:
df_playlists_train = pd.DataFrame(df_playlists_train, columns=['playlist', 'song_id'])

In [None]:
len(df_playlists_train)

In [None]:
# Create index primary key
df_playlists_train.insert(loc=0, column='index', value=df_playlists_train.index)

In [None]:
# ---- Merge songs on song_id ----
df_song_hash = pd.read_table('data/dataset/yes_complete/song_hash.txt', header=None, names=['song_id','song','band'])

In [None]:
df_playlists_train = df_playlists_train.merge(df_song_hash, on='song_id')
df_playlists_train.sort_values('index', inplace=True)
df_playlists_train.reset_index(drop=True, inplace=True)

In [None]:
df_playlists_train.head()

In [None]:
df_playlists_train.shape

In [None]:
#==============================================================================
#
# EDA
# 
#==============================================================================

In [None]:
# Number of playlist entries per (song_id, song, band), most frequent first.
# .size() counts the rows of each group directly, so the result does not depend on
# which other columns df_playlists_train carries. Counting every column and then
# renaming the columns positionally fails with a length mismatch as soon as the
# frame has an extra column (e.g. when this cell is re-run after 'count' has been
# merged in).
song_count = (
    df_playlists_train
    .groupby(['song_id', 'song', 'band'])
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
    .reset_index(drop=True)
)

In [None]:
%matplotlib qt
plt.figure(figsize=(10,4))
song_count['count'].hist(bins=70)
plt.xlabel('Song count')
plt.ylabel('Quantity')
plt.title('')
plt.rcParams.update({'font.size': 120})

In [None]:
%matplotlib qt
plt.figure(figsize=(10,4))
song_count['count'].hist(bins=10000)
plt.xlabel('Song count')
plt.ylabel('Quantity')
plt.title('')
plt.xlim(0,25)
plt.rcParams.update({'font.size': 10})

In [None]:
%matplotlib qt
plt.figure(figsize=(10,4))
song_count[song_count['count'] > 5]['count'].hist(bins=5000)
plt.xlabel('Song count')
plt.ylabel('Quantity')
plt.title('')
plt.xlim(5,20)
plt.rcParams.update({'font.size': 40})

In [None]:
band_count = pd.DataFrame(df_playlists_train.groupby('band')['playlist'].count().sort_values(ascending=False))
band_count.reset_index(inplace=True)
band_count.columns = ['band','count']

In [None]:
band_count.head()

In [None]:
%matplotlib qt
plt.figure(figsize=(10,4))
band_count['count'].hist(bins=70)
plt.xlabel('Band count')
plt.ylabel('Quantity')
plt.title('')
plt.rcParams.update({'font.size': 40})

In [None]:
%matplotlib qt
plt.figure(figsize=(10,4))
band_count['count'].hist(bins=1000)
plt.xlabel('Band count')
plt.ylabel('Quantity')
plt.title('')
plt.xlim(0,100)
plt.rcParams.update({'font.size': 40})

In [None]:
#==============================================================================
#
# CLEAN DATA
# 
#==============================================================================

In [None]:
df_playlists_train.head()

In [None]:
len(df_playlists_train)

In [None]:
# Add count column
df_playlists_train = pd.merge(df_playlists_train, song_count)
df_playlists_train.sort_values(by='index', inplace=True)
df_playlists_train.reset_index(drop=True, inplace=True)

In [None]:
df_playlists_train.head()

In [None]:
# ---- PARE DATA ----
# Remove songs that show up less than 6 times
df_playlists_train = df_playlists_train[df_playlists_train['count']>=6]

In [None]:
df_playlists_train.head()

In [None]:
df_playlists_train.sort_values('count').head()

In [None]:
df_playlists_train.sort_values('count').tail()

In [None]:
# ---- REMOVE NULLS ----

In [None]:
df_playlists_train.isnull().sum().sum()

In [None]:
df_playlists_train.isna().sum().sum()

In [None]:
# ---- REMOVE "", " ", "-" ----
df_playlists_train[df_playlists_train['song'] == '-'].head()

In [None]:
df_playlists_train.drop(df_playlists_train[df_playlists_train['song']=='-'].index, inplace=True)

In [None]:
df_playlists_train[df_playlists_train['song'] == '-']

In [None]:
df_playlists_train[df_playlists_train['band'] == '-'].head()

In [None]:
df_playlists_train.drop(df_playlists_train[df_playlists_train['band']=='-'].index, inplace=True)

In [None]:
df_playlists_train[df_playlists_train['band']=='-']

In [None]:
df_playlists_train[df_playlists_train['song']=='']

In [None]:
df_playlists_train[df_playlists_train['band']=='']

In [None]:
df_playlists_train[df_playlists_train['song']==' ']

In [None]:
df_playlists_train[df_playlists_train['band']==' ']

In [None]:
# ---- REMOVE ESCAPE CHARACTER '\' ----
df_playlists_train['song'] = [x.replace("\\","") for x in df_playlists_train['song']]
df_playlists_train['band'] = [x.replace("\\","") for x in df_playlists_train['band']]

In [None]:
# REPLACE [] WITH ()
df_playlists_train['song'] = [x.replace("[","(") for x in df_playlists_train['song']]
df_playlists_train['song'] = [x.replace("]",")") for x in df_playlists_train['song']]
df_playlists_train['band'] = [x.replace("[","(") for x in df_playlists_train['band']]
df_playlists_train['band'] = [x.replace("]",")") for x in df_playlists_train['band']]

In [None]:
# LOWER SONG AND BAND
df_playlists_train.head()

In [None]:
df_playlists_train['song'] = [x.lower() for x in df_playlists_train['song']]
df_playlists_train['band'] = [x.lower() for x in df_playlists_train['band']]

In [None]:
df_playlists_train.head()

In [None]:
# ---- CONCAT SONG AND BAND ----
df_playlists_train['song_band'] = df_playlists_train['song'] + " [" + df_playlists_train['band'] + "]"

# Rearrange columns
df_playlists_train = df_playlists_train[['index','playlist','song_id','song','band','song_band','count']]

In [None]:
df_playlists_train.tail()

In [None]:
df_playlists_train['song_id'].nunique()

In [None]:
df_playlists_train['song_band'].nunique()

In [None]:
# Why the difference?
df_playlists_train['song_id'].nunique() - df_playlists_train['song_band'].nunique()

In [None]:
#==============================================================================
#
# BUILD RECOMMENDER
#
#==============================================================================

In [None]:
# ---- CREATE PLAYLIST-SONG MATRIX ----
# time.clock() was removed in Python 3.8; time.perf_counter() is the documented
# replacement for timing a block of code.
start_clock = time.perf_counter()

# Create song_band dummies: one row per playlist entry, one column per "song [band]" label
#playlist_song_mat_train = pd.get_dummies(df_playlists_train['song_band'], sparse=True)
playlist_song_mat_train = pd.get_dummies(df_playlists_train['song_band'])

# Insert index and playlist columns (aligned on the row labels of df_playlists_train)
playlist_song_mat_train.insert(loc=0, column='playlist', value=df_playlists_train['playlist'])
playlist_song_mat_train.insert(loc=0, column='index', value=df_playlists_train['index'])

end_clock = time.perf_counter()

print('\n')
print('Runtime: ', round((end_clock - start_clock)/60, 2), ' min', sep='')

In [None]:
playlist_song_mat_train.info()

In [None]:
# CONVERT TO DASK DATAFRAME 

In [None]:
start_clock = time.clock()
dd_playlist_song_mat_train = dd.from_pandas(data=playlist_song_mat_train, npartitions=1000)
end_clock = time.clock()

print('Runtime: ', round((end_clock - start_clock)/60, 2), ' min')

In [None]:
dd_playlist_song_mat_train.info()

In [None]:
# CLEAR MEMORY 

In [None]:
%who DataFrame

In [None]:
del(band_count, df_song_hash, playlist_song_mat_train, 
    playlists_train_list, playlists_train_raw, song_count, train_raw)

In [None]:
# GROUP BY PLAYLIST 

In [None]:
start_clock = time.clock()
playlist_song_mat_train = dd_playlist_song_mat_train.groupby('playlist').sum().compute()
playlist_song_mat_train.drop(columns='index', inplace=True)
end_clock = time.clock()

print('Runtime: ', round((end_clock - start_clock)/60, 2), ' min')

In [None]:
playlist_song_mat_train.info()

In [None]:
del(dd_playlist_song_mat_train)

In [None]:
playlist_song_mat_train.head()

In [None]:
# ---- CHECKS ----

In [None]:
# Check number of Nelly's #1 songs
len(df_playlists_train[df_playlists_train['song_band']=='#1 [nelly]'])

In [None]:
playlist_song_mat_train['#1 [nelly]'].sum()

In [None]:
# Check length of (arbitrarily chosen) playlist 472 (partially checks for re-indexing)
sum(playlist_song_mat_train[playlist_song_mat_train.columns[playlist_song_mat_train.iloc[472,:] > 0]].iloc[472])

In [None]:
len(df_playlists_train[df_playlists_train['playlist']==472])

In [None]:
# Check number of playlists
df_playlists_train['playlist'].nunique()

In [None]:
len(playlist_song_mat_train)

In [None]:
# Check number of unique songs
df_playlists_train['song_band'].nunique()

In [None]:
playlist_song_mat_train.shape[1]

In [None]:
# Convert every non-zero count to just 1, ie each playlist should count each unique song as appearing only once
# regardless of how often it actually appears.
playlist_song_mat_train = playlist_song_mat_train.astype(bool).astype(int)

In [None]:
playlist_song_mat_train['#1 [nelly]'].sum()

In [1]:
import dill
import pandas as pd

# Save out dataframes
# Dump
#dill.dump(df_playlists_train, open("df_playlists_train.pkl", "wb"))
#dill.dump(playlist_song_mat_train, open("playlist_song_mat_train.pkl", "wb"))

# Load
# Each file is opened in a with-block so its handle is closed as soon as the
# object has been read; passing open(...) straight to dill.load never closes it.
# NOTE: loading a dill/pickle file can execute arbitrary code - only load files
# that were written by this notebook.
with open("df_playlists_train.pkl", "rb") as pkl_file:
    df_playlists_train = dill.load(pkl_file)
with open("playlist_song_mat_train.pkl", "rb") as pkl_file:
    playlist_song_mat_train = dill.load(pkl_file)

In [None]:
#==============================================================================
#
# RUN
#
#==============================================================================

In [2]:
# ---- SONG OR BAND INQUIRY ----
# The answer is lower-cased for the substring search against the column labels and
# stripped of surrounding whitespace, so a stray leading/trailing space cannot
# make the search come back empty.
song_band_inquiry = input('What song or band do you like? ').strip().lower()

What song or band do you like? rancid


In [3]:
# Every "song [band]" column label that contains the inquiry text, numbered from 0.
# Built as a Series directly: pd.DataFrame([])[0] raises KeyError when nothing
# matches, whereas an empty Series is a valid (empty) list of options.
options = pd.Series(
    [col for col in playlist_song_mat_train.columns if song_band_inquiry in col],
    name=0,
    dtype=object,
)

In [4]:
options

0        'ruby soho [rancid]
1        l.a. river [rancid]
2    maxwell murder [rancid]
3        olympia wa [rancid]
4    roots radicals [rancid]
5         time bomb [rancid]
Name: 0, dtype: object

In [5]:
# Ask for one of the row numbers of `options` until a valid one is entered; the
# selected label is stored in song_band_choice.
if len(options) == 0:
    # With nothing to choose from, no number could ever satisfy the loop below.
    raise ValueError('No song or band matched the inquiry; nothing to choose from.')

while True:
    print('\n')
    # A bare `options` expression inside a loop displays nothing; print it so the
    # numbered choices are visible next to the prompt.
    print(options)
    try:
        song_band_number = int(input('What number? '))
    except ValueError:
        # Only non-integer input is retried. A bare `except:` would also swallow
        # KeyboardInterrupt and keep prompting instead of stopping.
        print('\n')
        print('*** Must input an integer. ***')
        continue

    if 0 <= song_band_number < len(options):
        # The numbers shown are the 0-based row positions of the options.
        song_band_choice = options.iloc[song_band_number]
        break

    print('\n')
    print('*** Choose a number from the table. ***')



What number? 4


In [6]:
song_band_choice

'roots radicals [rancid]'

In [7]:
# Disabled earlier approach, kept as a bare string literal: the code inside the
# quotes is not executed, the cell merely evaluates (and echoes) the string. It
# printed the matching labels and asked the user to copy one of them.
'''# Return options
print('Copy the song [band] you are interested in:')
print('\n')
#pprint([col for col in playlist_song_mat_train.columns.str.lower() if song_band_inquiry.lower() in col])
pprint([col for col in playlist_song_mat_train.columns if song_band_inquiry in col])'''

"# Return options\nprint('Copy the song [band] you are interested in:')\nprint('\n')\n#pprint([col for col in playlist_song_mat_train.columns.str.lower() if song_band_inquiry.lower() in col])\npprint([col for col in playlist_song_mat_train.columns if song_band_inquiry in col])"

In [8]:
# Disabled prompt kept as a bare string literal: the cell only evaluates and
# echoes the string, input() is never called and song_band is never assigned.
'''song_band = input('Input the song [band] of interest: ')'''

"song_band = input('Input the song [band] of interest: ')"

In [9]:
print('Top 3 suggested songs: ')
# For every song, count how many of the playlists containing the chosen song also
# contain it, and rank the songs by that count.
co_occurrence = (
    playlist_song_mat_train[playlist_song_mat_train[song_band_choice] == 1]
    .sum()
    .sort_values(ascending=False)
)
# The chosen song is removed by label instead of by skipping the first row: a song
# that appears in all of the same playlists ties with it and may be ranked first,
# in which case skipping row 0 would discard a real suggestion and recommend the
# chosen song itself. Iterating the index also avoids Series.iteritems(), which
# was removed in pandas 2.0.
for x_song_band in co_occurrence.drop(song_band_choice).index[:3]:
    print(' * ', x_song_band)

Top 3 suggested songs: 
 *  longview [green day]
 *  scar tissue [the red hot chili peppers]
 *  the sound (john m. perkins' blues) [switchfoot]


In [14]:
# Full ranking: for each song, the number of playlists it shares with the chosen
# song (the chosen song itself is included).
chosen_playlists = playlist_song_mat_train.loc[playlist_song_mat_train[song_band_choice] == 1]
print(chosen_playlists.sum().sort_values(ascending=False))

roots radicals [rancid]                                                       9
longview [green day]                                                          5
scar tissue [the red hot chili peppers]                                       5
the sound (john m. perkins' blues) [switchfoot]                               5
say it ain't so [weezer]                                                      5
new slang [the shins]                                                         4
help is on the way [rise against]                                             4
dashboard [modest mouse]                                                      4
creep [radiohead]                                                             4
jane says [jane's addiction]                                                  4
possum kingdom [toadies]                                                      4
knights of cydonia [muse]                                                     4
shake me down [cage the elephant]       

In [20]:
# Same per-song totals, unsorted (in column order).
playlist_song_mat_train.loc[playlist_song_mat_train[song_band_choice] == 1].sum()

 2112-  [rush]                                                                          0
 i got your country right here [gretchen wilson]                                        0
 ordinary world [duran duran]                                                           0
 the girls next door [pat metheny group]                                                0
#!*@ you tonight (w/ r. kelly) [the notorious b.i.g.]                                   0
#1 [nelly]                                                                              0
#1 crush [garbage]                                                                      0
#1 thing [amerie]                                                                       0
#9 dream [john lennon]                                                                  0
'03 bonnie & clyde (w/ beyonce) [jay-z]                                                 0
'74-'75 [the connells]                                                                  0
'a' dance 