# Utilize a Playlist Summary Vector to Recommend another Playlist

### Written by Rohan A. Rastogi 

####

# #Mothership

## Imports

In [1]:
import datetime

In [2]:
import glob

In [3]:
import numpy

In [4]:
import pandas

In [5]:
import psycopg2

In [6]:
import re as regex

In [7]:
import boto3

In [8]:
import io

#### 

## Settings

In [9]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [10]:
pandas.set_option('display.max_rows', 20)
pandas.set_option('display.max_columns', 20)
pandas.set_option('display.width', 2000)
pandas.set_option('display.max_colwidth', None)
pandas.set_option('display.max_seq_item', 20)
pandas.set_option('display.show_dimensions', True)

#### 

## Functions

#### Timestamp

In [11]:
def timestamp():
    return datetime.datetime.now().strftime("%H:%M:%S")    

#### 

#### Querying Warehouse

In [12]:
def connect_to_Warehouse():
    
    print('\n\nFunction connect_to_warehouse() invoked.')
    print('This function programatically connects you to Warehouse via remotely accessing Amazon Redshift.')
    
    warehouse_username = 'rrastogi'
    
    warehouse_passwords = { 
        'Production' : 'w3KjyVAytp+sIqre' , 
        'Staging' : '3hejPazQDxrwXROk' }
    
    warehouse_hosts = { 
        'Production' : 'redshift-prod.remedypartners.com' , 
        'Staging' : 'redshift-stage.remedypartners.com' }
    
    warehouse_databases = { 
        'Production' : 'rmdyreppreporting' , 
        'Staging' : 'rmdyrepsreporting' }
    
    warehouse_port = '5439'
    
    environment_name = input("\n\nPlease enter 'Production' or 'Staging' to connect to Warehouse's Production or Staging environments respectively.\n\n")
    
    try:
        
        warehouse_host = warehouse_hosts[environment_name]
        warehouse_database = warehouse_databases[environment_name]
        warehouse_password = warehouse_passwords[environment_name]
        
    except KeyError:
        
        print('\n\nFunction connect_to_warehouse() invocation failed to execute succesfully because you have not entered a name for a viable environment within Warehouse.')
        
    else:
        
        warehouse_connection = psycopg2.connect( 
            host = warehouse_host , 
            database = warehouse_database , 
            port = warehouse_port , 
            user = warehouse_username , 
            password = warehouse_password)
        
        warehouse_cursor = warehouse_connection.cursor()
        
        return(warehouse_connection, warehouse_cursor)
        print('\n\n You are succesfully connected to Warehouse.\n\n')
        
    finally:
        print()

####

#### Loading Incremental Files from the Box

In [13]:
def get_files_by_path(path):
    list_of_files = glob.glob(path)
    list_of_files.sort()
    number_of_files = len(list_of_files)
    print('\n\n', number_of_files, "files located at", path, '\n\n')
    
    for file in list_of_files:
        print(file)
        
    return list_of_files

In [14]:
def load_incremental_files(incremental_files, columns_to_load, columns_to_sieve = [], sieve_nets = []): 
    
    incremental_dataframes = [] 
    incremental_file_number = 0
    
    print('\n\nCommencing Data Loading from Incremental Files at', timestamp(),'\n\n')  
    
    for incremental_file in incremental_files:
         
        incremental_file_name = incremental_file.split('/')[my_index(-1)]
        incremental_file_number += 1
        
        (incremental_file_year, incremental_file_month_number, incremental_file_week_number) = ( 
            incremental_file.split('/')[-1].split('_')[-3][my_range(my_index(1), my_index(2), 1)], 
            incremental_file.split('/')[-1].split('_')[-3][my_range(my_index(-2), my_index(-1), 1)], 
            incremental_file.split('/')[-1].split('_')[-2])   
        
        incremental_file_data = pandas.read_table( 
            incremental_file , 
            delimiter = '|' , 
            header = 0 , 
            dtype = str ,
            engine = 'python' , 
            error_bad_lines = False ,
            usecols = columns_to_load)

        incremental_file_dataframe = incremetnal_file_data.loc[incremental_file_data.loc[:, columns_to_sieve].isin(sieve_nets), :].reset_index(drop = True)       
        incremental_file_dataframe['Incremental File Name'] = incremental_file_name
        incremental_file_dataframe['Incremental File (year, month number, week number)'] = '(' + incremental_file_year + ', ' + incremental_file_month_number + ', ' + incremental_file_week_number + ')'
        incremental_file_dataframe_shape = file_dataframe.shape
        incremental_file_timestamp = timestamp()
                
        print('Appending: \t', 'File Name:', file_name, 'File Number:', file_number, '\nDataframe Shape:', file_dataframe_shape, " Timestamp:", file_timestamp, '\n')
        incremental_dataframes.append(file_dataframe)


    print('\n\nCeased Data Loading of Incremental Files at', timestamp(),'\n\n') 

    incremental_files_dataframe = pandas.concat(incremental_dataframes, ignore_index = True)
    
    return incremental_files_dataframe

#### 

#### Priming DataFrames

In [15]:
def memberid_to_two_columns(memberid_value):
    member_id_value = memberid_value[:-2]
    personcode_value = memberid_value[-2:]
    return (member_id_value, personcode_value)

In [16]:
def row_contains_discrepancy(row, column_pairs_to_compare_within):    
    return any([(row[column_left] != row[column_right]) for (column_left, column_right) in column_pairs_to_compare_within])

In [17]:
def shift_columns_to_front(dataframe, columns_names): 
    output_dataframe = dataframe.copy(deep = True)
    
    for column_name in reversed(columns_names):  
        column = output_dataframe.pop(column_name)
        output_dataframe.insert(0, column_name, column)
    
    return output_dataframe

####

### my_

In [18]:
def my_index(index):

    if (index >= 1):    
        return (index - 1)
    
    elif (index == 0):        
        raise IndexError('Index cannot be 0')
    
    elif (index <= -1):
        return index

In [19]:
def my_range(start, stop, step):
    return range(start, (stop + 1), step)

In [20]:
def my_flatten(list_of_lists):
    return [element for list in list_of_lists for element in list]

In [21]:
def my_unique(list):
    unique = []
    for element in list:
        if element not in unique:
            unique.append(element)
    return unique

In [22]:
def my_pop(list):
    if len(list) == 0:
        output = None
    else:
        output = list.pop()
    return output

#### 

# #Captain

In [39]:
sample_playlist_dataframe = pandas.DataFrame({
    'name' : ['Song a', 'Song b', 'Song c', 'Song d', 'Song e'],
    'id' : ['a', 'b', 'c', 'd', 'e']})

In [40]:
sample_playlist_dataframe

Unnamed: 0,name,id
0,Song a,a
1,Song b,b
2,Song c,c
3,Song d,d
4,Song e,e


In [36]:
sample_playlist_features_vector = [0.1, 0.48, 0.89]

In [41]:
sample_playlist_features_vector

[0.1, 0.48, 0.89]

In [42]:
sample_songpool_features_dataframe = pandas.DataFrame({
    'id' : [ 'a', 'b', 'c', 'd', 'e'],
    'mode' : [0, 0, 0, 0, 0.5],
    'key' : [0.5, 0.5, 0, 0.5, 0.5],
    'genre|rock' : [1, 1, 0, 0, 0]
})

In [43]:
sample_songpool_features_dataframe

Unnamed: 0,id,mode,key,genre|rock
0,a,0.0,0.5,1
1,b,0.0,0.5,1
2,c,0.0,0.0,0
3,d,0.0,0.5,0
4,e,0.5,0.5,0


In [48]:
from sklearn.metrics.pairwise import cosine_similarity

In [49]:
def recommend_similar_playlist(playlist_dataframe, playlist_features_vector, songpool_features_dataframe, number = 20): 
    # ensure songpool does not contain the songs present within the input playlist
    
    songpool_features_dataframe = playlist_dataframe[playlist_dataframe['id'].isin(songpool_features_dataframe['id'].values)]
    
    # utilize cosine similarity metric between the playlist and the complete song set
    songpool_features_dataframe['Similarity'] = cosine_similarity(songpool_features_dataframe.drop('id', axis = 1).values, playlist_features_vector.values.reshape(1, -1))[:,0]
    output = songpool_features_dataframe.sort_values('Similarity',ascending = False).head(number)
    
    return output

In [50]:
recommend_similar_playlist(sample_playlist_dataframe, sample_playlist_features_vector, sample_songpool_features_dataframe)

AttributeError: 'list' object has no attribute 'values'