In [0]:
import math
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

%matplotlib inline





In [0]:

song_data_df = (spark.read
  .format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("/FileStore/tables/song_data.csv")
)





In [0]:
# get a row count

print(song_data_df.count())

# print the schema (shape of your df)
print(song_data_df.printSchema())

# get the columns as a list
print(song_data_df.columns)
 
# get the columns and types as tuples in a list
print(song_data_df.dtypes)




1000000
root
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- release: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- year: integer (nullable = true)

None
['song_id', 'title', 'release', 'artist_name', 'year']
[('song_id', 'string'), ('title', 'string'), ('release', 'string'), ('artist_name', 'string'), ('year', 'int')]


In [0]:
print(song_data_df.printSchema())


root
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- release: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- year: integer (nullable = true)

None


In [0]:
song_data_df.show()


+------------------+--------------------+--------------------+--------------------+----+
|           song_id|               title|             release|         artist_name|year|
+------------------+--------------------+--------------------+--------------------+----+
|SOQMMHC12AB0180CB8|        Silent Night|Monster Ballads X...|    Faster Pussy cat|2003|
|SOVFVAK12A8C1350D9|         Tanssi vaan|         Karkuteillä|    Karkkiautomaatti|1995|
|SOGTUKN12AB017F4F1|   No One Could Ever|              Butter|      Hudson Mohawke|2006|
|SOBNYVR12A8C13558C|       Si Vos Querés|             De Culo|         Yerba Brava|2003|
|SOHSBXH12A8C13B0DF|    Tangle Of Aspens|Rene Ablaze Prese...|          Der Mystic|   0|
|SOZVAPQ12A8C13B63C|"Symphony No. 1 G...|Berwald: Symphoni...|    David Montgomery|   0|
|SOQVRHI12A6D4FB2D7|    We Have Got Love|Strictly The Best...|  Sasha / Turbulence|   0|
|SOEYRFT12AB018936C|   2 Da Beat Ch'yall|             Da Bomb|          Kris Kross|1993|
|SOPMIYT12A6D4F851E| 

In [0]:
song_data_df.count()


Out[588]: 1000000

In [0]:
print(song_data_df)
display(song_data_df)


DataFrame[song_id: string, title: string, release: string, artist_name: string, year: int]


song_id,title,release,artist_name,year
SOQMMHC12AB0180CB8,Silent Night,Monster Ballads X-Mas,Faster Pussy cat,2003
SOVFVAK12A8C1350D9,Tanssi vaan,Karkuteillä,Karkkiautomaatti,1995
SOGTUKN12AB017F4F1,No One Could Ever,Butter,Hudson Mohawke,2006
SOBNYVR12A8C13558C,Si Vos Querés,De Culo,Yerba Brava,2003
SOHSBXH12A8C13B0DF,Tangle Of Aspens,Rene Ablaze Presents Winter Sessions,Der Mystic,0
SOZVAPQ12A8C13B63C,"""Symphony No. 1 G minor """"Sinfonie Serieuse""""/Allegro con energia""",Berwald: Symphonies Nos. 1/2/3/4,David Montgomery,0
SOQVRHI12A6D4FB2D7,We Have Got Love,Strictly The Best Vol. 34,Sasha / Turbulence,0
SOEYRFT12AB018936C,2 Da Beat Ch'yall,Da Bomb,Kris Kross,1993
SOPMIYT12A6D4F851E,Goodbye,Danny Boy,Joseph Locke,0
SOJCFMH12A8C13B0C2,Mama_ mama can't you see ?,March to cadence with the US marines,The Sun Harbor's Chorus-Documentary Recordings,0


In [0]:
song_data_df.head()


Out[590]: Row(song_id='SOQMMHC12AB0180CB8', title='Silent Night', release='Monster Ballads X-Mas', artist_name='Faster Pussy cat', year=2003)

In [0]:
triplets_file_df = (spark.read
  .format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("/FileStore/tables/triplets_file.csv")
)

In [0]:
# get a row count

print(triplets_file_df.count())

# print the schema (shape of your df)
print(triplets_file_df.printSchema())

# get the columns as a list
print(triplets_file_df.columns)
 
# get the columns and types as tuples in a list
print(triplets_file_df.dtypes)



2000000
root
 |-- user_id: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- listen_count: integer (nullable = true)

None
['user_id', 'song_id', 'listen_count']
[('user_id', 'string'), ('song_id', 'string'), ('listen_count', 'int')]


In [0]:
print(triplets_file_df)
display(triplets_file_df)


DataFrame[user_id: string, song_id: string, listen_count: int]


user_id,song_id,listen_count
b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1
b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1
b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1
b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODDNQT12A6D4F5F7E,5
b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODXRTY12AB0180F3B,1
b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOFGUAY12AB017B0A8,1
b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOFRQTD12A81C233C0,1
b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOHQWYZ12A6D4FA701,1


In [0]:
song_data_df.show()


+------------------+--------------------+--------------------+--------------------+----+
|           song_id|               title|             release|         artist_name|year|
+------------------+--------------------+--------------------+--------------------+----+
|SOQMMHC12AB0180CB8|        Silent Night|Monster Ballads X...|    Faster Pussy cat|2003|
|SOVFVAK12A8C1350D9|         Tanssi vaan|         Karkuteillä|    Karkkiautomaatti|1995|
|SOGTUKN12AB017F4F1|   No One Could Ever|              Butter|      Hudson Mohawke|2006|
|SOBNYVR12A8C13558C|       Si Vos Querés|             De Culo|         Yerba Brava|2003|
|SOHSBXH12A8C13B0DF|    Tangle Of Aspens|Rene Ablaze Prese...|          Der Mystic|   0|
|SOZVAPQ12A8C13B63C|"Symphony No. 1 G...|Berwald: Symphoni...|    David Montgomery|   0|
|SOQVRHI12A6D4FB2D7|    We Have Got Love|Strictly The Best...|  Sasha / Turbulence|   0|
|SOEYRFT12AB018936C|   2 Da Beat Ch'yall|             Da Bomb|          Kris Kross|1993|
|SOPMIYT12A6D4F851E| 

In [0]:
triplets_file_df.show()


+--------------------+------------------+------------+
|             user_id|           song_id|listen_count|
+--------------------+------------------+------------+
|b80344d063b5ccb32...|SOAKIMP12A8C130995|           1|
|b80344d063b5ccb32...|SOBBMDR12A8C13253B|           2|
|b80344d063b5ccb32...|SOBXHDL12A81C204C0|           1|
|b80344d063b5ccb32...|SOBYHAJ12A6701BF1D|           1|
|b80344d063b5ccb32...|SODACBL12A8C13C273|           1|
|b80344d063b5ccb32...|SODDNQT12A6D4F5F7E|           5|
|b80344d063b5ccb32...|SODXRTY12AB0180F3B|           1|
|b80344d063b5ccb32...|SOFGUAY12AB017B0A8|           1|
|b80344d063b5ccb32...|SOFRQTD12A81C233C0|           1|
|b80344d063b5ccb32...|SOHQWYZ12A6D4FA701|           1|
|b80344d063b5ccb32...|SOIYTOA12A6D4F9A23|           1|
|b80344d063b5ccb32...|SOIZAZL12A6701C53B|           5|
|b80344d063b5ccb32...|SOJNNUA12A8AE48C7A|           1|
|b80344d063b5ccb32...|SOJPFQG12A58A7833A|           1|
|b80344d063b5ccb32...|SOKRIMP12A6D4F5DA3|           5|
|b80344d06

In [0]:
#Convert PySpark Dataframe to Pandas DataFrame

triplets_file_df_pd = triplets_file_df.toPandas()
print(triplets_file_df_pd)

                                          user_id             song_id  \
0        b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAKIMP12A8C130995   
1        b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBBMDR12A8C13253B   
2        b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBXHDL12A81C204C0   
3        b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBYHAJ12A6701BF1D   
4        b80344d063b5ccb3212f76538f3d9e43d87dca9e  SODACBL12A8C13C273   
...                                           ...                 ...   
1999995  d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92  SOJEYPO12AAA8C6B0E   
1999996  d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92  SOJJYDE12AF729FC16   
1999997  d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92  SOJKQSF12A6D4F5EE9   
1999998  d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92  SOJUXGA12AC961885C   
1999999  d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92  SOJYOLS12A8C13C06F   

         listen_count  
0                   1  
1                   2  
2                   1  
3                   1  
4  

In [0]:
#Convert PySpark Dataframe to Pandas DataFrame

song_data_df_pd = song_data_df.toPandas()
print(song_data_df_pd)

                   song_id                                title  \
0       SOQMMHC12AB0180CB8                         Silent Night   
1       SOVFVAK12A8C1350D9                          Tanssi vaan   
2       SOGTUKN12AB017F4F1                    No One Could Ever   
3       SOBNYVR12A8C13558C                        Si Vos Querés   
4       SOHSBXH12A8C13B0DF                     Tangle Of Aspens   
...                    ...                                  ...   
999995  SOTXAME12AB018F136                      O Samba Da Vida   
999996  SOXQYIQ12A8C137FBB                         Jago Chhadeo   
999997  SOHODZI12A8C137BB3                              Novemba   
999998  SOLXGOR12A81C21EB7                              Faraday   
999999  SOWXJXQ12AB0189F43  Fernweh feat. Sektion Kuchikäschtli   

                                     release       artist_name  year  
0                      Monster Ballads X-Mas  Faster Pussy cat  2003  
1                                Karkuteillä  Karkkia

In [0]:

#Find Pandas DataFrame Size, Shape, and Dimensions Properties


print("size is song_data" ,song_data_df_pd.shape)
print("total data count in song_data" ,song_data_df_pd.size)



size is song_data (1000000, 5)
total data count in song_data 5000000


In [0]:

#Find Pandas DataFrame Size, Shape, and Dimensions Properties


print("size is triplets_file" ,triplets_file_df_pd.shape)
print("total data count in triplets_file" ,triplets_file_df_pd.size)



size is triplets_file (2000000, 3)
total data count in triplets_file 6000000


In [0]:
song_data_df_pd

Unnamed: 0,song_id,title,release,artist_name,year
0,SOQMMHC12AB0180CB8,Silent Night,Monster Ballads X-Mas,Faster Pussy cat,2003
1,SOVFVAK12A8C1350D9,Tanssi vaan,Karkuteillä,Karkkiautomaatti,1995
2,SOGTUKN12AB017F4F1,No One Could Ever,Butter,Hudson Mohawke,2006
3,SOBNYVR12A8C13558C,Si Vos Querés,De Culo,Yerba Brava,2003
4,SOHSBXH12A8C13B0DF,Tangle Of Aspens,Rene Ablaze Presents Winter Sessions,Der Mystic,0
...,...,...,...,...,...
999995,SOTXAME12AB018F136,O Samba Da Vida,Pacha V.I.P.,Kiko Navarro,0
999996,SOXQYIQ12A8C137FBB,Jago Chhadeo,Naale Baba Lassi Pee Gya,Kuldeep Manak,0
999997,SOHODZI12A8C137BB3,Novemba,Dub_Connected: electronic music,Gabriel Le Mar,0
999998,SOLXGOR12A81C21EB7,Faraday,The Trance Collection Vol. 2,Elude,0


In [0]:
triplets_file_df_pd

Unnamed: 0,user_id,song_id,listen_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1
...,...,...,...
1999995,d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92,SOJEYPO12AAA8C6B0E,2
1999996,d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92,SOJJYDE12AF729FC16,4
1999997,d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92,SOJKQSF12A6D4F5EE9,3
1999998,d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92,SOJUXGA12AC961885C,1


In [0]:
##Size of the two datasets for a reference
print(len(triplets_file_df_pd),len(song_data_df_pd))

2000000 1000000


In [0]:
####Joining Data Frame


join_song_df = pd.merge(triplets_file_df_pd, song_data_df_pd.drop_duplicates(['song_id']), on='song_id', how='left')
join_song_df.head()


Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Thicker Than Water,Jack Johnson,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,In Between Dreams,Jack Johnson,2005
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999


In [0]:
print(len(join_song_df))

2000000


In [0]:
join_song_df

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Thicker Than Water,Jack Johnson,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,In Between Dreams,Jack Johnson,2005
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999
...,...,...,...,...,...,...,...
1999995,d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92,SOJEYPO12AAA8C6B0E,2,Ignorance (Album Version),Ignorance,Paramore,0
1999996,d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92,SOJJYDE12AF729FC16,4,Two Is Better Than One,Love Drunk,Boys Like Girls featuring Taylor Swift,2009
1999997,d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92,SOJKQSF12A6D4F5EE9,3,What I've Done (Album Version),What I've Done,Linkin Park,2007
1999998,d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92,SOJUXGA12AC961885C,1,Up,My Worlds,Justin Bieber,2010


In [0]:
#Standardising the missing data values
#Sometimes missing data is coded as ‘NO DATA’, ‘0’, ‘N/A’ or just an empty string. For ease of cleaning, convert all these into np.nan.

clean_song_df = join_song_df.replace([ 0, ''],np.nan)


In [0]:

clean_song_df

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1.0,The Cove,Thicker Than Water,Jack Johnson,
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2.0,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976.0
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1.0,Stronger,Graduation,Kanye West,2007.0
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1.0,Constellations,In Between Dreams,Jack Johnson,2005.0
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1.0,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999.0
...,...,...,...,...,...,...,...
1999995,d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92,SOJEYPO12AAA8C6B0E,2.0,Ignorance (Album Version),Ignorance,Paramore,
1999996,d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92,SOJJYDE12AF729FC16,4.0,Two Is Better Than One,Love Drunk,Boys Like Girls featuring Taylor Swift,2009.0
1999997,d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92,SOJKQSF12A6D4F5EE9,3.0,What I've Done (Album Version),What I've Done,Linkin Park,2007.0
1999998,d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92,SOJUXGA12AC961885C,1.0,Up,My Worlds,Justin Bieber,2010.0


In [0]:
#to check the datatypes
clean_song_df.dtypes


Out[608]: user_id          object
song_id          object
listen_count    float64
title            object
release          object
artist_name      object
year            float64
dtype: object

In [0]:
len(join_song_df)


Out[609]: 2000000

In [0]:
# Preprocessing the data

join_song_df['song'] = join_song_df['title']+' - '+join_song_df['artist_name']
join_song_df.head()

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year,song
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Thicker Than Water,Jack Johnson,0,The Cove - Jack Johnson
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976,Entre Dos Aguas - Paco De Lucia
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007,Stronger - Kanye West
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,In Between Dreams,Jack Johnson,2005,Constellations - Jack Johnson
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999,Learn To Fly - Foo Fighters


In [0]:
# Taking a small sample for refrence
join_song_df = join_song_df.head(5000)
join_song_df.head(10000)

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year,song
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Thicker Than Water,Jack Johnson,0,The Cove - Jack Johnson
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976,Entre Dos Aguas - Paco De Lucia
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007,Stronger - Kanye West
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,In Between Dreams,Jack Johnson,2005,Constellations - Jack Johnson
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999,Learn To Fly - Foo Fighters
...,...,...,...,...,...,...,...,...
4995,e21477efb83bd323205ce6f5bd662f3df9d477e5,SORKVID12A6D4FC6D2,2,Out There On The Ice,In Ghost Colours,Cut Copy,2008,Out There On The Ice - Cut Copy
4996,e21477efb83bd323205ce6f5bd662f3df9d477e5,SOSAUVD12A67ADE6AE,1,I Know It's Over,The Queen Is Dead,The Smiths,1986,I Know It's Over - The Smiths
4997,e21477efb83bd323205ce6f5bd662f3df9d477e5,SOSDNSV12AB0181074,1,Human,Clubland Summer 2009,The Killers,2008,Human - The Killers
4998,e21477efb83bd323205ce6f5bd662f3df9d477e5,SOSMQOO12A8C13BAC0,3,Everyday I Love You Less and Less,Employment,Kaiser Chiefs,2005,Everyday I Love You Less and Less - Kaiser Chiefs


In [0]:
grouped_sum = song_grouped['listen_count'].sum()
song_grouped['percentage'] = (song_grouped['listen_count'] / grouped_sum ) * 100
song_grouped.sort_values(['listen_count', 'song'], ascending=[0,1])

Unnamed: 0,song,listen_count,percentage
2315,Sehr kosmisch - Harmonia,26,0.52
2946,Undo - Björk,16,0.32
3206,You're The One - Dwight Yoakam,16,0.32
672,Dog Days Are Over (Radio Edit) - Florence + Th...,14,0.28
2967,Use Somebody - Kings Of Leon,14,0.28
...,...,...,...
3222,Zebra - Beach House,1,0.02
3223,Zopf: Pigtail - Penguin Café Orchestra,1,0.02
3226,high fives - Four Tet,1,0.02
3227,ný Batterý - Sigur Ros,1,0.02


In [0]:
#Class initialisation based on Popularity Recommendation 
class pop_rec_py():
    def __init__(self):
        self.item_id = None
        self.trained_data = None
        self.pop_rec = None
        self.user_id = None
    #Creating the popularity based recommender model
    def create(self, trained_data, user_id, item_id):
        self.trained_data = trained_data
        self.item_id = item_id
        self.user_id = user_id
        #Get a count of userids for each distinct song as recommendation score
        trained_data_group = trained_data.groupby([self.item_id]).agg({self.user_id: 'count'}).reset_index()
        trained_data_group.rename(columns = {'user_id': 'score'},inplace=True)
    
        #Sorting the tracks based upon suggestion score
        trained_data_sorting = trained_data_group.sort_values(['score', self.item_id], ascending = [0,1])
    
        #Generate a suggetion  based upon score
        trained_data_sorting['Rank'] = trained_data_sorting['score'].rank(ascending=0, method='first')
        
        
        self.pop_rec = trained_data_sorting.head(10)  #Get the top 10 recommendations
        
    #Use the popularity based recommender system model for makeing suggestions
    
    def rec(self, user_id):    
        user_rec = self.pop_rec
        
       
        user_rec['user_id'] = user_id
    
        
        cols = user_rec.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        user_rec = user_rec[cols]
        
        return user_rec

#Class initialisation for similarity  suggestion model
class item_sim_rec_py():
    def __init__(self):
        self.trained_data = None
        self.user_id = None
        self.item_id = None
        self.cooccur_mat = None
        self.songs_dict = None
        self.rev_songs_dict = None
        self.item_sim_rec = None
        
    #Getting unique songs corresponding to a given user
    def get_user_items(self, user):
        user_data = self.trained_data[self.trained_data[self.user_id] == user]
        user_items = list(user_data[self.item_id].unique())
        
        return user_items
        
    
    def get_item_users(self, item):  #Get unique users for a given  song
        item_data = self.trained_data[self.trained_data[self.item_id] == item]
        item_users = set(item_data[self.user_id].unique())
            
        return item_users
        
    
    def get_all_items_trained_data(self):  #Get unique songs in the training data
        all_items = list(self.trained_data[self.item_id].unique())
            
        return all_items
        
    
    def construct_cooccur_mat(self, usersongs, all_songs):   #Constructing cooccurence matrix
            
        
        #Get listeners for all songs in dataset.
        
        user_songs_users = []        
        for i in range(0, len(usersongs)):
            user_songs_users.append(self.get_item_users(usersongs[i]))
            
       
        #Initialize the item cooccurence matrix of size 
        
        cooccur_mat = np.matrix(np.zeros(shape=(len(usersongs), len(all_songs))), float)
           
        
        #Calculating the similarity between user songs and personalised songs in the training data
        
    
        for i in range(0,len(all_songs)):
            #Calculating unique listeners  of songs
            songs_i_data = self.trained_data[self.trained_data[self.item_id] == all_songs[i]]
            users_i = set(songs_i_data[self.user_id].unique())
            
            for j in range(0,len(usersongs)):       
                    
                
                users_j = user_songs_users[j]   #Getting unique listeners  of song 
                    
                #Calculating intersection of listeners of songs i and j
                users_intersect = users_i.intersection(users_j)
                
                #Calculate cooccurence_matrix[i,j] as Jaccard Index
                if len(users_intersect) != 0:
                    
                    users_union = users_i.union(users_j)  #generating  union of listeners of songs i and j
                    
                    cooccur_mat[j,i] = float(len(users_intersect))/float(len(users_union))
                else:
                    cooccur_mat[j,i] = 0
                    
        
        return cooccur_mat

    
    #Use the cooccurence matrix for making top recommendations
    def generate_top_rec(self, user, cooccur_mat, all_songs, usersongs):
        print("Non zero values in cooccurence_matrix :%d" % np.count_nonzero(cooccur_mat))
        
        #Calculating a weighted average of the scores in cooccurence matrix for all user songs.
        user_sim_scores = cooccur_mat.sum(axis=0)/float(cooccur_mat.shape[0])
        user_sim_scores = np.array(user_sim_scores)[0].tolist()
 
        #Sort the indices of user_sim_scores based upon their value and maintaining the corresponding score
    
        sort_index = sorted(((e,i) for i,e in enumerate(list(user_sim_scores))), reverse=True)
    
        #Creating  a new dataframe
        columns = ['user_id', 'song', 'score', 'rank']
       
        df = pd.DataFrame(columns=columns)
         
        #Filling the dataframe with top 15 item based recommendations
        rank = 1 
        for i in range(0,len(sort_index)):
            if ~np.isnan(sort_index[i][0]) and all_songs[sort_index[i][1]] not in usersongs and rank <= 15:
                df.loc[len(df)]=[user,all_songs[sort_index[i][1]],sort_index[i][0],rank]
                rank = rank+1
        
        #if no recommendations
        if df.shape[0] == 0:
            print("The current user has no songs for training the item similarity based recommendation model.")
            return -1
        else:
            return df
 
    #Creating the item similarity based recommender  model
    def create(self, trained_data, user_id, item_id):
        self.trained_data = trained_data
        self.user_id = user_id
        self.item_id = item_id

    
    def rec(self, user):
        
        
        # Get all unique songs for this user
        
        usersongs = self.get_user_items(user)    
            
        print("No. of unique songs for the user: %d" % len(usersongs))
        
        
         #Get all unique items (songs) in the training data
        
        all_songs = self.get_all_items_trained_data()
        
        print("no. of unique songs in the training set: %d" % len(all_songs))
         
        
        # Construct item cooccurence matrix of size 
      
        cooccur_mat = self.construct_cooccur_mat(usersongs, all_songs)
        
        
        # Use the cooccurence matrix to make recommendations
        
        df_rec = self.generate_top_rec(user, cooccur_mat, all_songs, usersongs)
                
        return df_rec
    
    #similar items to given items
    def get_similar_items(self, item_list):
        
        usersongs = item_list
        
        
        # Get all unique songs in the training data
        
        all_songs = self.get_all_items_trained_data()
        
        print("no. of unique songs in the training set: %d" % len(all_songs))
         
        
        
        cooccur_mat = self.construct_cooccur_mat(usersongs, all_songs)        # Construct item cooccurence matrix of size 

        
        
        # Use the cooccurence matrix to make recommendations
        
        user = ""
        df_rec = self.generate_top_rec(user, cooccur_mat, all_songs, usersongs)
         
        return df_rec
    
    
    



In [0]:
pr = pop_rec_py()


In [0]:
pr.create(join_song_df, 'user_id', 'song')


In [0]:
# display the top 10 popular songs
pr.rec(join_song_df['user_id'][1250])


Unnamed: 0,user_id,song,score,Rank
2315,a58de017cbeda1763ea002fe027ed41b4ed53109,Sehr kosmisch - Harmonia,26,1.0
2946,a58de017cbeda1763ea002fe027ed41b4ed53109,Undo - Björk,16,2.0
3206,a58de017cbeda1763ea002fe027ed41b4ed53109,You're The One - Dwight Yoakam,16,3.0
672,a58de017cbeda1763ea002fe027ed41b4ed53109,Dog Days Are Over (Radio Edit) - Florence + Th...,14,4.0
2967,a58de017cbeda1763ea002fe027ed41b4ed53109,Use Somebody - Kings Of Leon,14,5.0
2198,a58de017cbeda1763ea002fe027ed41b4ed53109,Revelry - Kings Of Leon,13,6.0
2312,a58de017cbeda1763ea002fe027ed41b4ed53109,Secrets - OneRepublic,13,7.0
1162,a58de017cbeda1763ea002fe027ed41b4ed53109,Horn Concerto No. 4 in E flat K495: II. Romanc...,12,8.0
2763,a58de017cbeda1763ea002fe027ed41b4ed53109,The Scientist - Coldplay,12,9.0
1681,a58de017cbeda1763ea002fe027ed41b4ed53109,Marry Me - Train,11,10.0


In [0]:
pr.rec(join_song_df['user_id'][1700])


Unnamed: 0,user_id,song,score,Rank
2315,73d0a0c725c9b2c541635672bb0572bfcb7eb2b4,Sehr kosmisch - Harmonia,26,1.0
2946,73d0a0c725c9b2c541635672bb0572bfcb7eb2b4,Undo - Björk,16,2.0
3206,73d0a0c725c9b2c541635672bb0572bfcb7eb2b4,You're The One - Dwight Yoakam,16,3.0
672,73d0a0c725c9b2c541635672bb0572bfcb7eb2b4,Dog Days Are Over (Radio Edit) - Florence + Th...,14,4.0
2967,73d0a0c725c9b2c541635672bb0572bfcb7eb2b4,Use Somebody - Kings Of Leon,14,5.0
2198,73d0a0c725c9b2c541635672bb0572bfcb7eb2b4,Revelry - Kings Of Leon,13,6.0
2312,73d0a0c725c9b2c541635672bb0572bfcb7eb2b4,Secrets - OneRepublic,13,7.0
1162,73d0a0c725c9b2c541635672bb0572bfcb7eb2b4,Horn Concerto No. 4 in E flat K495: II. Romanc...,12,8.0
2763,73d0a0c725c9b2c541635672bb0572bfcb7eb2b4,The Scientist - Coldplay,12,9.0
1681,73d0a0c725c9b2c541635672bb0572bfcb7eb2b4,Marry Me - Train,11,10.0


In [0]:
pr.rec(join_song_df['user_id'][1025])


Unnamed: 0,user_id,song,score,Rank
2315,0afaa5d9d04bf85af720fe8cc566a41ca3e41c97,Sehr kosmisch - Harmonia,26,1.0
2946,0afaa5d9d04bf85af720fe8cc566a41ca3e41c97,Undo - Björk,16,2.0
3206,0afaa5d9d04bf85af720fe8cc566a41ca3e41c97,You're The One - Dwight Yoakam,16,3.0
672,0afaa5d9d04bf85af720fe8cc566a41ca3e41c97,Dog Days Are Over (Radio Edit) - Florence + Th...,14,4.0
2967,0afaa5d9d04bf85af720fe8cc566a41ca3e41c97,Use Somebody - Kings Of Leon,14,5.0
2198,0afaa5d9d04bf85af720fe8cc566a41ca3e41c97,Revelry - Kings Of Leon,13,6.0
2312,0afaa5d9d04bf85af720fe8cc566a41ca3e41c97,Secrets - OneRepublic,13,7.0
1162,0afaa5d9d04bf85af720fe8cc566a41ca3e41c97,Horn Concerto No. 4 in E flat K495: II. Romanc...,12,8.0
2763,0afaa5d9d04bf85af720fe8cc566a41ca3e41c97,The Scientist - Coldplay,12,9.0
1681,0afaa5d9d04bf85af720fe8cc566a41ca3e41c97,Marry Me - Train,11,10.0


In [0]:
#Item Similarity Recommendation

ir = item_sim_rec_py()
ir.create(join_song_df, 'user_id', 'song')

In [0]:
user_items = ir.get_user_items(join_song_df['user_id'][5])


In [0]:
# display user songs history
for user_item in user_items:
    print(user_item)

The Cove - Jack Johnson
Entre Dos Aguas - Paco De Lucia
Stronger - Kanye West
Constellations - Jack Johnson
Learn To Fly - Foo Fighters
Apuesta Por El Rock 'N' Roll - Héroes del Silencio
Paper Gangsta - Lady GaGa
Stacked Actors - Foo Fighters
Sehr kosmisch - Harmonia
Heaven's gonna burn your eyes - Thievery Corporation feat. Emiliana Torrini
Let It Be Sung - Jack Johnson / Matt Costa / Zach Gill / Dan Lebowitz / Steve Adams
I'll Be Missing You (Featuring Faith Evans & 112)(Album Version) - Puff Daddy
Love Shack - The B-52's
Clarity - John Mayer
I?'m A Steady Rollin? Man - Robert Johnson
The Old Saloon - The Lonely Island
Behind The Sea [Live In Chicago] - Panic At The Disco
Champion - Kanye West
Breakout - Foo Fighters
Ragged Wood - Fleet Foxes
Mykonos - Fleet Foxes
Country Road - Jack Johnson / Paula Fuga
Oh No - Andrew Bird
Love Song For No One - John Mayer
Jewels And Gold - Angus & Julia Stone
83 - John Mayer
Neon - John Mayer
The Middle - Jimmy Eat World
High and dry - Jorge Drexle

In [0]:
# give song recommendation for user
ir.rec(join_song_df['user_id'][130])


No. of unique songs for the user: 65
no. of unique songs in the training set: 3233
Non zero values in cooccurence_matrix :14971


Unnamed: 0,user_id,song,score,rank
0,b64cdd1a0bd907e5e00b39e345194768e330d652,Watch The Tapes - LCD Soundsystem,0.130085,1
1,b64cdd1a0bd907e5e00b39e345194768e330d652,Television Rules The Nation / Crescendolls - D...,0.130085,2
2,b64cdd1a0bd907e5e00b39e345194768e330d652,Don´t Believe The Hype - Boys Noize,0.130085,3
3,b64cdd1a0bd907e5e00b39e345194768e330d652,We Are Your Friends (Reprise) (Album Version) ...,0.130085,4
4,b64cdd1a0bd907e5e00b39e345194768e330d652,Work On You - MSTRKRFT,0.130085,5
5,b64cdd1a0bd907e5e00b39e345194768e330d652,Testament (feat. Gonja Sufi) - Flying Lotus,0.130085,6
6,b64cdd1a0bd907e5e00b39e345194768e330d652,Bruise - Octopus Project,0.130085,7
7,b64cdd1a0bd907e5e00b39e345194768e330d652,I GOT THIS DOWN - Simian Mobile Disco,0.130085,8
8,b64cdd1a0bd907e5e00b39e345194768e330d652,Tone Bank Jungle - Holy Fuck,0.130085,9
9,b64cdd1a0bd907e5e00b39e345194768e330d652,Jupiter Approach - Digitalism,0.130085,10


In [0]:
# give song recommendation for that user
ir.rec(join_song_df['user_id'][1240])


No. of unique songs for the user: 153
no. of unique songs in the training set: 3233
Non zero values in cooccurence_matrix :45341


Unnamed: 0,user_id,song,score,rank
0,a58de017cbeda1763ea002fe027ed41b4ed53109,Drops Of Jupiter - Train,0.087973,1
1,a58de017cbeda1763ea002fe027ed41b4ed53109,Eye Of The Tiger - Survivor,0.063737,2
2,a58de017cbeda1763ea002fe027ed41b4ed53109,Catch You Baby (Steve Pitron & Max Sanna Radio...,0.061397,3
3,a58de017cbeda1763ea002fe027ed41b4ed53109,Learn To Fly - Foo Fighters,0.060154,4
4,a58de017cbeda1763ea002fe027ed41b4ed53109,Oxford Comma (Album) - Vampire Weekend,0.059527,5
5,a58de017cbeda1763ea002fe027ed41b4ed53109,Tim McGraw - Taylor Swift,0.058425,6
6,a58de017cbeda1763ea002fe027ed41b4ed53109,The Climb - Miley Cyrus,0.058196,7
7,a58de017cbeda1763ea002fe027ed41b4ed53109,The Trouble With Love Is - Kelly Clarkson,0.055317,8
8,a58de017cbeda1763ea002fe027ed41b4ed53109,Dream On - Aerosmith,0.054378,9
9,a58de017cbeda1763ea002fe027ed41b4ed53109,Johnny_ I Hardly Knew Ya - Dropkick Murphys,0.054378,10


In [0]:
1# give related songs based on the words
ir.get_similar_items(['Oliver James - Fleet Foxes', 'The End - Pearl Jam'])

no. of unique songs in the training set: 3233
Non zero values in cooccurence_matrix :58


Unnamed: 0,user_id,song,score,rank
0,,Quiet Houses - Fleet Foxes,1.0,1
1,,St. Elsewhere - Dave Grusin,1.0,2
2,,Misled - Céline Dion,1.0,3
3,,Your Protector - Fleet Foxes,1.0,4
4,,Oil And Water - Incubus,1.0,5
5,,Tiger Mountain Peasant Song - Fleet Foxes,1.0,6
6,,Meadowlarks - Fleet Foxes,1.0,7
7,,Sun It Rises - Fleet Foxes,1.0,8
8,,Id Die Without You - P.M. Dawn,1.0,9
9,,Meet Virginia - Train,1.0,10


In [0]:
1# give related songs based on the words
ir.get_similar_items(['Your Protector - Fleet Foxes', 'Misled - Céline Dion	'])

no. of unique songs in the training set: 3233
Non zero values in cooccurence_matrix :29


Unnamed: 0,user_id,song,score,rank
0,,Oliver James - Fleet Foxes,0.5,1
1,,Quiet Houses - Fleet Foxes,0.5,2
2,,The End - Pearl Jam,0.5,3
3,,St. Elsewhere - Dave Grusin,0.5,4
4,,Misled - Céline Dion,0.5,5
5,,Oil And Water - Incubus,0.5,6
6,,Tiger Mountain Peasant Song - Fleet Foxes,0.5,7
7,,Meadowlarks - Fleet Foxes,0.5,8
8,,Sun It Rises - Fleet Foxes,0.5,9
9,,Id Die Without You - P.M. Dawn,0.5,10
