# Songs Recommendation System using Collaborative Filtering

## 1. Download the dataset from Kaggle


In [1]:
!pip install opendatasets
import opendatasets as od
od.download("https://www.kaggle.com/rishabjn/bollywoodsongs/download")

Collecting opendatasets
  Downloading opendatasets-0.1.20-py3-none-any.whl (14 kB)
Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
     |████████████████████████████████| 58 kB 10.9 MB/s            
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting python-slugify
  Downloading python_slugify-5.0.2-py2.py3-none-any.whl (6.7 kB)
Collecting text-unidecode>=1.3
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
     |████████████████████████████████| 78 kB 13.9 MB/s             
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73053 sha256=1fe641cb2a82302ce61c80113bb948c2e394db5948aefb045580a64242664939
  Stored in directory: /home/ec2-user/.cache/pip/wheels/77/47/e4/44a4ba1b7dfd53faaa35f59f1175e123b213ff401a8a56876b
Successfully built kaggle
Installing collected packages: text-unidecode, python-slugify, kaggle, opendataset

 mamatapoudel


Your Kaggle Key: 

 ································


Downloading bollywoodsongs.zip to ./bollywoodsongs


100%|██████████| 58.9k/58.9k [00:00<00:00, 34.9MB/s]







## 2. Read data and perform train and test split

In [3]:
import os
import boto3
import sagemaker
import numpy as np
import pandas as pd

In [11]:
# Path of the CSV file inside the downloaded dataset directory.
fpath = './bollywoodsongs/ex.csv'   # data file to read
# Decode the file as Windows-1252 instead of pandas' default UTF-8.
df = pd.read_csv(fpath,encoding='Windows-1252')

In [12]:
import chardet

# Guess the file's character encoding from its first 100,000 bytes.
with open(fpath, mode='rb') as rawdata:
    head_bytes = rawdata.read(100000)

# `detect` returns a dict holding the guessed encoding and its confidence.
result = chardet.detect(head_bytes)
result

{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}

In [13]:
# Preview the first eight rows to inspect the columns and value formats.
df.head(8)

Unnamed: 0,SongID,SongName,Singer,Genre,Movie,UserRating,UserID
0,1,Aankh Marey,"Kumar Sanu, Mika Singh, Neha Kakkar",BollywoodDance,Simmba,6.8/10,1
1,2,Coca Cola,"Neha Kakkar, Tony Kakkar",BollywoodDanceRomantic,Luka Chuppi,9.0/10,1
2,3,Apna Time Aayega,Ranveer Singh,BollywoodDance,Gully Boy,4.7/10,1
3,4,Mungda,"Jyotica Tangri, Shaan, Subhro Ganguly",BollywoodDance,Total Dhamaal,9.1/10,1
4,5,Tere Bin,"Asees Kaur, Rahat Fateh Ali Khan, Tanishk Bagchi",BollywoodRomantic,Simmba,3.2/10,1
5,6,Gali Gali,Neha Kakkar,BollywoodDance,KGF,9.1/10,1
6,7,Chamma Chamma,"Arun, Ikka, Neha Kakkar, Romy",BollywoodDance,Fraud Saiyaan,9.2/10,1
7,8,Mere Gully Mein,Ranveer Singh,BollywoodDance,Gully Boy,9.5/10,1


In [14]:
# Smallest number of distinct songs attached to any single user. A per-user
# hold-out of this size or larger moves every row of at least one user out of
# the training data.
df.groupby('UserID').SongID.nunique().min()  # minimum distinct songs per user

1

In [15]:
# Replace missing values with 0. `DataFrame.replace` returns a new frame and
# leaves its input untouched, so the result must be bound back to `df` for the
# cleaning to reach the cells below.
df = df.replace(np.nan, 0)
df

Unnamed: 0,SongID,SongName,Singer,Genre,Movie,UserRating,UserID
0,1,Aankh Marey,"Kumar Sanu, Mika Singh, Neha Kakkar",BollywoodDance,Simmba,6.8/10,1
1,2,Coca Cola,"Neha Kakkar, Tony Kakkar",BollywoodDanceRomantic,Luka Chuppi,9.0/10,1
2,3,Apna Time Aayega,Ranveer Singh,BollywoodDance,Gully Boy,4.7/10,1
3,4,Mungda,"Jyotica Tangri, Shaan, Subhro Ganguly",BollywoodDance,Total Dhamaal,9.1/10,1
4,5,Tere Bin,"Asees Kaur, Rahat Fateh Ali Khan, Tanishk Bagchi",BollywoodRomantic,Simmba,3.2/10,1
...,...,...,...,...,...,...,...
2415,2416,Jana Tumhare Pyar Mein,Mukesh,BollywoodDance,Sasural,8.5/10,2373
2416,2417,Tum Jaise Bigde Babu Se,Lata Mangeshkar,BollywoodDance,Jab Pyar Kisi Se Hota Hai,8.5/10,2373
2417,2418,O Yaad Nahi Bhool Gaya,"Lata Mangeshkar, Suresh Wadkar",BollywoodDance,Lamhe,8.5/10,2373
2418,2419,Ladi Re Ladi Tujhse Aankh Jo Ladi,Jagjit Kaur,BollywoodDance,Shola Aur Shabnam,8.5/10,2373


In [19]:
def train_test_split(df, holdout_num):
    """Split interactions into train and test sets by holding out rows per user.

    The frame is ordered by ``UserID`` and the first ``holdout_num`` rows of
    every user become the test set; every remaining row goes to the training
    set. A user with ``holdout_num`` rows or fewer therefore appears only in
    the test set. The input frame is not modified.

    @param df: interaction dataframe containing ``UserID`` and ``SongID`` columns
    @param holdout_num: number of rows per user to move into the test set

    @return (df_train, df_test): ``df_train`` keeps the columns of ``df`` with a
        fresh 0..n-1 index; ``df_test`` additionally carries the pre-split row
        labels as an extra leading column.

    @raise AssertionError: if the two parts do not add up to ``len(df)``
    """
    # mergesort is stable: each user's rows keep their incoming order, so the
    # rows picked for the hold-out do not depend on sort tie-breaking.
    df = df.sort_values(['UserID'], kind='mergesort')

    # First `holdout_num` rows of every user form the test set.
    df_test = df.groupby(['UserID']).head(holdout_num).reset_index()

    # Anti-join: flag each (UserID, SongID) pair that went to the test set and
    # keep only the unflagged rows for training.
    held_out = df_test[['UserID', 'SongID']].assign(remove=1)
    df_train = (
        df.merge(held_out, on=['UserID', 'SongID'], how='left')
        .query('remove != 1')
        # `columns=` instead of a positional axis, which pandas 2.0 rejects.
        .drop(columns='remove')
        .reset_index(drop=True)
    )

    # Guard against losing or duplicating rows during the split.
    assert len(df) == len(df_train) + len(df_test)
    return df_train, df_test

In [20]:
# Hold out up to 10 rows per user for testing.
# NOTE(review): the per-user minimum of distinct songs computed earlier in this
# notebook is 1, so with a hold-out of 10 any user who has 10 rows or fewer ends
# up entirely in the test set and never appears in training — confirm this is intended.
df_train, df_test = train_test_split(df, 10)

## 3. Negative sampling

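The dataset only contains songs that users have actually rated, i.e. positive labels. A model trained on positives alone never sees an example of a song a user did not pick, which is why negative sampling is used: each user is paired with randomly drawn songs they have not interacted with, and those pairs are labelled 0.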

In [21]:
def negative_sampling(user_ids, song_ids, items, n_neg, seed=None):
    """Create ``n_neg`` negative (label 0) samples for every positive pair.

    For each distinct ``(user, song)`` pair built from ``user_ids`` and
    ``song_ids``, ``n_neg`` songs are drawn at random from ``items``. A draw is
    repeated until it yields a song that is not among that user's positive
    pairs. The same negative song may be drawn more than once for a user.

    @param user_ids: list of user ids
    @param song_ids: list of song ids, aligned element-wise with ``user_ids``
    @param items: unique list of song ids to sample from
    @param n_neg: number of negative labels to sample per positive pair
    @param seed: optional integer seed for reproducible sampling; when omitted,
        numpy's global random state is used

    @return df_neg: negative sample dataframe with the columns ``UserID``,
        ``SongID`` and ``UserRating`` (always 0)

    @raise ValueError: if negatives are requested for a user whose positive
        pairs already cover every song in ``items``
    """
    records = set(zip(user_ids, song_ids))

    # The `np.random` module and a `RandomState` instance expose the same
    # `choice` call, so the sampling code below is identical for both.
    rng = np.random if seed is None else np.random.RandomState(seed)

    if n_neg > 0:
        # Rejection sampling below would spin forever for a user who already
        # has every candidate song as a positive; fail fast instead.
        candidates = set(items)
        seen_by_user = {}
        for u, i in records:
            seen_by_user.setdefault(u, set()).add(i)
        exhausted = [u for u, seen in seen_by_user.items() if candidates <= seen]
        if exhausted:
            raise ValueError(
                f'cannot sample negatives: {len(exhausted)} user(s) have interacted '
                f'with every item, e.g. {exhausted[:5]}'
            )

    neg = []
    # for every positive label case
    for (u, i) in records:
        # generate n_neg negative labels
        for _ in range(n_neg):
            j = rng.choice(items)
            # resample while the drawn song is a positive for this user
            while (u, j) in records:
                j = rng.choice(items)
            neg.append([u, j, 0])

    # convert to a pandas dataframe for concatenation later
    df_neg = pd.DataFrame(neg, columns=['UserID', 'SongID', 'UserRating'])

    return df_neg

In [22]:
# Create negative samples for the training set: 5 per positive pair, with
# candidate songs taken from every song id present in `df`.
# NOTE(review): only training pairs are excluded from the draw, so a held-out
# test pair can be sampled here as a negative — confirm this is acceptable.
neg_train = negative_sampling(
    user_ids=df_train['UserID'].values,
    song_ids=df_train['SongID'].values,
    items=df['SongID'].unique(),
    n_neg=5,
)

In [23]:
# Report the number of generated negative rows, with a thousands separator.
print(f'created {neg_train.shape[0]:,} negative samples')

created 10,090 negative samples


In [24]:
# Reduce both splits to the (user, song) key and label every observed pair 1.
id_cols = ['UserID', 'SongID']
df_test = df_test[id_cols].assign(UserRating=1)
positives = df_train[id_cols].assign(UserRating=1)

# Training data = observed positives followed by the sampled negatives.
# This rebinds `df_train`, so running the cell a second time would relabel the
# already-appended negatives as 1 and append `neg_train` again.
df_train = pd.concat([positives, neg_train], ignore_index=True)

## 4. Model training

In [25]:
def get_unique_count(df):
    """Return ``(n_users, n_songs)``: the distinct ``UserID`` and ``SongID`` counts in ``df``."""
    n_users = df['UserID'].nunique()
    n_songs = df['SongID'].nunique()
    return n_users, n_songs

In [26]:
# (distinct users, distinct songs) over the full dataset, before the split.
get_unique_count(df) #total count of unique user and song

(42, 2420)

In [27]:
# Print (distinct users, distinct songs) for each split. Despite the "shape"
# wording in the labels, these are distinct-ID counts, not DataFrame shapes.
print('training set shape', get_unique_count(df_train))
print('testing set shape', get_unique_count(df_test))

training set shape (39, 2418)
testing set shape (42, 402)


In [28]:
# Number of distinct users and songs present in the training set.
# NOTE(review): these are counts of distinct ids, not "largest id + 1"; if they
# are later used to size id-indexed tables, the ids would need to be remapped
# to a contiguous range first — verify against the consumer.
n_user, n_item = get_unique_count(df_train)
print("number of unique users", n_user) #number of unique user
print("number of unique items", n_item) # number of unique song

number of unique users 39
number of unique items 2418


In [35]:
# Persist both counts with IPython's %store magic so that another notebook or
# kernel can restore them with `%store -r`.
%store n_user  
%store n_item

Stored 'n_user' (int)
Stored 'n_item' (int)


## 5. Preprocess data and upload them onto S3

In [36]:
# Open a default boto3 session and report the AWS region it is configured for.
session = boto3.session.Session()
region = session.region_name
print(f'currently in {region}') # get current session region

currently in us-east-1


In [37]:
# SageMaker session and the name of its default S3 bucket; the uploads below
# are made through this session.
sagemaker_session = sagemaker.Session()
bucket_name = sagemaker_session.default_bucket()
print(bucket_name)

sagemaker-us-east-1-452957727655


**upload data to the S3 bucket**

In [41]:
# save data locally first
dest = 'bollywoodsongs/s3'
train_path = os.path.join(dest, 'train.npy')
test_path = os.path.join(dest, 'test.npy')
!mkdir {dest}
np.save(train_path, df_train.values)
np.save(test_path, df_test.values)

# upload to S3 bucket for train
sagemaker_session.upload_data(train_path, key_prefix='data')

mkdir: cannot create directory ‘bollywoodsongs/s3’: File exists


's3://sagemaker-us-east-1-452957727655/data/train.npy'

In [42]:
# upload to S3 bucket for test
# Upload the test array under the same `data` key prefix as the training array.
sagemaker_session.upload_data(test_path, key_prefix='data')

's3://sagemaker-us-east-1-452957727655/data/test.npy'