In [1]:
import pandas as pd
df = pd.read_csv("spotify_millsongdata.csv")
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [2]:
df.tail(5)

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [3]:
df.shape

(57650, 4)

In [4]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [5]:
df =df.sample(5000).drop('link', axis=1).reset_index(drop=True)
df.head(10)

Unnamed: 0,artist,song,text
0,Lionel Richie,Good Morning,"The sun creeps in, \r\nShe says I love you wi..."
1,Frank Zappa,"Dog Breath, In The Year Of The Plague",[includes music from the World's Greatest Sinn...
2,Leo Sayer,I Think We Fell In Love Too Fast,We were just two kids in high school \r\nWho'...
3,Hollies,Everything Is Sunshine,I keep on thinkin about the things that make m...
4,Etta James,It's Too Soon To Know,Does he love me? \r\nIt's too soon to know \...
5,Korn,Haze,Walking alone inside my world \r\nThinking I'...
6,Train,"Save Me, San Francisco",I used to love the tenderloin \r\nUntil I mad...
7,Spandau Ballet,Virgin,"I could tell you a fairytale, \r\nHold your h..."
8,Waterboys,Suffer,[Chorus] \r\nI'm gonna suffer for you no long...
9,Perry Como,A Garden In The Rain,"'Twas just a garden in the rain, \r\nClose to..."


In [6]:
df['text'][0]

"The sun creeps in,  \r\nShe says I love you with a smile on her face.  \r\nIf time could stand still  \r\nWe could lay under the covers all day  \r\n  \r\nShe makes this house a home  \r\nIn winter keeps me warm  \r\nIn summer makes it hot for me  \r\nIn the fall and spring fresh loving she brings  \r\nWhen I open up my eyes she sings  \r\n  \r\nGood morning  \r\nIt's a beautiful day  \r\nWhen I'm with her  \r\nWith me she lays  \r\n  \r\nGood morning  \r\nAnother beautiful day  \r\nThe sun rises  \r\nWhen I see your face  \r\nEveryday  \r\nGood morning  \r\nGood morning  \r\n  \r\nSometimes I wake  \r\nA little grumpy, moody  \r\nShe says no way  \r\nCause she's so positive  \r\nShe gives me love  \r\nGets me ready for my day  \r\n  \r\nShe makes this house a home  \r\nIn winter keeps me warm  \r\nIn summer makes it hot for me  \r\nIn the fall and spring fresh loving she brings  \r\nWhen I open up my eyes she sings  \r\n  \r\nGood morning  \r\nIt's a beautiful day  \r\nWhen I'm with 

In [7]:
df.shape

(5000, 3)

In [None]:
# Text Cleaning/ Text Preprocessing

In [8]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [9]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [12]:
import nltk
nltk.download('punkt_tab')

df['text'] = df['text'].apply(lambda x: tokenization(x))

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/scorpion/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [15]:
similarity[0]

array([1.        , 0.00698778, 0.03626982, ..., 0.01812453, 0.12724716,
       0.04399197])

In [16]:
df[df['song'] == 'Crying Over You']

Unnamed: 0,artist,song,text


In [21]:
def recommendation(song_df):
    if song_df not in df['song'].values:
        return f"Song '{song_df}' not found in the dataset."

    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])), reverse=True, key=lambda x: x[1])

    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)

    return songs

print(recommendation('Crying Over You'))

Song 'Crying Over You' not found in the dataset.


In [22]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))
