In [1]:
import pandas as pd # type: ignore

In [3]:
df = pd.read_csv("spotify_millsongdata.csv")

In [4]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [5]:
df.tail(5)

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [6]:
df.shape

(57650, 4)

In [7]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [8]:
df =df.sample(5000).drop('link', axis=1).reset_index(drop=True)

In [9]:
df.head(10)

Unnamed: 0,artist,song,text
0,Norah Jones,Don't Know Why,I waited 'til I saw the sun \r\nI don't know ...
1,Enrique Iglesias,Somebody's Me,"You, do you remember me? \r\nLike, I remember..."
2,Michael W. Smith,Above All,Above all powers \r\nAbove all kings \r\nAbo...
3,Mariah Carey,Me. I Am Mariah... The Elusive Chanteuse,On the back cover of this album is a personal ...
4,Within Temptation,Jillian,I've Been Dreaming For So Long \r\nTo Find Th...
5,Josh Groban,O Come All Ye Faithful,"O come let us adore him, \r\nO come let us ad..."
6,Queens Of The Stone Age,If Only,If it gets you down well then I'll take it \r...
7,Bon Jovi,It's Just Me,J. Bon jovi \r\n \r\nYou know your favorite ...
8,Allman Brothers Band,Come On In My Kitchen,You better come on into my kitchen \r\nCause ...
9,Point Of Grace,Got To Be Time,So sad \r\nDepression deep inside has got you...


In [10]:
df['text'][0]

"I waited 'til I saw the sun  \r\nI don't know why I didn't come  \r\nI left you by the house of fun  \r\nI don't know why I didn't come  \r\nI don't know why I didn't come  \r\n  \r\nWhen I saw the break of day  \r\nI wished that I could fly away  \r\nInstead of kneeling in the sand  \r\nCatching teardrops in my hand  \r\n  \r\nMy heart is drenched in wine  \r\nBut you'll be on my mind  \r\nForever  \r\n  \r\nOut across the endless sea  \r\nI would die in ecstasy  \r\nBut I'll be a bag of bones  \r\nDriving down the road alone  \r\n  \r\nMy heart is drenched in wine  \r\nBut you'll be on my mind  \r\nForever  \r\n  \r\nSomething has to make you run  \r\nI don't know why I didn't come  \r\nI feel as empty as a drum  \r\nI don't know why I didn't come  \r\nI don't know why I didn't come\r\n\r\n"

In [15]:
# df = df.sample(5000)

In [11]:
df.shape

(5000, 3)

Text Cleaning/ Text Preprocessing

In [12]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [22]:
def simple_tokenization(txt):
    tokens = txt.split()  # Simple space-based tokenization
    stemming = [stemmer.stem(w) for w in tokens] # type: ignore
    return " ".join(stemming)
df['text'] = df['text'].apply(lambda x: simple_tokenization(x))

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer # type: ignore
from sklearn.metrics.pairwise import cosine_similarity # type: ignore

In [16]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [17]:
similarity[0]

array([1.        , 0.00938648, 0.01034798, ..., 0.08951307, 0.15101842,
       0.10949705])

In [24]:
df[df['song'] == 'Come Back Home']

Unnamed: 0,artist,song,text
4999,Carly Simon,Come Back Home,"summer time, kid in the street we were up on t..."


In [19]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [25]:
recommendation('Come Back Home')

["I'm Gonna Do Somebody Right",
 'Clock Strikes Ten',
 "I AIN'T GONNA LOVE YOU ANYMORE",
 'Who Will You Run To',
 'Our Time Is Gonna Come',
 "When You're Gone",
 'Back In The Race',
 'Im Gonna Love You',
 "Lord I'm Gonna Love You",
 'How Do I Break It To My Heart',
 'No More Lies',
 'Be Alright',
 'Genuine Only',
 'Heart To Heart',
 'Adam Lives In Theory',
 "Darlin'",
 'Home',
 "I'm Gonna Have A Little Talk",
 'Take You Out',
 "I'm Gonna Be A Wheel Someday"]

In [21]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))

In [23]:
print(df['song'].unique())


["Don't Know Why" "Somebody's Me" 'Above All' ... 'In Repair'
 "There's A World" 'Come Back Home']
