# Load the dataset

In [5]:
import pandas as pd

dataset = pd.read_csv('songdata.csv')
dataset

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...
...,...,...,...,...
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \nLet the angels fly l...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \nMore power \nPower to...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \nis something i'll believe \nf...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \nam i frightened \nwhere can ...


In [34]:
# Select a random sample of 5000 rows from the dataset
df = dataset.sample(n=5000)
# Drop the "link" column permanently from the dataframe
df.drop("link", axis=1, inplace=True)
# Reset the index of the dataframe, dropping the old index
df.reset_index(drop=True)


Unnamed: 0,artist,song,text
0,Hillsong United,Kingdom Come,Your love reaches out to me \nYour grace has ...
1,Will Smith,Jaden's Interlude,Jada: What are you doing? \nJaden: I'm just l...
2,Gary Numan,Me! I Disconnect From You,The alarm rang for days \nYou could tell from...
3,Leo Sayer,Something Fine,Papers lie there helplessly \nIn a pile outsi...
4,Grateful Dead,Big Boss Man,"Big boss man, can't you hear me when I call? ..."
...,...,...,...
4995,Rick Astley,Move Right Out,She comes home in the morning light \nTries t...
4996,HIM,Too Long To Love,In fear of somthing so unheard of \nShe tries...
4997,Queen,Patience,Shed a tear 'cause I'm missing you \nI'm stil...
4998,Beautiful South,Love Adjourned,Your wife's got beautiful skin \nShame it's n...


# Data Cleaning

In [35]:
# Convert the 'text' column to lowercase and remove leading non-alphanumeric characters
df['text'] = df['text'].str.lower().replace('^[a-zA-Z0-9]','')

In [36]:
# Remove any word characters followed by whitespace characters
df['text'] = df['text'].replace('\w\s','')

In [37]:
# Remove newline characters
df['text'] = df['text'].replace(r'\n','', regex=True)

In [38]:
df['text'].reset_index(drop=True, inplace=True)

In [39]:
df['text']

0       your love reaches out to me  your grace has ma...
1       jada: what are you doing?  jaden: i'm just loo...
2       the alarm rang for days  you could tell from c...
3       papers lie there helplessly  in a pile outside...
4       big boss man, can't you hear me when i call?  ...
                              ...                        
4995    she comes home in the morning light  tries to ...
4996    in fear of somthing so unheard of  she tries t...
4997    shed a tear 'cause i'm missing you  i'm still ...
4998    your wife's got beautiful skin  shame it's not...
4999    i've been living my life in a cage  freedom sp...
Name: text, Length: 5000, dtype: object

In [40]:
df['text'][0]

'your love reaches out to me  your grace has made a way to you  made a way to you  your word lives inside of me  your truth is life to all who hear  life to all who hear  we live for you  live your truth  may your kingdom come  and your will be done  as we serve your heart  serve your heart  let salvation flow  as your people pray  lord, we long for more  long for more, yeah  in you death is overcome  no power can stand against your name  the power of your name  in faith we will rise to be  your hands and feet to all the earth  life to all the earth  we live for you  live your truth  may your kingdom come  and your will be done  as we serve your heart  serve your heart  let salvation flow  as your people pray  lord, we long for more  long for more, yeah  [incomprehensible]  [incomprehensible]  [incomprehensible]  we live for you  live your truth  we long for you  to see you again  may your kingdom come  and your will be done  as we serve your heart  serve your heart  let salvation flow

In [41]:
df['text'].shape

(5000,)

# Tokenization and Stemming

In [42]:
# tokenize and stem words in a given text using the Porter Stemmer from NLTK.
import nltk
from nltk.stem.porter import PorterStemmer
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Santosh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [43]:
ps = PorterStemmer()

stemming = []
def tokenizer(text):
    
    tokens = nltk.word_tokenize(text)
    stemed_word = [ps.stem(word) for word in tokens]
        
    return ' '.join(stemed_word) 

In [44]:
tokeniked_word = nltk.word_tokenize('i am good boy. it is goodlooking painting')
tokeniked_word

['i', 'am', 'good', 'boy', '.', 'it', 'is', 'goodlooking', 'painting']

In [45]:
stemming= []
for words in tokeniked_word:
    stemed = ps.stem(words)
    stemming.append(stemed)
    ' '.join(stemming)
stemming

['i', 'am', 'good', 'boy', '.', 'it', 'is', 'goodlook', 'paint']

# Potential Error

* The error message LookupError: Resource punkt not found. means that the NLTK Punkt tokenizer is not installed. 
* The Punkt tokenizer is a tool that is used to split text into sentences and tokens.

In [46]:
tokenizer("helo i am good. this pot is goodlooking. i love book. it is so lovely")

'helo i am good . thi pot is goodlook . i love book . it is so love'

# Description of Apply function and Lambda Function

* The code df['text'].apply(lambda x: tokenizer(x)) applies the function tokenizer() to each element in the text column of the DataFrame df. The lambda function is a Python function that allows you to create a function on the fly. In this case, the lambda function takes a single argument, x, and returns the result of calling the tokenizer() function on x.


* The apply() method applies a function to each element in a DataFrame or Series. In this case, the apply() method is applying the tokenizer() function to each element in the text column of the DataFrame df.


* The output of the apply() method is a new DataFrame or Series that contains the results of applying the function to each element in the original DataFrame or Series. In this case, the output of the apply() method is a new DataFrame that contains the results of tokenizing each element in the text column of the DataFrame df.


* Here is an example of how to use the apply() method to tokenize the text in a DataFrame:

```Python
import pandas as pd
from nltk.tokenize import word_tokenize

# Create a DataFrame
df = pd.DataFrame({'text': ['This is a sentence.', 'This is another sentence.']})

# Tokenize the text in the DataFrame
tokenized_df = df['text'].apply(lambda x: word_tokenize(x))

# Print the tokenized DataFrame
print(tokenized_df)
```

* The apply() method is a powerful tool for applying functions to DataFrames and Series. It can be used to perform a variety of tasks, such as cleaning data, transforming data, and creating new features.

In [17]:
# apply this function to the df['text']

from tqdm import tqdm
tqdm.pandas()
df['text'].progress_apply(tokenizer)

100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:14<00:00, 333.94it/s]


0       when white cloud go sailin ' to make my wed go...
1       let me introduc myself i 'm a woman that you '...
2       lone man upon the shore what on earth are you ...
3       i love to worship you , my god i love to worsh...
4       turn around you say 'scuse the 'tude but i hav...
                              ...                        
4995    well , your railroad gate , you know i just ca...
4996    everyth around is natur do n't fight it do n't...
4997    underneath the sky of red is a storytel sleep ...
4998    [ vers 1 ] togeth , togeth , togeth everyon to...
4999    just when everi ray of hope wa gone i should h...
Name: text, Length: 5000, dtype: object

In [47]:
type(df['text'])
type(df)

pandas.core.frame.DataFrame

# Text Vectorization and Cosine Similarity

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf = TfidfVectorizer()
matrix = tfidf.fit_transform(df['text'])
matrix.shape

(5000, 24882)

In [50]:
similarity = cosine_similarity(matrix)
similarity

array([[1.        , 0.0303888 , 0.02988224, ..., 0.07806906, 0.10787166,
        0.08032631],
       [0.0303888 , 1.        , 0.03089626, ..., 0.05650621, 0.03916779,
        0.04278121],
       [0.02988224, 0.03089626, 1.        , ..., 0.06115006, 0.06137702,
        0.10741548],
       ...,
       [0.07806906, 0.05650621, 0.06115006, ..., 1.        , 0.09631003,
        0.12654159],
       [0.10787166, 0.03916779, 0.06137702, ..., 0.09631003, 1.        ,
        0.11912992],
       [0.08032631, 0.04278121, 0.10741548, ..., 0.12654159, 0.11912992,
        1.        ]])

# Recommend Songs

In [51]:
df['song'].reset_index(drop=True)

0                    Kingdom Come
1               Jaden's Interlude
2       Me! I Disconnect From You
3                  Something Fine
4                    Big Boss Man
                  ...            
4995               Move Right Out
4996             Too Long To Love
4997                     Patience
4998               Love Adjourned
4999         The Monster Is Loose
Name: song, Length: 5000, dtype: object

In [52]:
# 1. get the most repeated songs
df[df['song'] =='The Big Sky']
df[df['song'] =='EThe Big Sky']
df[df['song'] =='The Big Sky']
songs_list = sorted(list(enumerate(similarity[0])), reverse=True, key=lambda x: x[1])[1:6]
songs_list

songs_index = []
def rec():
    for i in songs_list:
        songs_index.append(df.iloc[i[0]].song)
        
    return songs_index
rec() 

# df.iloc[0].song

['I Live To Know You',
 'I Will Run To You',
 "I've Come To Serve",
 'One More Broken Heart',
 'One More Song For You']

In [53]:
def recommend(song):
    idx = df[df['song'] == song]
    songs_list = sorted(list(enumerate(similarity[0])), reverse=True, key=lambda x: x[1])[1:6]
    songs_name = []
    for i in songs_list:
        
        songs_name.append(df.iloc[i[0]].song)
        
    return songs_name
    

In [54]:
recommend('I Pulled My Groin')

['I Live To Know You',
 'I Will Run To You',
 "I've Come To Serve",
 'One More Broken Heart',
 'One More Song For You']

In [55]:
import pickle 

# Pickle

Converting a Python object into a string representation that can be stored or transmitted. Deserialization is the process of converting a serialized string representation back into a Python object.Pickling is a useful way to store Python objects in a database, cache them in memory, or send them over a network. It can also be used to create snapshots of Python objects that can be restored later.

To pickle an object, you can use the pickle.dumps() function. The pickle.dumps() function takes a Python object as input and returns a serialized string representation of the object.

To deserialize an object, you can use the pickle.loads() function. The pickle.loads() function takes a serialized string representation of an object as input and returns the Python object.


In [57]:
# Save the dataframe to a pickle file
with open('df.pkl', 'wb') as f:
    pickle.dump(df, f)

In [58]:
# Pickle the object
my_object = {"name": "Alice", "age": 25}

pickled_object = pickle.dumps(my_object)
pickled_object

b'\x80\x04\x95\x1c\x00\x00\x00\x00\x00\x00\x00}\x94(\x8c\x04name\x94\x8c\x05Alice\x94\x8c\x03age\x94K\x19u.'

In [59]:
# Deserialze the object
deserialized_object = pickle.loads(pickled_object)

# Print the deserialized object
print(deserialized_object)


{'name': 'Alice', 'age': 25}


In [100]:
# Serialized Object using dump method
with open('rupesh.pkl' ,'wb') as f:
    pickle.dump(df,f)

In [60]:
# Deserialized Object to read the contents
with open('df.pkl', 'rb') as f:
    dataFrame = pickle.load(f)

In [61]:
dataFrame

Unnamed: 0,artist,song,text
35988,Hillsong United,Kingdom Come,your love reaches out to me your grace has ma...
56500,Will Smith,Jaden's Interlude,jada: what are you doing? jaden: i'm just loo...
6533,Gary Numan,Me! I Disconnect From You,the alarm rang for days you could tell from c...
41501,Leo Sayer,Something Fine,papers lie there helplessly in a pile outside...
34441,Grateful Dead,Big Boss Man,"big boss man, can't you hear me when i call? ..."
...,...,...,...
17610,Rick Astley,Move Right Out,she comes home in the morning light tries to ...
7909,HIM,Too Long To Love,in fear of somthing so unheard of she tries t...
49349,Queen,Patience,shed a tear 'cause i'm missing you i'm still ...
24887,Beautiful South,Love Adjourned,your wife's got beautiful skin shame it's not...


In [65]:
# dump the similarity matrix 
pickle.dump(df, open('df.pkl', 'wb'))
# Similarily dump the similarity matrix 
pickle.dump(similarity, open('song_similarity_matrix.pkl', 'wb'))

In [66]:
df = pickle.load(open('df.pkl','rb'))
df['song'].values

array(['Kingdom Come', "Jaden's Interlude", 'Me! I Disconnect From You',
       ..., 'Patience', 'Love Adjourned', 'The Monster Is Loose'],
      dtype=object)

In [67]:
import pandas as pd
print(pd.__version__)

1.4.2


In [68]:
import sys
memory_uses = sys.getsizeof(similarity)
memory_use = memory_uses/1024/1024
memory_use

190.7349853515625

In [69]:
import zlib

In [70]:
compressed_array = zlib.compress(similarity)
compressed_array

In [71]:
compressed_size = sys.getsizeof(compressed_array)
compressed_size/1024/1024

179.37175941467285