In [216]:
!pip3 install requests



### request
-requests is one of the most downloaded Python libraries.It allows you to send http requests to websites.

In [217]:
import requests as rq

In [218]:
response = rq.get('https://www.lyrics.com/artist/Shakira/174707') #create a request object ,with this I can download:

In [219]:
text = response.text

In [220]:
#text #HTML page

### (Simplistic) Overview of status codes:

200 = successful

300 = redirect

Everything with a 4 in the beginning will be an error on your side
400 = Error
401 = Not authorized
404 = webpage cannot be found

Everything with a 5 in the beginning will be an error on the server side.

In [221]:
response.status_code

200

### Regular Expressions are a powerful language for matching patterns in text. In this exercise, we will use an online tool that helps exploring a pattern.


In [222]:
import re 
pattern = '\/lyric\/\d+\/Shakira/[^"]+' #with this pattern i can extract the list of lyrics:
shakira_links = re.findall(pattern,text) #here the search is taking place
#print(shakira_links)

In [223]:
links = []
for link in shakira_links:
    links.append("https://www.lyrics.com" + link)

### BeautifulSoup is a Python library that helps parse HTML code for you, so you don’t have to write a lot of complex regular expressions yourself.

In [224]:
from bs4 import BeautifulSoup
import pandas as pd

In [225]:
df = pd.DataFrame(columns = ['lyric', 'singer']) #make a empty dataframe with columns (we will put songs from both singers):
df

Unnamed: 0,lyric,singer


In [226]:
singer = "Shakira"
for link in range(200):  #took only 200 songs:
    html = rq.get(links[link]) #downloading the HTML page
    soup = BeautifulSoup(html.text) #parsing the HTML page
    lyric = soup.find('pre',attrs={'id':'lyric-body-text'}) #extracting the lyric 
    lyric = re.sub('<a.*?>|</a> ', '', str(lyric)) #cleaning the anchor text
    lyric = re.sub('<pre.*?>|</a> ', '', str(lyric)) 
    lyric = re.sub('</pre.*?>|</a> ', '', str(lyric))
    df = df.append({'lyric':lyric, 'singer':singer}, ignore_index=True)

In [229]:
#df.head()

In [228]:
#df.info #402 songs are here:

In [187]:
df.to_csv("shakira.csv", index = False, sep = '\t')

### 2nd Singer : Adele

In [188]:
#https://www.lyrics.com/artist/Adele/861756
response = rq.get('https://www.lyrics.com/artist/Adele/861756')

In [189]:
text = response.text

In [190]:
pattern = '\/lyric\/\d+\/Adele/[^"]+'
adele_links = re.findall(pattern,text)

In [191]:
links = []
for link in adele_links:
    links.append("https://www.lyrics.com" + link)

In [192]:
singer = "Adele"
for link in range(200):
    html = rq.get(links[link])
    soup = BeautifulSoup(html.text)
    lyric = soup.find('pre',attrs={'id':'lyric-body-text'})
    lyric = re.sub('<a.*?>|</a> ', '', str(lyric))
    lyric = re.sub('<pre.*?>|</a> ', '', str(lyric))
    lyric = re.sub('</pre.*?>|</a> ', '', str(lyric))
    df = df.append({'lyric':lyric, 'singer':singer}, ignore_index=True)

In [193]:
df.to_csv("shakira_adele.csv", index = False, sep = '\t')

In [194]:
df.head()

Unnamed: 0,lyric,singer
0,Ladies up in here tonight\nNo fighting\nWe got...,Shakira
1,Mi vida me empezó a cambiar\nLa nocheque te co...,Shakira
2,Aquí estás\nYa no puedesdetenerte\n¿Dónde vas?...,Shakira
3,Aquí estás\nYa no puedesdetenerte\n¿Dónde vas?...,Shakira
4,Ladies up in here tonight\nNo fighting\nWe got...,Shakira


In [195]:
X = df['lyric']
y = df['singer']

### CountVectorizer is a great tool provided by the scikit-learn library in Python. It is used to transform a given text into a vector on the basis of the frequency (count) of each word that occurs in the entire text.

In [196]:
from sklearn.feature_extraction.text import CountVectorizer #Counting text into numbers
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1000)
cv = CountVectorizer()
cv.fit(X_train)

CountVectorizer()

In [197]:
X_train = cv.transform(X_train)
X_test  = cv.transform(X_test)

In [198]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.9875


In [199]:
from sklearn.naive_bayes import MultinomialNB

In [200]:
mnb =  MultinomialNB() 
mnb.fit(X_train, y_train)
score = mnb.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.925


In [201]:
mnb =  MultinomialNB() 
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.9875


### prediction (which lyric for which singer)

In [212]:
text = ["yellow submarine bla bla"]
c = cv.transform(text)
classifier.predict(c)
classifier.predict_proba(c)

array([[0.96637697, 0.03362303]])

In [213]:
classifier.predict(c) #predictions

array(['Adele'], dtype=object)

In [214]:
classifier.predict_proba(c) #probability

array([[0.96637697, 0.03362303]])