In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [2]:
# Load the raw lyrics table; the path is resolved relative to the notebook's working directory.
# NOTE(review): the preview further down shows an 'Unnamed: 0' column, which suggests the CSV
# was saved together with its index - consider index_col=0 if that column is not needed.
df = pd.read_csv('lana_lyrics.csv')

In [3]:
# Discard every row that has a missing value in any column (index labels are kept as they are)
df = df.dropna()

In [4]:
# Positional lookup: dropna() leaves gaps in the index labels, so label 0 is not guaranteed to exist
df['lyrics'].iloc[0]

'Text Book Lyrics[Verse 1]\nI guess you could call it textbook\nI was lookin\' for the father I wanted back\nAnd I thought I found it in Brentwood\nIt seemed only appropriate you\'d easily have my back\n\n[Pre-Chorus]\nAnd then there was the issue of her\nI didn\'t even like myself, or love the life I had\nAnd there you were with shining stars\nStandin\' blue with open arms\nYou touched the detriment most of the friends I knew already had\n\n[Chorus]\nYou\'ve got a Thunderbird, my daddy had one, too\nLet\'s rewrite history, I\'ll do this dance with you\nYou know I\'m not that girl, you know I\'ll never be\nMaybe just the way we\'re different could set me free\nAnd there we were, screamin\' "Black Lives Matter" in the crowd\nBy the Old Man River, and I saw you saw who I am\nGod, I wish I was with my father\nHe could see us in all our splendor\nAll the things I couldn\'t want for him\nI screamed for them, oh-oh-oh\nI screamed for them, ah-ah\n[Verse 2]\nCould we do this dance again?\nDo 

## Removendo caracteres indesejados

In [5]:
def remove_title(text):
    """Drop the header that precedes the first section tag of a song.

    Everything from the start of ``text`` up to (but excluding) the first ``[``
    is removed, e.g. ``'Text Book Lyrics[Verse 1]...'`` -> ``'[Verse 1]...'``.

    Parameters
    ----------
    text : str
        Raw lyrics of one song.

    Returns
    -------
    str
        ``text`` without the leading header. Text that contains no ``[`` at all
        is returned unchanged.
    """
    # Without this guard the pattern below matches the whole string when there is
    # no '[' anywhere, which would silently erase the complete lyrics of the song.
    if '[' not in text:
        return text
    return re.sub(r'^[^\[]+', '', text)

In [6]:
# Cut off whatever precedes the first section tag of every song
df['lyrics'] = df['lyrics'].apply(remove_title)

In [7]:
def remove_ponctuation(text):
    """Delete every character that is neither a word character nor whitespace.

    Apostrophes, hyphens, brackets, quotes and commas are dropped without a
    replacement, so ``"you'd"`` becomes ``"youd"`` and ``"[Verse 1]"`` becomes
    ``"Verse 1"``. Underscores count as word characters and are kept.
    """
    non_word_non_space = r'[^\w\s]'
    return re.sub(non_word_non_space, '', text)

In [8]:
# Strip punctuation from every song (this also removes the brackets around the section tags)
df['lyrics'] = df['lyrics'].apply(remove_ponctuation)

In [9]:
def remove_end(text):
    """Remove the last run of non-whitespace characters at the end of ``text``.

    Only the final token is affected; if the text ends in whitespace (other
    than a single trailing newline) nothing is removed.
    NOTE(review): presumably this targets a footer token appended by the
    lyrics source - confirm against the raw data.
    """
    trailing_token = r'[^\s]+$'
    return re.sub(trailing_token, '', text)

In [10]:
# Drop the final token of every song
df['lyrics'] = df['lyrics'].apply(remove_end)

In [11]:
def remove_linebreak(text):
    """Turn every newline into a single space so the lyrics form one line.

    Consecutive newlines therefore produce consecutive spaces.
    """
    newline = re.compile(r'\n')
    return newline.sub(' ', text)

In [12]:
# Flatten every song onto a single line
df['lyrics'] = df['lyrics'].apply(remove_linebreak)

In [13]:
def remove_verse(text):
    """Remove section labels and the artist's name from flattened lyrics.

    Expects text whose punctuation and line breaks were already removed, so a
    tag such as ``[Verse 1]`` appears as ``Verse 1`` followed by whitespace.
    Matching is case-sensitive, so it must run before the text is lower-cased.

    Parameters
    ----------
    text : str
        Lyrics of one song on a single line.

    Returns
    -------
    str
        The text without ``Verse <digit>``, ``PreChorus`` and ``Chorus`` labels
        and without the standalone words ``Lana``, ``Del``, ``Rey`` and ``Ray``.
        Surrounding whitespace is left untouched.
    """
    # Order matters: 'PreChorus' has to go before 'Chorus', otherwise a stray 'Pre' is left behind.
    for label_pattern in (r'Verse\s\d{1}\s', r'PreChorus', r'Chorus'):
        text = re.sub(label_pattern, '', text)
    # Match the name parts as whole words only; removing them as raw substrings
    # would mangle ordinary words such as 'Delicate' or 'Rays'.
    text = re.sub(r'\b(?:Lana|Del|Rey|Ray)\b', '', text)
    return text

In [14]:
# Remove section labels and artist-name words from every song
df['lyrics'] = df['lyrics'].apply(remove_verse)

In [15]:
# Positional lookup: the index labels have gaps after dropna(), so label 0 is not guaranteed to exist
df['lyrics'].iloc[0]

'I guess you could call it textbook I was lookin for the father I wanted back And I thought I found it in Brentwood It seemed only appropriate youd easily have my back   And then there was the issue of her I didnt even like myself or love the life I had And there you were with shining stars Standin blue with open arms You touched the detriment most of the friends I knew already had   Youve got a Thunderbird my daddy had one too Lets rewrite history Ill do this dance with you You know Im not that girl you know Ill never be Maybe just the way were different could set me free And there we were screamin Black Lives Matter in the crowd By the Old Man River and I saw you saw who I am God I wish I was with my father He could see us in all our splendor All the things I couldnt want for him I screamed for them ohohoh I screamed for them ahah Could we do this dance again Do you think if I go blonde we could get our old love back I guess this is really the end I never felt jealous before this yea

## Colocando em minúsculo

In [16]:
# Lower-case only now: the label clean-up in remove_verse matches capitalised words
df = df.assign(lyrics=df['lyrics'].str.lower())

## Tokenização

In [17]:
nltk.download('punkt')
# NLTK 3.9+ loads the tokenizer data from 'punkt_tab' instead of 'punkt'; fetching both
# keeps word_tokenize working regardless of the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rian2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
# Split every song into a list of word tokens
df['lyrics'] = df['lyrics'].map(word_tokenize)

## Removendo as stopwords

In [19]:
# Fetch the stop-word corpus that the following cells read
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rian2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
# English stop-word list from the NLTK 'stopwords' corpus
stopwords = nltk.corpus.stopwords.words('english')

In [21]:
# Strip apostrophes from the stop words so they match the lyrics, whose punctuation was
# already removed. The list is joined into one string, cleaned, then split back into words.
stopwords = ' '.join(stopwords).replace("'","").split()

In [22]:
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not stop words.

    Parameters
    ----------
    text : list of str
        Tokens of one song.

    Returns
    -------
    list of str
        The tokens, in their original order, that do not occur in the
        module-level ``stopwords`` collection.
    """
    # A set gives constant-time membership tests; scanning the stop-word list once
    # per token is needlessly slow for long lyrics.
    stopword_set = set(stopwords)
    return [item for item in text if item not in stopword_set]

In [23]:
# Filter the stop words out of every token list
df['lyrics'] = df['lyrics'].apply(remove_stopwords)

## Lematização

In [24]:
# Shared lemmatizer instance, built from the class imported at the top of the notebook
lemmatizer = WordNetLemmatizer()

In [25]:
# Fetch the WordNet data used by the WordNet lemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rian2\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [26]:
def lemmatize_text(text):
    """Lemmatize each token of ``text`` with the module-level ``lemmatizer``.

    Parameters
    ----------
    text : list of str
        Tokens of one song.

    Returns
    -------
    list of str
        One lemma per input token, in the same order.
    """
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [27]:
# Reduce every token to its lemma
df['lyrics'] = df['lyrics'].apply(lemmatize_text)

## Ajustes finais

In [28]:
# Create the 'lyrics_str' column filled with NaN.
# NOTE(review): the next cell assigns the whole column again, so this placeholder looks redundant.
df['lyrics_str'] = np.nan

In [29]:
# Glue each token list back into one space-separated string
df['lyrics_str'] = df['lyrics'].map(' '.join)

In [30]:
# Preview the first rows: token lists in 'lyrics', joined text in 'lyrics_str'
df.head()

Unnamed: 0,album,title,lyrics,lyrics_str
0,Blue Banisters,Text Book,"[guess, could, call, textbook, lookin, father,...",guess could call textbook lookin father wanted...
1,Blue Banisters,Blue Banisters,"[there, picture, wall, john, deere, jenny, han...",there picture wall john deere jenny handed bee...
2,Blue Banisters,Arcadia,"[body, map, la, stand, straight, like, angel, ...",body map la stand straight like angel halo han...
4,Blue Banisters,Black Bathing Suit,"[grenadine, quarantine, like, lot, la, hey, zo...",grenadine quarantine like lot la hey zoom targ...
5,Blue Banisters,If You Lie Down with Me,"[put, red, boot, baby, giddy, baby, want, danc...",put red boot baby giddy baby want dance baby g...


In [31]:
# Write the cleaned table without the index.
# NOTE(review): 'lyrics' holds Python lists, which the CSV stores as their string
# representation; 'lyrics_str' is the column that reads back as plain text.
df.to_csv('lana_lyrics_clean.csv', index = False)