# Introduction to NLP

In [1]:
#import sys
#!conda install --yes --prefix {sys.prefix} nltk

In [2]:
import nltk
# Fetch the WordNet corpus into the local nltk_data directory; the call's
# return value (shown as the cell output) reports whether it succeeded.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP.DESKTOP-
[nltk_data]     G8GFKGS\AppData\Roaming\nltk_data...


True

In [3]:
from nltk.corpus import wordnet as wn

# Human-readable labels for the part-of-speech codes returned by Synset.pos().
poses = {
    'n': 'noun',
    'v': 'verb',
    's': 'adj(s)',
    'a': 'adj',
    'r': 'adv',
}

# A synset groups cognitive synonyms that all express one distinct concept.
# Print one line per synset of "general": its part of speech and its lemma names.
for synset in wn.synsets("general"):
    lemma_names = ", ".join(lemma.name() for lemma in synset.lemmas())
    print(f"{poses[synset.pos()]}: {lemma_names}")

noun: general, full_general
noun: general, superior_general
noun: general
verb: general
adj: general
adj(s): general
adj(s): general
adj: general
adj(s): general
adj(s): cosmopolitan, ecumenical, oecumenical, general, universal, worldwide, world-wide


In [4]:
from nltk.corpus import wordnet as wn

# First noun sense of "general".
general = wn.synset("general.n.1")


def hyper(s):
    """Return the direct hypernyms (more general concepts) of synset ``s``."""
    return s.hypernyms()


# Transitive closure: every synset reachable by repeatedly following hypernym links.
list(general.closure(hyper))

[Synset('general_officer.n.01'),
 Synset('commissioned_military_officer.n.01'),
 Synset('commissioned_officer.n.01'),
 Synset('military_officer.n.01'),
 Synset('serviceman.n.01'),
 Synset('skilled_worker.n.01'),
 Synset('worker.n.01'),
 Synset('person.n.01'),
 Synset('organism.n.01'),
 Synset('causal_agent.n.01'),
 Synset('living_thing.n.01'),
 Synset('physical_entity.n.01'),
 Synset('whole.n.02'),
 Synset('entity.n.01'),
 Synset('object.n.01')]

In [5]:
def hypo(s):
    """Return the direct hyponyms (more specific concepts) of synset ``s``."""
    return s.hyponyms()


# Transitive closure: every synset reachable by repeatedly following hyponym links.
list(general.closure(hypo))

# Data Gathering

[Synset('blucher.n.01')]

In [7]:
# Open Multilingual Wordnet: lets WordNet be queried with words from other languages.
nltk.download('omw-1.4')

# Look up the Italian word "fermo" and print, for each matching synset,
# its part of speech followed by its lemma names.
for synset in wn.synsets("fermo", lang="ita"):
    lemma_names = ", ".join(lemma.name() for lemma in synset.lemmas())
    print(f"{poses[synset.pos()]}: {lemma_names}")

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\HP.DESKTOP-
[nltk_data]     G8GFKGS\AppData\Roaming\nltk_data...


noun: apprehension, arrest, catch, collar, pinch, taking_into_custody
noun: locking, lockup
adj(s): inactive, motionless, static, still
adj(s): firm, steadfast, steady, stiff, unbendable, unfaltering, unshakable, unwavering
noun: catch


# Gathering lyrics data

In this example we will use song lyrics in our analysis.
In Python, two alternatives for scraping lyrics are:

- lyricsgenius
- lyricsmaster 

You can install them using:

In [7]:
# Install the package in the current Jupyter kernel
import sys
# Alternative for conda environments (left disabled):
#!conda config --append channels conda-forge
#!conda install --yes --prefix {sys.prefix} lyricsgenius

# Run pip through sys.executable (the interpreter running this kernel) so the
# package is installed into the kernel's own environment rather than whichever
# `pip` happens to be first on PATH.
!{sys.executable} -m pip install lyricsgenius



## Lyricsgenius

In [12]:
# Get an API client access token at http://genius.com/api-clients and expose it
# to the notebook through the GENIUS_ACCESS_TOKEN environment variable, so the
# secret never has to be written into (or committed with) the notebook itself.
import os

import lyricsgenius

# Falls back to the placeholder string when the variable is not set; replace the
# placeholder only for throwaway local experiments.
genius = lyricsgenius.Genius(os.environ.get("GENIUS_ACCESS_TOKEN", "your-api-secret"))

artist = 'Michael Jackson'

# Fetch the artist together with (at most) their first two songs.
g_artist = genius.search_artist(artist, max_songs=2)

Searching for songs by Michael Jackson...

Song 1: "Billie Jean"
Song 2: "Smooth Criminal"

Reached user-specified song limit (2).
Done. Found 2 songs.


In [26]:
# List the public attributes of the first song. Filtering on the leading
# underscore (instead of skipping a fixed number of entries) keeps the result
# correct if the number of private/dunder attributes changes between versions.
[attr for attr in dir(g_artist.songs[0]) if not attr.startswith('_')]

['annotation_count',
 'api_path',
 'artist',
 'full_title',
 'header_image_thumbnail_url',
 'header_image_url',
 'id',
 'lyrics',
 'lyrics_owner_id',
 'lyrics_state',
 'path',
 'primary_artist',
 'pyongs_count',
 'save_lyrics',
 'song_art_image_thumbnail_url',
 'song_art_image_url',
 'stats',
 'title',
 'title_with_featured',
 'to_dict',
 'to_json',
 'to_text',
 'url']

In [27]:
# Display the raw lyrics text of the first song exactly as returned by the client
# (the output shows it still contains a page header, section tags and newlines).
g_artist.songs[0].lyrics

'247 ContributorsTranslationsEspañolPortuguêsHebrewEnglishBillie Jean Lyrics[Verse 1]\nShe was more like a beauty queen from a movie scene\nI said, "Don\'t mind, but what do you mean, I am the one\nWho will dance on the floor in the round?"\nShe said I am the one\nWho will dance on the floor in the round\nShe told me her name was Billie Jean as she caused a scene\nThen every head turned with eyes that dreamed of bein\' the one\nWho will dance on the floor in the round\n\n[Pre-Chorus 1]\nPeople always told me, "Be careful of what you do\nDon\'t go around breakin\' young girls\' hearts" (Hee-hee)\nAnd mother always told me, "Be careful of who you love\nAnd be careful of what you do (Oh-oh)\n\'Cause the lie becomes the truth" (Oh-oh), hey-ey\n\n[Chorus]\nBillie Jean is not my lover, uh\nShe\'s just a girl who claims that I am the one (Oh, baby)\nBut the kid is not my son (Hoo)\nShe says I am the one (Oh, baby)\nBut the kid is not my son (Hee-hee-hee, no-no, hee-hee-hee, hoo)\n\n[Verse 2]\

In [37]:
import re
import string

# Character class matching any single ASCII punctuation character.
regex = re.compile('[%s]' % re.escape(string.punctuation))

# The raw text starts with a page header that ends in "<song title> Lyrics"
# (e.g. "247 Contributors...Billie Jean Lyrics"). Its length depends on the
# contributor count and translation list, so locate the marker instead of
# slicing off a fixed number of characters.
raw_lyrics = g_artist.songs[0].lyrics
header_marker = g_artist.songs[0].title + ' Lyrics'
marker_pos = raw_lyrics.find(header_marker)
if marker_pos != -1:
    lyrics_body = raw_lyrics[marker_pos + len(header_marker):]
else:
    # No recognisable header: keep the whole text rather than cutting blindly.
    lyrics_body = raw_lyrics

# Replace punctuation with spaces, then flatten line breaks into spaces.
txt1 = regex.sub(' ', lyrics_body)
txt2 = txt1.replace('\n', ' ')
txt2

' Verse 1  She was more like a beauty queen from a movie scene I said   Don t mind  but what do you mean  I am the one Who will dance on the floor in the round   She said I am the one Who will dance on the floor in the round She told me her name was Billie Jean as she caused a scene Then every head turned with eyes that dreamed of bein  the one Who will dance on the floor in the round   Pre Chorus 1  People always told me   Be careful of what you do Don t go around breakin  young girls  hearts   Hee hee  And mother always told me   Be careful of who you love And be careful of what you do  Oh oh   Cause the lie becomes the truth   Oh oh   hey ey   Chorus  Billie Jean is not my lover  uh She s just a girl who claims that I am the one  Oh  baby  But the kid is not my son  Hoo  She says I am the one  Oh  baby  But the kid is not my son  Hee hee hee  no no  hee hee hee  hoo    Verse 2  For forty days and for forty nights  the law was on her side But who can stand when she s in demand  Her s

In [16]:
# Artists whose lyrics are stored as "lyrics_top50/<name>.txt".
artists = ['ABBA', 'David_Bowie', 'Janis_Joplin', 'Michael_Jackson', 'Queen',
           'Rolling_Stones', 'The_Clash', 'Bob_Dylan', 'Elton_John', 'Led_Zeppelin',
           'Pink_Floyd', 'Ramones', 'The_Beatles', 'The_Doors']

# Map each artist name to the full text of its lyrics file on a single line.
data = {}
for a in artists:
    # NOTE(review): the file is decoded with the platform default encoding;
    # pass an explicit `encoding=` once the encoding of the files is confirmed.
    with open("lyrics_top50/" + a + ".txt", "r") as file:
        # Plain str.replace is enough for a literal newline, so this cell does
        # not depend on `re` having been imported by another cell.
        data[a] = file.read().replace('\n', ' ')

In [17]:
# Sanity check: show the artist names that were loaded into `data`.
data.keys()

dict_keys(['ABBA', 'David_Bowie', 'Janis_Joplin', 'Michael_Jackson', 'Queen', 'Rolling_Stones', 'The_Clash', 'Bob_Dylan', 'Elton_John', 'Led_Zeppelin', 'Pink_Floyd', 'Ramones', 'The_Beatles', 'The_Doors'])

In [38]:
# Preview the first 2000 characters of the ABBA lyrics text.
data['ABBA'][:2000]

"[Verse 1] I, I've been in love before I thought I would no more Manage to hit the ceiling Still, strange as it seems to me You brought it back to me That old feeling  [Verse 2] I, I don't know what you do You make me think that you Possibly could release me I think you'll be able to Make all my dreams come true And you ease me  [Chorus] You thrill me, you delight me You please me, you excite me You're something I'd been pleading for I love you, I adore you I've laid my life before you I'll have you want me more and more And finally it seems my lonely days are through I've been waiting for you  [Verse 3] I, I'm gonna make you mine You're gonna feel so fine You'll never want to leave me I feel you belong to me Someday you will agree Please, believe me  [Chorus] You thrill me, you delight me You please me, you excite me You're something I'd been pleading for I love you, I adore you I've laid my life before you I'll have you want me more and more And finally it seems my lonely days are th

In [39]:
import pandas as pd

# One row per artist (the dict keys become the index) and a single 'lyrics'
# column holding that artist's full lyrics text.
data_df = pd.Series(data, name='lyrics').to_frame()
data_df

Unnamed: 0,lyrics
ABBA,"[Verse 1] I, I've been in love before I though..."
David_Bowie,[Intro] [Verse 1] A small Jean Genie snuck of...
Janis_Joplin,"[Intro] Oh, come on, come on, come on, come on..."
Michael_Jackson,"[Verse 1] Your butt is mine, gonna tell you ri..."
Queen,[Verse 1] I can dim the lights and sing you so...
Rolling_Stones,[Intro] What a drag it is getting old [Verse ...
The_Clash,Stay around don't play around This old town an...
Bob_Dylan,[Verse 1] Go away from my window Leave at your...
Elton_John,[Verse 1] Can you hear it in the distance? Can...
Led_Zeppelin,[Intro] Hey That's right [Verse 1] Asked swee...


In [None]:
# Write the lyrics table to 'lyrics.csv' (relative path) using the to_csv defaults.
data_df.to_csv('lyrics.csv')