# Lyrics Graph

Create a weighted, undirected graph that can be loaded into [Gephi](https://gephi.org/) and laid out with its ForceAtlas algorithm.

Graph:
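* Nodes: words; the node weight is the sum of the word's TF-IDF values over all songs.
* Edges: word pairs; the edge weight is the dot product of the two words' TF-IDF column vectors (pairs with a weight of 0.01 or less get no edge).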

In [1]:
import os
import os.path
import json
from itertools import combinations, islice

In [2]:
import pandas as pd
import seaborn as sns

import numpy as np
from scipy.spatial import distance

from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
import networkx as nx # Graph library

## Load Data

In [4]:
# Directory that holds the scraped lyrics, one or more JSON files.
data_path = os.path.join('..','..','04_week','project','data')
# os.listdir() returns entries in arbitrary order; sort them so the songs are
# loaded in the same order on every machine and on every run.
json_files = sorted(file for file in os.listdir(data_path) if file.endswith('.json'))

In [5]:
# Concatenate the records of all JSON files into one flat list.
lyrics = []
for json_file in json_files:
    # JSON text is UTF-8 by specification. UTF-8 decodes pure-ASCII files
    # identically, whereas the 'ascii' codec raises UnicodeDecodeError as soon
    # as a file contains a single non-ASCII character (e.g. an accented letter).
    with open(os.path.join(data_path, json_file), encoding='utf-8') as file:
        # extend() appends the elements of the file's top-level JSON value one by one.
        lyrics.extend(json.load(file))

In [6]:
# Build the song table and discard the search term column in one expression,
# so re-running the cell never mutates an already displayed frame.
df = pd.DataFrame(data=lyrics).drop(columns=['artist_searched'])
df.head()

Unnamed: 0,artist_found,song_title,lyrics
0,Creedence Clearwater Revival,Born on the Bayou,Now when I was just a little boy \nStandin' to...
1,Creedence Clearwater Revival,Travelin' Band,Seven thirty seven comin' out of the sky\nWon'...
2,Creedence Clearwater Revival,Up Around the Bend,There's a place up ahead and I'm goin'\nJust a...
3,Creedence Clearwater Revival,Fortunate Son,"Some folks are born, made to wave the flag\nOo..."
4,Creedence Clearwater Revival,Down on the Corner,Early in the evenin' just about supper time\nO...


In [7]:
# (number of songs, number of columns)
df.shape

(1401, 3)

## TF-IDF Transformation

In [8]:
# Every song's lyrics is one document of the corpus.
corpus = df['lyrics']

# ignore words with numbers, thanks to https://stackoverflow.com/a/29375664
# The pattern only matches runs of two or more word characters that contain no
# digit, so tokens with digits and single-character tokens are dropped.
token_pattern = r"(?u)\b[^\d\W]{2,}\b"

# max_features caps the vocabulary at 200 terms, i.e. X gets 200 columns.
vectorizer = TfidfVectorizer(token_pattern=token_pattern, max_features=200)
# Sparse TF-IDF matrix: one row per song, one column per vocabulary term.
X = vectorizer.fit_transform(corpus)

In [9]:
#sns.heatmap(X.todense(), vmax=0.1)

In [10]:
# (number of songs, number of vocabulary terms)
X.shape

(1401, 200)

In [11]:
# Transpose so that every row belongs to one word and every column to one song.
X_ = X.T

In [12]:
def row_vector(index, X):
    """Return row ``index`` of the sparse matrix ``X`` as a flat 1-D numpy array.

    Indexing the sparse matrix and calling ``todense()`` gives a 2-D
    ``numpy.matrix`` with a single row; it is converted to a plain ``ndarray``
    and that single row is returned.
    """
    dense_row = X[index].todense()
    as_ndarray = np.array(dense_row)
    return as_ndarray[0]

## Graph

### Init

In [13]:
# Empty undirected graph; nodes (words) and weighted edges are added below.
G = nx.Graph()

### Nodes

In [14]:
# Vocabulary in column order. get_feature_names() was removed in
# scikit-learn 1.2; use its replacement get_feature_names_out() when available
# and fall back to the old method on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense view of the TF-IDF matrix for inspection: one row per song, one column
# per word. toarray() returns a plain ndarray instead of a numpy.matrix.
df_tf = pd.DataFrame(data=X.toarray(), columns=feature_names)
df_tf.head()

Unnamed: 0,about,again,ah,ain,all,alone,always,am,an,and,...,woman,won,world,would,wrong,ya,yeah,yes,you,your
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.103702,...,0.0,0.0,0.0,0.0,0.0,0.172737,0.0,0.0,0.0,0.0
1,0.0,0.068674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.024364,...,0.0,0.235928,0.0,0.0,0.0,0.0,0.196661,0.0,0.095786,0.070782
2,0.0,0.0,0.0,0.0,0.0,0.0,0.050299,0.0,0.05114,0.072573,...,0.0,0.0,0.0,0.0,0.0,0.0,0.109836,0.0,0.053497,0.026355
3,0.0,0.0,0.0,0.765512,0.036863,0.0,0.0,0.0,0.0,0.039159,...,0.0,0.0,0.0,0.0,0.0,0.0,0.079019,0.0,0.038487,0.0
4,0.042855,0.0,0.0,0.0,0.024997,0.0,0.0,0.0,0.0,0.212431,...,0.0,0.042855,0.0,0.0,0.0,0.0,0.0,0.0,0.069596,0.205714


In [15]:
# Resolve the vocabulary once, outside the loop: the original call rebuilt the
# full list on every iteration. get_feature_names() was also removed in
# scikit-learn 1.2, so prefer get_feature_names_out() when it exists.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One node per word. The node id is the word's row index in X_, the label is the
# word itself and the weight is the sum of its TF-IDF values over all songs.
words_count = X_.shape[0]
for word_id in range(words_count):
    word = str(feature_names[word_id])  # plain str, also for numpy string scalars
    weight = np.sum(row_vector(word_id, X_))
    G.add_node(word_id, weight=weight, label=word)

### Edges

In [16]:
# Two words are only connected when the dot product of their TF-IDF vectors
# exceeds this threshold; weaker pairs get no edge.
MIN_EDGE_WEIGHT = 0.01

# Densify the word-by-song matrix once. The pair loop below visits every word
# many times, and converting the same sparse row again for each pair is
# loop-invariant work.
word_vectors = X_.toarray()

words_count = X_.shape[0]
for a, b in combinations(range(words_count), 2):
    # Dot product of the two words' TF-IDF vectors over all songs.
    weight = np.sum(word_vectors[a] * word_vectors[b])
    if weight > MIN_EDGE_WEIGHT:
        G.add_edge(a, b, weight=weight)

In [17]:
# Export the graph as GraphML (file "lyrics_g2.graphml", relative to the
# current working directory) so that it can be opened in Gephi.
nx.write_graphml(G, "lyrics_g2.graphml", encoding='utf-8', prettyprint=False)