In [2]:
import pandas as pd
import numpy as np
import requests as rq
import bs4
import re
import tqdm
import time
import json
import math

# RNN
import tensorflow as tf
from tensorflow import keras
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model

pd.set_option("max.columns", 100)

This project is an attempt to predict a Pitchfork review rating (ranging from 0 to 10 in steps of 0.1) using an RNN trained on the review's content (with TensorFlow/Keras). To do that, I first collected all reviews from Pitchfork's website, cleaned the data, used 50-dimensional GloVe embeddings to transform words into vectors, and finally trained several models, choosing the one with the best performance.

#### 1) Get data from all reviews available on Pitchfork's website

In [None]:
# Define headers for API request (they mirror what a desktop Chrome browser sends)
headers = {
    'Accept': 'application/json',
    'Accept-Encoding': 'gzip, deflate, br',
    # Fixed header name: the original key was misspelled 'Accept-Kanguage',
    # so the language preference was never sent under a recognised header.
    'Accept-Language': 'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7',
    'Connection': 'keep-alive',
    'Host': 'pitchfork.com',
    'Referer': 'https://pitchfork.com/reviews/albums/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
}

In [None]:
# Function to get JSON review data
def get_reviews_json(url, start=0):
    """Fetch one page of review search results and return the decoded JSON.

    Parameters
    ----------
    url : str
        URL template containing ``{size}`` and ``{start}`` placeholders.
    start : int, default 0
        Offset of the first result to request.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within 30 seconds.

    Notes
    -----
    The page size comes from the module-level ``size`` variable and the
    request headers from the module-level ``headers`` dict.
    """
    url_search = url.format(size=size, start=start)
    # A timeout stops a stalled connection from hanging the whole scrape, and
    # raise_for_status reports HTTP errors directly instead of letting them
    # surface later as an opaque JSON decoding failure.
    resp = rq.get(url_search, headers=headers, timeout=30)
    resp.raise_for_status()
    return resp.json()

In [None]:
url = 'https://pitchfork.com/api/v2/search/?types=reviews&hierarchy=sections%2Freviews%2Falbums%2Cchannels%2Freviews%2Falbums&sort=publishdate%20desc%2Cposition%20asc&size={size}&start={start}'
size = 200

# Get total_count (and the first page of results) from the initial request
reviews_json = get_reviews_json(url)
total_count = pd.json_normalize(reviews_json)['count'][0]

# Collect one frame per page and concatenate once at the end; concatenating
# inside the loop re-copies all previously fetched rows on every iteration.
page_frames = [pd.json_normalize(pd.json_normalize(reviews_json)['results.list'][0])]

# Step by `size` (not a hard-coded 200) so the offsets stay in sync with the
# page size requested in the URL.
for i in tqdm.tqdm(range(size, total_count, size)):
    reviews_json = get_reviews_json(url, start=i)
    time.sleep(2)  # pause between requests to avoid hammering the server
    page_frames.append(pd.json_normalize(pd.json_normalize(reviews_json)['results.list'][0]))

reviews_raw_df = pd.concat(page_frames).reset_index(drop=True)

100%|██████████| 115/115 [13:13<00:00,  6.90s/it]


In [None]:
# Sanity check: (rows, columns) of the raw frame built from the API pages
reviews_raw_df.shape

(23091, 25)

Info collected for each review:
- URL
- Artists
- Album
- Music Label
- Genres
- Author
- Review
- Rating

In [None]:
# Build the working frame in one step: `url` and `album` are copied from the
# raw API data, every other column starts out as an empty string.
reviews_df = pd.DataFrame({
    'url': reviews_raw_df['url'].copy(),
    'artists': '',
    'album': reviews_raw_df['seoTitle'].copy(),
    'label': '',
    'genres': '',
    'author': '',
    'review': '',
    'rating': '',
})

In [None]:
# Function to access, save and return each review's content
def get_and_save_review_content(num, save=False, max_retries=None):
    """Download one review page and return its abstract followed by its body text.

    Parameters
    ----------
    num : int
        Row position in the module-level ``reviews_df``; the review's relative
        URL is read from the first column of that row.
    save : bool, default False
        When True, write the downloaded HTML to ``./Reviews/<name>.html``.
    max_retries : int or None, default None
        Maximum number of additional download attempts when the expected
        ``div`` elements are missing from the page. ``None`` keeps retrying
        until the page parses (the original behaviour).

    Returns
    -------
    str
        The abstract text, a single space, and the review body (cut at the
        first blank line when one is present).

    Raises
    ------
    RuntimeError
        If ``max_retries`` is set and the page still lacks the expected
        elements after that many retries.
    """
    relative_url = reviews_df.iloc[num, 0]
    url_review = 'https://pitchfork.com' + relative_url

    attempts = 0
    while True:
        resp = rq.get(url_review)
        # NOTE(review): no parser is named, so BeautifulSoup uses whichever one
        # it finds installed; pin one explicitly if results must be reproducible.
        parsed_html = bs4.BeautifulSoup(resp.text)
        description_tag = parsed_html.find('div', attrs={'class': 'review-detail__abstract'})
        content_tag = parsed_html.find('div', attrs={'class': re.compile(r"contents")})

        if description_tag is not None and content_tag is not None:
            break

        # If the attempt to access the page fails, wait and try again
        if max_retries is not None and attempts >= max_retries:
            raise RuntimeError(
                "Could not find review content at {} after {} retries".format(url_review, attempts)
            )
        attempts += 1
        time.sleep(5)

    # Save HTML (done after the retry loop so the file holds the response that
    # actually parsed, not a failed first attempt)
    if save:
        # Drop the leading 16 characters and the final character of the relative URL
        name = relative_url[16:-1]
        with open("./Reviews/{}.html".format(name), 'w+', encoding='utf-8') as output:
            output.write(resp.text)

    # Retrieve content from the tags already located above
    description = description_tag.get_text().strip()
    content_raw = content_tag.get_text().strip()

    # Remove irrelevant text: keep only what precedes the first blank line
    content = content_raw.split('\n\n', 1)[0]

    return description + " " + content

In [None]:
start = 0
for i in tqdm.tqdm(range(start, total_count)):

    # Raw columns are addressed by position below.
    # NOTE(review): confirm positions 0, 1, 12 and 24 against reviews_raw_df.columns.
    aux_df = pd.json_normalize(reviews_raw_df.iloc[i, 24])

    # Values are written with .at[row, column]; the chained form
    # reviews_df[col][i] = value may assign into a temporary copy.

    # Artists
    artists_raw = reviews_raw_df.iloc[i, 0]
    if artists_raw != []:
        artists = pd.json_normalize(artists_raw)['display_name'].str.cat(sep=' ')
        reviews_df.at[i, 'artists'] = artists

    # Label
    if not aux_df.empty:
        labels_raw = pd.json_normalize(aux_df['labels_and_years'][0])['labels'][0]
        if labels_raw != []:
            label = pd.json_normalize(labels_raw)['name'].str.cat(sep=' ')
            reviews_df.at[i, 'label'] = label

    # Genres
    genres_raw = reviews_raw_df.iloc[i, 1]
    if genres_raw != []:
        genres = pd.json_normalize(genres_raw)['slug'].str.cat(sep=' ')
        reviews_df.at[i, 'genres'] = genres

    # Author
    author = pd.json_normalize(reviews_raw_df.iloc[i, 12])['name'].str.cat(sep=' ')
    reviews_df.at[i, 'author'] = author

    # Review (downloads and parses the review page)
    review = get_and_save_review_content(num=i)
    reviews_df.at[i, 'review'] = review

    # Rating
    if not aux_df.empty:
        rating = aux_df['rating.rating'][0]
        reviews_df.at[i, 'rating'] = rating

In [None]:
# One pipeline: remove duplicate rows, turn empty / whitespace-only cells into
# NaN, then keep only rows that have both a review text and a rating.
reviews_df = (
    reviews_df
    .drop_duplicates()
    .reset_index(drop=True)
    .replace(r'^\s*$', np.nan, regex=True)
    .dropna(subset=['review', 'rating'], how='any', axis=0)
    .reset_index(drop=True)
)

# Persist the assembled frame
reviews_df.to_csv('./reviews_df.csv', encoding='utf-8')

In [None]:
# Column dtypes and non-null counts of the assembled frame
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23088 entries, 0 to 23090
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      23088 non-null  object
 1   artists  22311 non-null  object
 2   album    23079 non-null  object
 3   label    23050 non-null  object
 4   genres   20757 non-null  object
 5   author   23088 non-null  object
 6   review   23088 non-null  object
 7   rating   23088 non-null  object
dtypes: object(8)
memory usage: 1.6+ MB


#### 2) Clean review data

In [None]:
# Function to remove extra whitespace
def remove_whitespace(series):
    """Collapse whitespace in every string of a pandas Series.

    Newlines become spaces, runs of spaces are squeezed to a single space,
    and leading/trailing whitespace is stripped.

    Parameters
    ----------
    series : pandas.Series
        Series of strings.

    Returns
    -------
    pandas.Series
        A new Series with the cleaned strings.
    """
    series = series.str.replace("\n", " ", regex=False)
    # regex=True must be explicit: since pandas 2.0 the default is a literal
    # match, which would look for the two characters " +" and collapse nothing.
    series = series.str.replace(" +", " ", regex=True)
    series = series.str.strip()
    return series

# Function to remove uppercase and specific punctuation
def normalize(series):
    """Lower-case every string of a pandas Series and blank out punctuation.

    Each character (or escape sequence) listed in the pattern below is
    replaced by a single space; the result is then stripped.

    Parameters
    ----------
    series : pandas.Series
        Series of strings.

    Returns
    -------
    pandas.Series
        A new Series with the normalized strings.
    """
    series = series.str.lower()
    # The pattern is an alternation, so it has to be applied as a regular
    # expression; regex=True is explicit because pandas >= 2.0 defaults to a
    # literal match, which would leave the text untouched.
    series = series.str.replace(r"\xa0|\\xbd|\\|\/|\"|\“|\”|\-|\,|\—|\;|\:|\.|\?|\!|\(|\)|\_|\*", " ", regex=True)
    series = series.str.strip()
    return series

In [None]:
# Cleaning order: squeeze whitespace, normalize case and punctuation, then
# squeeze again because normalization inserts extra spaces.
review_text = remove_whitespace(reviews_df['review'])
review_text = normalize(review_text)
reviews_df['review'] = remove_whitespace(review_text)

# Save clean df version
reviews_df.to_csv('./reviews_clean_df.csv', encoding='utf-8')

#### 3) Load data (whenever needed) and explore it a little



In [28]:
reviews_df = pd.read_csv('./reviews_clean_df.csv', encoding='utf-8', index_col=0)

# Model input: review text, forced to str
X = reviews_df['review'].astype(str).copy()

# Targets: ratings cast to int, giving the integer class labels used with the
# softmax (multi-class classification) output.
# For a sigmoid (regression) output one would instead use:
#   Y = reviews_df['rating'].copy()/10
ratings = reviews_df['rating']
Y_explore = ratings.astype(int)
Y = np.asarray(Y_explore)

In [32]:
# Share of reviews (in percent) falling into each integer rating class
Y_explore.value_counts()*100/len(Y_explore)

7     42.370165
6     22.224629
8     17.706935
5      8.558929
4      3.378525
9      2.585871
3      1.593971
2      0.762334
10     0.498116
1      0.203578
0      0.116949
Name: rating, dtype: float64

There are very few observations with low ratings. That means the RNN possibly won't be able to learn the particularities of these types of reviews and will rarely predict ratings in this lower range.

#### 4) Word embedding with GloVe 50D (with review padding whenever necessary)

In [4]:
# Function to read GloVe file and define useful dictionaries
def read_glove_vecs(glove_file):
    """Parse a GloVe text file into lookup dictionaries.

    Each non-blank line is expected to hold a word followed by its vector
    components, separated by whitespace.

    Parameters
    ----------
    glove_file : str
        Path to the GloVe text file (read as UTF-8).

    Returns
    -------
    words_to_index : dict
        Word -> index, with indices starting at 1 and assigned in sorted
        word order.
    index_to_words : dict
        Inverse mapping, index -> word.
    word_to_vec_map : dict
        Word -> ``numpy.ndarray`` of dtype float64 holding its vector.
    """
    word_to_vec_map = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Skip blank lines (e.g. a trailing newline at the end of the
                # file) instead of failing on fields[0].
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    words_to_index = {}
    index_to_words = {}
    # Indices start at 1 and follow sorted word order
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words_to_index, index_to_words, word_to_vec_map

# Build the lookup dictionaries from the local GloVe file (it must exist at this relative path)
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('./glove.6B.50d.txt')

Where:  
- word_to_index: dictionary mapping from words to their indices in the vocabulary 
- index_to_word: dictionary mapping from indices to their corresponding words in the vocabulary
- word_to_vec_map: dictionary mapping words to their GloVe vector representation.

In [5]:
# Function to set maximum review length. If max_possible=True, use number of words of the longest review
def set_max_review_length(n, max_possible=False, reviews=None):
    """Choose the maximum review length (in words).

    Parameters
    ----------
    n : int
        Length returned when ``max_possible`` is False.
    max_possible : bool, default False
        When True, ignore ``n`` and return the largest word count found
        among the reviews.
    reviews : iterable of str, optional
        Reviews to measure. Defaults to the module-level ``X``.

    Returns
    -------
    int
        The maximum review length to use.
    """
    if not max_possible:
        return n
    if reviews is None:
        reviews = X
    # Compare word counts directly. Picking the review with the most characters
    # and then counting its words (the previous approach) can under-count,
    # because the longest string is not necessarily the one with the most words.
    return max(len(review.split()) for review in reviews)

For different models, I used different maximum review lengths

In [6]:
# Function that processes an array of sentences (X) and returns inputs (lists of indices) to the embedding layer
def sentences_to_indices(X, word_to_index, max_review_length, begin_end=False):
    """Convert sentences into fixed-width rows of vocabulary indices.

    Parameters
    ----------
    X : sequence of str
        Sentences (numpy array, pandas Series or list); words are obtained
        with ``str.split()``.
    word_to_index : dict
        Word -> index mapping. Words missing from it map to the index of
        ``'<UNK>'`` (or to 0 when the mapping has no ``'<UNK>'`` entry).
    max_review_length : int
        Number of columns of the result; longer sentences are truncated and
        shorter ones are right-padded with zeros.
    begin_end : bool, default False
        When True, a sentence longer than ``max_review_length`` keeps its
        first and last ``max_review_length // 2`` words instead of only its
        beginning.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(X), max_review_length)``.
    """
    m = len(X)  # number of examples
    X_indices = np.zeros((m, max_review_length))

    # Look the fallback index up once rather than for every word
    unk_index = word_to_index.get('<UNK>', 0)
    half = int(max_review_length / 2)

    # Iterate over the values rather than indexing X[i], which is label-based
    # on a pandas Series and fails when the index is not 0..m-1.
    for i, sentence in enumerate(X):
        sentence_words = sentence.split()
        # Only splice when the sentence is too long; splicing a short sentence
        # would concatenate it with itself and duplicate its words.
        if begin_end and len(sentence_words) > max_review_length:
            sentence_words = sentence_words[:half] + sentence_words[-half:]

        for j, w in enumerate(sentence_words[:max_review_length]):
            X_indices[i, j] = word_to_index.get(w, unk_index)

    return X_indices

In [None]:
# Test of function above
X1 = np.array([X[0], X[1]])
# `max_review_length` is only assigned in the modeling section further down,
# so the length is derived here to keep this cell runnable top to bottom
# after a kernel restart.
X1_indices = sentences_to_indices(X1, word_to_index, set_max_review_length(1500))
X1_indices

array([[357267., 148909., 223945., ...,      0.,      0.,      0.],
       [357267.,  17044., 393303., ...,      0.,      0.,      0.]])

#### 5) Modeling

Split data into train, dev and test sets

In [7]:
# Define maximum review length and size of each split
max_review_length = set_max_review_length(1500, False)
X_indices = sentences_to_indices(X, word_to_index, max_review_length, begin_end=False)

num_train = 21000
num_dev = 1000
num_test = len(X) - num_train - num_dev

# Contiguous, unshuffled partitions: the first num_train rows are the train
# set, the next num_dev rows the dev set, and everything after is the test set.
dev_end = num_train + num_dev
train_part = slice(None, num_train)
dev_part = slice(num_train, dev_end)
test_part = slice(dev_end, None)

X_train, X_dev, X_test = (X[part].copy() for part in (train_part, dev_part, test_part))
Y_train, Y_dev, Y_test = (Y[part].copy() for part in (train_part, dev_part, test_part))
X_indices_train, X_indices_dev, X_indices_test = (
    X_indices[part].copy() for part in (train_part, dev_part, test_part)
)

print(len(X_train), len(X_dev), len(X_test))

21000 1000 1087


Create embedding layer (function)

In [None]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """Create a frozen Keras ``Embedding`` layer loaded with the given word vectors.

    Parameters
    ----------
    word_to_vec_map : dict
        Word -> vector; the vector size is read from the entry for ``"the"``.
    word_to_index : dict
        Word -> row index in the embedding matrix.

    Returns
    -------
    Embedding
        Non-trainable layer whose weight matrix has ``len(word_to_index) + 1``
        rows; rows not referenced by ``word_to_index`` stay all-zero.
    """
    # One extra row on top of the vocabulary size (Keras embedding requirement)
    n_rows = len(word_to_index) + 1
    n_dims = word_to_vec_map["the"].shape[0]

    weights = np.zeros((n_rows, n_dims))
    for token in word_to_index:
        weights[word_to_index[token], :] = word_to_vec_map[token]

    layer = Embedding(input_dim=n_rows, output_dim=n_dims, trainable=False)
    # The layer has to be built before its weights can be assigned
    layer.build((None,))
    layer.set_weights([weights])

    return layer

Convert labels to one-hot vectors (function)

In [8]:
def convert_to_one_hot(Y, C):
    """One-hot encode integer labels.

    ``Y`` is flattened, and each label selects the matching row of a
    ``C`` x ``C`` identity matrix, giving an array with one row per label
    and ``C`` columns.
    """
    flat_labels = Y.reshape(-1)
    identity = np.eye(C)
    return identity[flat_labels]

Build model (function)

In [None]:
def review_model(input_shape, word_to_vec_map, word_to_index):
    """Assemble the rating classifier as a Keras ``Model``.

    Architecture: int32 index input -> pretrained embedding layer -> three
    LSTM layers (128 units each), each followed by Dropout(0.5) -> Dense(11)
    -> softmax. Only the last LSTM returns a single vector instead of the
    full sequence.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input example, passed to ``Input``.
    word_to_vec_map, word_to_index : dict
        Forwarded to ``pretrained_embedding_layer``.

    Returns
    -------
    Model
        The (uncompiled) Keras model.
    """
    sentence_indices = Input(shape=input_shape, dtype='int32')
    hidden = pretrained_embedding_layer(word_to_vec_map, word_to_index)(sentence_indices)

    # The first two LSTMs emit the whole sequence for the next recurrent layer;
    # the last one emits only its final output. Dropout's rate is the
    # probability of zeroing a node.
    for keep_sequence in (True, True, False):
        hidden = LSTM(units=128, return_sequences=keep_sequence)(hidden)
        hidden = Dropout(rate=0.5)(hidden)

    scores = Dense(units=11)(hidden)
    probabilities = Activation('softmax')(scores)

    return Model(inputs=sentence_indices, outputs=probabilities)

Set hyperparameters and specify model

In [None]:
# Learning rate decay schedule
# NOTE(review): lr_schedule is defined but never handed to the optimizer below,
# which uses a fixed learning rate of 0.005. Pass learning_rate=lr_schedule if
# the decay is meant to be active.
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-2,
    decay_steps=10000,
    decay_rate=0.9)
batch_size = 128
opt = Adam(learning_rate=0.005, beta_1=0.9, beta_2=0.999, epsilon=1e-07)
loss = 'categorical_crossentropy'
# The deprecated `period=1` argument was dropped: checking once per epoch is
# already the callback's default.
# NOTE(review): monitor='accuracy' is the training accuracy; switch to
# 'val_accuracy' if "best" should mean best on the dev set passed to fit().
checkpoint = ModelCheckpoint("./best_model_4.hdf5", monitor='accuracy', verbose=1,
    save_best_only=True, mode='auto')

def specify_model(model, opt, loss):
    """Build a network with the given builder and compile it.

    Parameters
    ----------
    model : callable
        Builder invoked as ``model((max_review_length,), word_to_vec_map,
        word_to_index)``; all three arguments are module-level names.
    opt
        Optimizer passed to ``compile``.
    loss
        Loss passed to ``compile``.

    Returns
    -------
    The compiled model, tracking the ``'accuracy'`` metric.
    """
    compiled = model((max_review_length,), word_to_vec_map, word_to_index)
    compiled.compile(loss=loss, optimizer=opt, metrics=['accuracy'])
    return compiled

Convert labels to one-hot vectors

In [9]:
# One-hot encode the labels of every split with the same number of classes (11)
Y_train_oh, Y_dev_oh, Y_test_oh = (
    convert_to_one_hot(labels, C=11) for labels in (Y_train, Y_dev, Y_test)
)

Train model

In [None]:
# Build and compile the network. This rebinds `review_model` from the builder
# function to the compiled model.
review_model = specify_model(review_model, opt, loss)

# Training options, gathered in one place
fit_options = dict(
    epochs=100,
    batch_size=batch_size,
    shuffle=True,
    verbose=1,
    callbacks=[checkpoint],
    validation_data=(X_indices_dev, Y_dev_oh),
)
review_model.fit(X_indices_train, Y_train_oh, **fit_options)

Evaluate model (on train, dev or test data)

In [10]:
# Load different model (if needed): replaces the in-memory `review_model` with one read from disk.
# Note this reads best_model_3.hdf5, whereas the checkpoint callback in this notebook writes best_model_4.hdf5.
review_model = load_model('./best_model_3.hdf5')

In [19]:
def evaluate_model(model, X_indices, Y, Y_oh):
    """Print error statistics of ``model`` on one data split.

    Parameters
    ----------
    model
        Object exposing ``predict(X_indices)`` (returning one row of class
        scores per example) and ``evaluate(X_indices, Y_oh)``.
    X_indices : array-like
        Model inputs.
    Y : numpy.ndarray
        True integer labels, one per example.
    Y_oh : array-like
        One-hot version of ``Y``, passed to ``model.evaluate``.

    Returns
    -------
    None
        MSE, RMSE, the largest and smallest predicted label and the largest
        absolute error are printed.
    """
    pred = model.predict(X_indices)

    # Predicted label = position of the highest score in each row
    pred_labels = np.argmax(pred, axis=1)

    max_pred_rating = pred_labels.max()
    min_pred_rating = pred_labels.min()
    diff = np.absolute(Y - pred_labels)
    max_diff = np.max(diff)
    # Average over the examples actually evaluated. Dividing by the global
    # num_dev (as before) is only correct for the dev split.
    mse = np.mean(diff ** 2)
    rmse = np.sqrt(mse)
    # Evaluate the model that was passed in, not the global `review_model`.
    # The returned values are not used here; the call is kept from the
    # original implementation.
    model.evaluate(X_indices, Y_oh)
    print("MSE: {}\nRMSE: {}\nMax predicted rating: {}\nMin predicted rating: {}\nMax difference of ratings: {}".format(mse, rmse, max_pred_rating, min_pred_rating, max_diff))

In [None]:
# On train data: print the error statistics for the training split
evaluate_model(review_model, X_indices_train, Y_train, Y_train_oh)

In [None]:
# On dev data: print the error statistics for the dev split
evaluate_model(review_model, X_indices_dev, Y_dev, Y_dev_oh)

In [None]:
# On test data: print the error statistics for the held-out test split
evaluate_model(review_model, X_indices_test, Y_test, Y_test_oh)

Save model

In [None]:
# Write the current in-memory model to disk
review_model.save("./review_model.hdf5")