Preparation of Pitchfork review data into OHCO (Ordered Hierarchy of Content Objects) format

Simran Batra

DS 5559

# Configuration

In [1]:
# Path of the SQLite database file that the final cell writes the tables to.
db_file = 'pitchfork.db'
# Regex used to split each review's Text into paragraphs (one per newline).
para_pat = r'\n'
# Regex matching runs of non-word characters and underscores; these are
# stripped from lower-cased tokens when term_str is built.
token_pat = r'([\W_]+)'

## Libraries

In [2]:
import glob
import sqlite3

import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import RegexpTokenizer
# Fetch the NLTK resources this notebook relies on. Per the log below, each
# call reports "already up-to-date" when the package is already installed.
# The value of the last call is what the cell displays as its output.
# NOTE(review): 'tagsets' and 'wordnet' are not referenced by any visible
# cell — confirm whether they are still needed.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('tagsets')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Simran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Simran\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Simran\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\Simran\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Simran\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Render matplotlib figures inline. No plotting call is visible in this
# notebook, so this only matters if plots are added later.
%matplotlib inline

# Preparing Data

## Loading the data

In [4]:
# Read the raw review export and discard its first column positionally,
# keeping the remaining review fields.
# NOTE(review): previews further down show words such as "Theres" and "Its"
# with no apostrophe, which may mean the file is really Windows-1252 rather
# than ISO-8859-1 — confirm the source encoding.
pitchfork = pd.read_csv("pitchfork.csv", encoding="ISO-8859-1").iloc[:, 1:]

In [5]:
# Preview the first rows of the freshly loaded frame.
pitchfork.head()

Unnamed: 0,Title,Artist,Score,Author,Genre,Date,Text
0,Girls,Yung Baby Tate,6.8,Michelle Kim,Rap,March 5 2019,"As a little girl, roleplaying is an essential ..."
1,"Wasteland, Baby!",Hozier,4.8,Sam Sodomsky,Rock,March 6 2019,Like a desperate magician guessing card after ...
2,Rap or Go to the League,2 Chainz,7.6,Sheldon Pearce,Rap,March 6 2019,2 Chainz formed his rap group Playaz Circle in...
3,American Love Call,Durand Jones & the Indications,6.6,Amanda Wicks,Pop/R&B,March 5 2019,Soul is likely not the first word that comes t...
4,Good at Falling,The Japanese House,7.5,Megan Buerger,Pop/R&B,March 4 2019,When Amber Bain began releasing music as the J...


## Setting up OHCO index

In [6]:
# Full index hierarchy: six review-metadata fields followed by the paragraph,
# sentence and token positions.
OHCO = ['Title', 'Artist', 'Score', 'Date','Genre', 'Author', 'para_num', 'sent_num', 'token_num']

# Each name below is the OHCO prefix that ends at (and includes) the level it
# is named after, e.g. AUTHOR is the first six levels, SENTS the first eight.
TITLE, ARTIST, SCORE, DATE, GENRE, AUTHOR, PARAS, SENTS = (
    OHCO[:depth] for depth in range(1, 9)
)

In [7]:
# Move the six metadata columns into a sorted MultiIndex.
# The guard keeps the cell safe to re-run (the columns are gone once they
# have become index levels) without swallowing the KeyError that a genuinely
# missing column would raise on the first run.
if list(pitchfork.index.names) != AUTHOR:
    pitchfork = pitchfork.set_index(AUTHOR).sort_index()

In [8]:
# Preview the frame after the metadata columns were moved into the index.
pitchfork.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Text
Title,Artist,Score,Date,Genre,Author,Unnamed: 6_level_1
"""Cassette"" EP",Viet Cong,7.8,July 7 2014,Rock,Nick Neyland,The Calgary-based band Women had the enviable ...
"""Evidence"" 12""",Carlos Giffoni,7.6,February 13 2012,Experimental,Nick Neyland,In Bill Brewster and Frank Broughton's Last Ni...
"""What Is This Heart?""",How to Dress Well,8.8,June 23 2014,Pop/R&B,Ian Cohen,How to Dress Well is a project that Tom Krell ...
"""Witchhunt Suite for WWIII""",Ariel Pink's Haunted Graffiti,7.8,September 23 2011,Experimental,Marc Masters,"Since it first went public in the early 2000s,..."
$,Mark Sultan,6.8,April 21 2010,Electronic,Stephen M. Deusner,Mark Sultan doesn't simply revive old sounds a...


### Create stopwords list 

In [9]:
# English stopword list from NLTK.
# NOTE(review): `sw` is reassigned in the "Define stopwords" section before
# it is read anywhere in the visible notebook, so this value looks unused.
sw = nltk.corpus.stopwords.words('english')

### Cleaning up text

In [10]:
# Surround em dashes and hyphens with spaces so the whitespace tokenizer used
# later emits them as separate tokens.
# regex=True is explicit because pandas 2.0 made literal matching the default,
# which would turn this into a silent no-op. The replacement is a raw string
# so that \g<1> is a group reference rather than an invalid escape sequence.
pitchfork['Text'] = pitchfork['Text'].str.replace(r"(—|-)", r' \g<1> ', regex=True)

## Getting paragraphs

In [11]:
# Split every review into paragraphs: one column per paragraph, then stack
# into one row per (review, paragraph position).
paras = (
    pitchfork['Text'].str.split(para_pat, expand=True)
    .stack()
    .to_frame()
    .rename(columns={0: 'para_str'})
)
paras.index.names = PARAS

# Normalize whitespace. regex=True is explicit because pandas 2.0 switched
# str.replace to literal matching by default, which would leave runs of
# whitespace untouched.
paras['para_str'] = (
    paras['para_str']
    .str.strip()
    .str.replace(r'\n', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
)

# Drop paragraphs that are empty after cleaning; para_num keeps its original
# positions, so gaps are possible.
paras = paras[~paras['para_str'].str.match(r'^\s*$')]

In [12]:
# Preview the first five paragraph rows.
paras.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,para_str
Title,Artist,Score,Date,Genre,Author,para_num,Unnamed: 7_level_1
"""Cassette"" EP",Viet Cong,7.8,July 7 2014,Rock,Nick Neyland,0,The Calgary - based band Women had the enviabl...
"""Cassette"" EP",Viet Cong,7.8,July 7 2014,Rock,Nick Neyland,1,Theres something welcoming and likeable about...
"""Cassette"" EP",Viet Cong,7.8,July 7 2014,Rock,Nick Neyland,2,Viet Cong's songs are complex puzzles. Its li...
"""Cassette"" EP",Viet Cong,7.8,July 7 2014,Rock,Nick Neyland,3,"The cover of Bauhaus Dark Entries, which so..."
"""Cassette"" EP",Viet Cong,7.8,July 7 2014,Rock,Nick Neyland,4,"What ""Cassette"" ultimately does is throw down ..."


## Getting sentences

In [13]:
# Sentence-split every paragraph with NLTK: each paragraph becomes a row of
# sentences, which is then stacked into one row per sentence.
sents = (
    paras['para_str']
    .apply(lambda para: pd.Series(nltk.sent_tokenize(para)))
    .stack()
    .to_frame()
    .rename(columns={0: 'sent_str'})
)
sents.index.names = SENTS

# The paragraph table is no longer needed; release it.
del paras

In [14]:
# Preview the first sentence rows.
sents.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,sent_str
Title,Artist,Score,Date,Genre,Author,para_num,sent_num,Unnamed: 8_level_1
"""Cassette"" EP",Viet Cong,7.8,July 7 2014,Rock,Nick Neyland,0,0,The Calgary - based band Women had the enviabl...
"""Cassette"" EP",Viet Cong,7.8,July 7 2014,Rock,Nick Neyland,0,1,"Both their albums, Women and Public Strain, ha..."
"""Cassette"" EP",Viet Cong,7.8,July 7 2014,Rock,Nick Neyland,0,2,"Women ended in acrimony and tragedy, but two o..."
"""Cassette"" EP",Viet Cong,7.8,July 7 2014,Rock,Nick Neyland,0,3,"""Cassette"" is their scrappy introduction to th..."
"""Cassette"" EP",Viet Cong,7.8,July 7 2014,Rock,Nick Neyland,0,4,"Now it's been buffed up and cleaned out, and w..."


## Getting tokens

In [15]:
# Whitespace tokenizer: with gaps=True the pattern describes the separators,
# so tokens are the text between whitespace runs. The pattern is a raw string
# because '\s' in a normal literal is an invalid escape sequence.
tokenizer = RegexpTokenizer(r'\s+', gaps=True)

In [16]:
# Tokenize and POS-tag every sentence: each sentence becomes a row of
# (token, tag) tuples, which is then stacked into one row per token.
tokens = (
    sents['sent_str']
    .apply(lambda sent: pd.Series(nltk.pos_tag(tokenizer.tokenize(sent))))
    .stack()
    .to_frame()
    .rename(columns={0: 'pos_tuple'})
)
tokens.index.names = OHCO

# Unpack the (token, tag) tuples into their own columns.
tokens['pos'] = tokens['pos_tuple'].apply(lambda pair: pair[1])
tokens['token_str'] = tokens['pos_tuple'].apply(lambda pair: pair[0])

# Use the keyword form: the positional axis argument (`drop(label, 1)`) was
# removed in pandas 2.0 and raises a TypeError there.
tokens = tokens.drop(columns='pos_tuple')

# The sentence table is no longer needed; release it.
del sents

In [17]:
# Add 0/1 flag columns:
#   punc - the token consists only of non-word characters / underscores
#   num  - the token contains at least one digit
tokens = tokens.assign(
    punc=lambda frame: frame['token_str'].str.match(r'^[\W_]*$').astype('int'),
    num=lambda frame: frame['token_str'].str.match(r'^.*\d.*$').astype('int'),
)

In [18]:
# Preview the first token rows with their POS tag and punc/num flags.
tokens.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,pos,token_str,punc,num
Title,Artist,Score,Date,Genre,Author,para_num,sent_num,token_num,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
"""Cassette"" EP",Viet Cong,7.8,July 7 2014,Rock,Nick Neyland,0,0,0,DT,The,0,0
"""Cassette"" EP",Viet Cong,7.8,July 7 2014,Rock,Nick Neyland,0,0,1,NNP,Calgary,0,0
"""Cassette"" EP",Viet Cong,7.8,July 7 2014,Rock,Nick Neyland,0,0,2,:,-,1,0
"""Cassette"" EP",Viet Cong,7.8,July 7 2014,Rock,Nick Neyland,0,0,3,VBN,based,0,0
"""Cassette"" EP",Viet Cong,7.8,July 7 2014,Rock,Nick Neyland,0,0,4,NN,band,0,0


## Vocab configuration

### Tag punctuation and numbers and extract vocab with minimal normalization

In [19]:
# Rows that are real words: neither pure punctuation nor containing a digit.
WORDS = (tokens.punc == 0) & (tokens.num == 0)

# Normalized term: lower-cased token with non-word characters removed.
# regex=True is explicit because pandas 2.0 treats the pattern literally by
# default. Rows outside WORDS keep NaN in term_str.
tokens.loc[WORDS, 'term_str'] = (
    tokens['token_str'].str.lower().str.replace(token_pat, '', regex=True)
)

# Vocabulary: one row per distinct term with its count `n`, sorted
# alphabetically. value_counts() skips the NaN terms of digit-bearing tokens.
# rename_axis / reset_index(name=...) name the columns explicitly, so the
# result does not depend on how value_counts labels its output (this changed
# in pandas 2.0).
vocab = (
    tokens.loc[tokens['punc'] == 0, 'term_str']
    .value_counts()
    .rename_axis('term_str')
    .reset_index(name='n')
    .sort_values('term_str')
    .reset_index(drop=True)
)
vocab.index.name = 'term_id'

  raw_cell, store_history, silent, shell_futures)


### Get priors for Vocab

In [20]:
# Prior probability of each term: its count divided by the total count.
vocab['p'] = vocab['n'].div(vocab['n'].sum())

### Add stems

In [21]:
# Attach the Porter stem of every vocabulary term.
stemmer = nltk.stem.porter.PorterStemmer()
vocab['port_stem'] = vocab['term_str'].apply(stemmer.stem)

### Define stopwords

In [22]:
# NLTK's English stopword list as a set; the next cell uses it to flag
# vocabulary terms.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [23]:
# Flag stopwords with 1 and every other term with 0.
# A direct membership test replaces the temporary lookup frame that was built
# from the (unordered) set, so there is no temp left to delete afterwards.
vocab['stop'] = vocab['term_str'].isin(stopwords).astype('int')

### Add term_ids to Tokens

In [24]:
# Look up each token's vocabulary id through a term_str -> term_id Series;
# tokens without a term (punctuation, numbers) get -1.
tokens['term_id'] = (
    tokens['term_str']
    .map(vocab.reset_index().set_index('term_str')['term_id'])
    .fillna(-1)
    .astype('int')
)

# Save to a SQLite database

In [25]:
# Persist the three tables to the SQLite file, replacing any existing ones.
# `with db:` only wraps the writes in a transaction (commit on success,
# rollback on error); it does not close the connection, so the close is done
# explicitly in `finally`.
db = sqlite3.connect(db_file)
try:
    with db:
        pitchfork.to_sql('doc', db, if_exists='replace', index=True)
        tokens.to_sql('token', db, if_exists='replace', index=True)
        vocab.to_sql('vocab', db, if_exists='replace', index=True)
finally:
    db.close()