TO DO:
- version w/o lemmatizing?
- reusable item for importing into pipeline 

In [6]:
#imports and set your path 
import pandas as pd
import json
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import PlaintextCorpusReader, wordnet
path = '/Users/laurentfintoni/Desktop/University/COURSE DOCS/THESIS/Internship/musow-pipeline/'

Part 1: 
- extract needed info from MJI and musoW dumps 
- format, align and remove dupes
- bounce to text  

In [2]:
#read mji csv and turn into two column df for names and desc 
df_mji = pd.read_csv(path+'MJI/MJI_data.csv', keep_default_na=False, dtype='string')
df_mji_small = df_mji.iloc[:, [0, 5]].copy()
df_mji_small['Title'] = df_mji_small['Title'].str.lower().str.strip()
df_mji_small['Description'] = df_mji_small['Description'].str.lower().str.strip()

In [3]:
#read musow json dump and turn into df w/ same columns 
with open(path+'MUSOW/musow_name_desc_url_cat.json') as file:
    data = json.load(file)
    
musow_names = [result['name']['value'].strip().lower() for result in data['results']['bindings']]
musow_desc = [result['description']['value'].strip().lower() for result in data['results']['bindings']]
df_musow = pd.DataFrame(columns=['Title', 'Description'])
df_musow['Title'] = musow_names
df_musow['Description'] = musow_desc
df_musow = df_musow.astype('string')
df_musow.to_pickle(path+'KEYWORDS/musoW_keywords.pkl')

In [4]:
#remove duplicates from MJI set based on title field 
df_mji_small[~df_mji_small['Title'].isin(df_musow['Title'])].dropna()
df_mji_small.to_pickle(path+'KEYWORDS/MJI_keywords.pkl')

In [5]:
#save each df to a single text file for processing 
with open(path+'KEYWORDS/mji_corpus.txt', 'a') as f:
    dfAsString = df_mji_small.to_string(header=False, index=False)
    f.write(dfAsString)
with open(path+'KEYWORDS/musow_corpus.txt', 'a') as f:
    dfAsString = df_musow.to_string(header=False, index=False)
    f.write(dfAsString)

Part 2:
- text processing using nltk: remove punctuations, tokenize, remove nltk stopwords + custom list, lemmatize 
- get top 35 keywords 
- get bigrams in which each word is in the top 35 kws 

In [7]:
#set corpus variables + vars for cleaning 
mji_corpus = PlaintextCorpusReader(path+'KEYWORDS/mji_corpus.txt', '.*\.txt')
musow_corpus = PlaintextCorpusReader(path+'KEYWORDS/musow_corpus.txt', '.*\.txt')
stopwords = nltk.corpus.stopwords.words('english')
punct_tokenizer = nltk.RegexpTokenizer(r"\w+")
custom_stopwords = ['available', '000', 'including', 'also', 'includes', 'website', 'new', 'include', 'well', 'based', 'source', 'sources', 'contains', 'search']

In [8]:
#function to extract top 35 keywords and return them alongside bigrams that include two words within top 35
def keywords(input_corpus):
    string = input_corpus.raw('')
    tokenised = punct_tokenizer.tokenize(string)
    clean = [w for w in tokenised if w not in stopwords]
    clean_2 = [w for w in clean if w not in custom_stopwords]
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(w) for w in clean_2]
    freqdist_lem = nltk.FreqDist(lemmatized)
    most_common_lem = freqdist_lem.most_common(35)
    most_common_lem_list = []
    for t in most_common_lem:
        most_common_lem_list.append(t[0])
    bigrams = nltk.bigrams(lemmatized)
    freqdist_bg = nltk.FreqDist(bigrams)
    search_bigrams = []
    for k, v in freqdist_bg.items():
        if k[0] in most_common_lem_list:
            if k[1] in most_common_lem_list:
                if v > 5:
                    k = ' '.join(k)
                    search_bigrams.append([k, v])
    kw_pd = pd.DataFrame(columns=['Most Common KW', 'KW Freq', 'Bigrams', 'Bigram Freq'])
    kw_pd['Most Common KW'] = pd.Series([w[0] for w in most_common_lem])
    kw_pd['KW Freq'] = pd.Series([w[1] for w in most_common_lem])
    kw_pd['Bigrams'] = pd.Series([b[0] for b in search_bigrams])
    kw_pd['Bigram Freq'] = pd.Series([b[1] for b in search_bigrams])
    return kw_pd

In [13]:
#process each and pickle 
musowkw = keywords(musow_corpus)
musowkw.to_pickle(path+'KEYWORDS/musowkw.pkl')
mjikw = keywords(mji_corpus)
mjikw.to_pickle(path+'KEYWORDS/mjikw.pkl')


Part 3:
- Analyze results, differences 
- Create reusable item for needed results 

In [14]:
#see which set has unique bigrams? 
differences = musowkw['Bigrams'].compare(mjikw['Bigrams'])
differences

Unnamed: 0,self,other
0,early music,oral history
1,sheet music,history project
2,music collection,music oral
3,collection online,african american
4,music library,american music
5,library digital,audio video
6,digital score,sound archive
7,digital edition,country music
8,song dataset,digital collection
9,music information,archive feature


In [89]:
#concat both sets and see differences? 
musowkw['Source'] = 'musow'
mjikw['Source'] = 'mji'
different = pd.concat([musowkw, mjikw]).drop_duplicates(keep=False)
different

Unnamed: 0,Most Common KW,KW Freq,Bigrams,Bigram Freq,Source
21,file,65,collection music,6.0,musow
22,sound,62,sound recording,17.0,musow
23,material,58,musical score,7.0,musow
24,edition,57,digitized manuscript,6.0,musow
25,composer,57,collection digital,6.0,musow
26,information,56,music score,11.0,musow
27,dataset,56,manuscript score,6.0,musow
28,digitized,47,century music,9.0,musow
29,jazz,47,music database,15.0,musow
30,american,46,university library,8.0,musow
