In [36]:
import pandas as pd 
import numpy as np
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import os.path
import json

In [37]:
def read_data(path):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    Args:
        path: Location of the file, passed straight to ``pd.read_json``.

    Returns:
        The DataFrame produced by ``pd.read_json(path, lines=True)``.
    """
    return pd.read_json(path, lines=True)

def write_file(path, file, df):
    """Write a DataFrame to disk as JSON Lines (one JSON object per row).

    Args:
        path: Directory to write into; it is joined with ``file`` and must
            already exist (no directories are created here).
        file: Name of the output file inside ``path``.
        df: DataFrame whose rows are serialised with ``json.dumps``.

    Returns:
        None. The file is created or overwritten as a side effect.
    """
    # 'records' is the full orient name; the abbreviated 'record' spelling is
    # rejected by current pandas releases.
    records = df.to_dict(orient='records')
    pathname = os.path.join(path, file)
    # The context manager closes the handle even if serialising a row raises.
    with open(pathname, 'w') as outfile:
        for instance in records:
            outfile.write(json.dumps(instance) + '\n')

In [38]:
def remove_punctuation(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed.

    Args:
        text: The string to clean.

    Returns:
        A new string made of the remaining characters, in their original order.
    """
    kept_chars = (char for char in text if char not in string.punctuation)
    return "".join(kept_chars)

def remove_stopwords(text):
    """Drop every token that appears in the module-level ``stopwords`` collection.

    Args:
        text: Iterable of tokens.

    Returns:
        A list of the tokens that are not stop words, in their original order.
    """
    kept_tokens = []
    for token in text:
        if token in stopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens

def lemmatizer(text):
    """Lemmatize each token with the module-level ``wordnet_lemmatizer``.

    Args:
        text: Iterable of tokens.

    Returns:
        A list holding ``wordnet_lemmatizer.lemmatize(token)`` for every token,
        in input order.
    """
    return list(map(wordnet_lemmatizer.lemmatize, text))

# English stop-word list from the NLTK corpus; read as a global by
# remove_stopwords().
# NOTE(review): this needs the NLTK 'stopwords' corpus to be installed; no
# download step is visible in this notebook — confirm the environment has it.
stopwords = nltk.corpus.stopwords.words('english')
# Single shared lemmatizer instance; read as a global by lemmatizer().
wordnet_lemmatizer = WordNetLemmatizer()

def get_relevant_words(text):
    """Keep only adjectives, adverbs, nouns and verbs from POS-tagged tokens.

    Args:
        text: Iterable of ``(word, tag)`` pairs, e.g. the output of
            ``nltk.pos_tag``.

    Returns:
        A list of the words whose tag starts with one of the accepted
        prefixes, in input order.
    """
    # JJ*/RB*/NN*/VB* are the Penn Treebank families for adjectives, adverbs,
    # nouns and verbs. The previous filter tested for "VERB", which no Penn tag
    # starts with, so verbs were silently dropped. "VERB" is still accepted so
    # input tagged with the universal tagset behaves as before.
    relevant_prefixes = ("JJ", "RB", "NN", "VB", "VERB")
    return [word for (word, tag) in text if tag.startswith(relevant_prefixes)]

In [39]:
def get_unigrams(df):
    """Expose the ``text_tokens`` column under the name ``unigrams``.

    Args:
        df: DataFrame that already has a ``text_tokens`` column.

    Returns:
        The same DataFrame object, with ``unigrams`` added or overwritten.
    """
    token_column = df['text_tokens']
    df['unigrams'] = token_column
    return df
    
def get_bigrams(df):
    """Add a ``sentiment_bigrams`` column built from ``sentiment_unigrams``.

    Args:
        df: DataFrame with a ``sentiment_unigrams`` column of token sequences.

    Returns:
        The same DataFrame object; each ``sentiment_bigrams`` cell is the list
        of 2-grams that ``nltk.ngrams`` yields for that row's tokens.
    """
    def _to_bigrams(tokens):
        # Materialise the n-gram generator so the cell stores a concrete list.
        return list(nltk.ngrams(tokens, 2))

    df['sentiment_bigrams'] = df['sentiment_unigrams'].apply(_to_bigrams)
    return df

def get_sentiment(df):
    """Add a ``sentiment_unigrams`` column with sentiment-suffixed tokens.

    For each row, every token in ``unigrams`` gets ``'_pos'`` appended when
    ``sentiment`` equals 1 and ``'_neg'`` when it equals 0. For any other
    sentiment value the tokens are copied unchanged.

    Args:
        df: DataFrame with a ``sentiment`` column and a ``unigrams`` column of
            token lists.

    Returns:
        The same DataFrame object, with ``sentiment_unigrams`` added as a flat
        list of tokens per row.
    """
    tagged_rows = []
    # Walk the two columns positionally. Looking rows up with
    # df['sentiment'][i] for i in range(len(df)) is label-based and breaks as
    # soon as the index has gaps (for example after dropna()).
    for label, tokens in zip(df['sentiment'], df['unigrams']):
        if label == 1:
            tagged_rows.append([token + '_pos' for token in tokens])
        elif label == 0:
            tagged_rows.append([token + '_neg' for token in tokens])
        else:
            tagged_rows.append(list(tokens))
    # A plain list is assigned by position, so it lines up with the rows
    # regardless of the index labels.
    df['sentiment_unigrams'] = tagged_rows
    return df
    
def flatten_list(df, column):
    """Flatten one level of nesting in every cell of ``column``.

    Args:
        df: DataFrame holding the column.
        column: Name of a column whose cells are iterables of iterables.

    Returns:
        The same DataFrame object; each cell of ``column`` is replaced by a
        flat list of the inner items, in their original order.
    """
    df[column] = [
        [element for group in nested for element in group]
        for nested in df[column]
    ]
    return df

In [40]:
def preprocessing(path):
    """Load a JSON Lines dataset and derive token columns from its text.

    Steps: drop rows with missing values, strip punctuation and lower-case
    ``text``, tokenise, remove stop words, lemmatize, then add the
    sentiment-suffixed unigram and bigram columns.

    Args:
        path: Location of a JSON Lines file whose records provide at least the
            ``text`` and ``sentiment`` fields.

    Returns:
        A DataFrame with the cleaned ``text`` column plus ``unigrams``,
        ``sentiment_unigrams`` and ``sentiment_bigrams``.
    """
    data = read_data(path)
    # Renumber the rows after dropping incomplete ones so later steps never
    # see gaps in the index.
    data = data.dropna().reset_index(drop=True)
    # NOTE(review): punctuation is stripped before stop-word filtering, so
    # contractions such as "don't" become "dont" — confirm the stop-word list
    # still matches them as intended.
    data['text'] = data['text'].apply(lambda raw: remove_punctuation(raw).lower())
    # Tokenise the column directly; a row-wise DataFrame.apply is not needed
    # to read a single column.
    data['unigrams'] = data['text'].apply(nltk.word_tokenize)
    data['unigrams'] = data['unigrams'].apply(remove_stopwords)
    data['unigrams'] = data['unigrams'].apply(lemmatizer)
    data = get_sentiment(data)
    data = get_bigrams(data)
    return data

In [41]:
# Preprocess the three single-letter source files (B, E, P). Each call returns
# a DataFrame with the 'unigrams', 'sentiment_unigrams' and
# 'sentiment_bigrams' columns added by preprocessing().
source_B = preprocessing('data/extracted/source_B.json')
source_E = preprocessing('data/extracted/source_E.json')
source_P = preprocessing('data/extracted/source_P.json')

In [42]:
# Preprocess the two-letter source files.
# NOTE(review): presumably pairwise combinations of the B, E and P sources —
# confirm against whatever produced data/extracted/.
source_BP = preprocessing('data/extracted/source_BP.json')
source_EP = preprocessing('data/extracted/source_EP.json')
source_BE = preprocessing('data/extracted/source_BE.json')

In [43]:
# Preprocess the three-letter source file.
# NOTE(review): presumably the B, E and P sources combined — confirm.
source_BEP = preprocessing('data/extracted/source_BEP.json')

In [44]:
# Preprocess the music files (target and source) with the same pipeline as
# the other datasets.
target_music = preprocessing('data/extracted/target_music.json')
source_music = preprocessing('data/extracted/source_music.json')

In [45]:
# Persist every processed source frame as JSON Lines under data/processed/,
# reusing the input file names. write_file() opens the target directly, so
# the data/processed/ directory has to exist before this cell runs.
write_file('data/processed/', 'source_B.json', source_B)
write_file('data/processed/', 'source_E.json', source_E)
write_file('data/processed/', 'source_P.json', source_P)

write_file('data/processed/', 'source_BP.json', source_BP)
write_file('data/processed/', 'source_EP.json', source_EP)
write_file('data/processed/', 'source_BE.json', source_BE)

write_file('data/processed/', 'source_BEP.json', source_BEP)

In [46]:
# Persist the processed music frames alongside the other outputs in
# data/processed/ (same JSON Lines format written by write_file()).
write_file('data/processed/', 'target_music.json', target_music)
write_file('data/processed/', 'source_music.json', source_music)

------------