In [1]:
import pandas as pd
import numpy as np
import nltk
import json 
import os.path

In [17]:
def read_data(path):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    Parameters
    ----------
    path : path of the file, forwarded unchanged to ``pd.read_json``.

    Returns
    -------
    pandas.DataFrame with one row per line of the file.
    """
    return pd.read_json(path, lines=True)

def write_file(path, file, text):
    """Write an iterable of JSON-serialisable objects as a JSON Lines file.

    Parameters
    ----------
    path : directory the file is written into.
    file : file name, joined onto ``path``.
    text : iterable of objects; each one is serialised with ``json.dumps``
        and written on its own line.

    Any existing file at the target location is overwritten (mode ``'w'``).
    """
    pathname = os.path.join(path, file)
    # The context manager closes the handle even when json.dumps raises on a
    # non-serialisable instance, which the manual open/close pair did not.
    with open(pathname, 'w') as outfile:
        for instance in text:
            outfile.write(json.dumps(instance) + '\n')
    
def flatten_list(df, column):
    """Flatten each cell of ``column`` by one nesting level.

    Every cell is expected to be an iterable of iterables; the inner items are
    concatenated, in order, into a single list per row.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned.
    """
    df[column] = [
        [item for group in cell for item in group]
        for cell in df[column]
    ]
    return df

def tupleToList(df, column):
    """Return the cells of ``column`` with every inner element converted to a list.

    Each cell is an iterable of iterables (e.g. a list of tuples). The frame
    itself is not modified; a new list with one entry per row is returned.
    """
    return [[list(element) for element in cell] for cell in df[column]]

def listToTuples(df, column):
    """Convert every inner element of each cell in ``column`` to a tuple.

    Each cell is an iterable of iterables (e.g. a list of lists). The column
    is overwritten on the frame that was passed in, and that same frame is
    returned.
    """
    converted = [[tuple(element) for element in cell] for cell in df[column]]
    df[column] = converted
    return df

def listOfTuplesToListofList(df, column):
    """Convert every inner element of each cell in ``column`` to a list.

    Each cell is an iterable of iterables (e.g. a list of tuples). The column
    is overwritten on the frame that was passed in, and that same frame is
    returned.
    """
    as_lists = [[list(element) for element in cell] for cell in df[column]]
    df[column] = as_lists
    return df

def listToString(df, column):
    """Render each cell of ``column`` (a list of strings) as one cleaned string.

    For every row the strings are joined with ``', '``, whitespace-separated
    tokens that contain any digit are dropped, the remaining tokens are joined
    with single spaces, and finally every comma is removed.

    Returns a new list with one string per row; the frame is not modified.
    """
    def clean(parts):
        joined = ', '.join(parts)
        kept = [word for word in joined.split() if not any(map(str.isdigit, word))]
        return ' '.join(kept).replace(',', '')

    return [clean(parts) for parts in df[column]]

def remove_sentiment(df, column):
    """Strip a trailing ``_pos`` / ``_neg`` marker from every string in ``column``.

    Each cell is an iterable of strings; strings ending in either marker lose
    their last four characters, all others are kept unchanged. The column is
    overwritten on the frame that was passed in, and that same frame is
    returned.
    """
    markers = ('_pos', '_neg')
    stripped = []
    for cell in df[column]:
        cleaned = []
        for token in cell:
            if token.endswith(markers):
                cleaned.append(token[:-4])
            else:
                cleaned.append(token)
        stripped.append(cleaned)
    df[column] = stripped
    return df

def get_unigramglossary_list(path):
    """Load a unigram glossary file and return its rows as a list of lists.

    The file is read with ``read_data`` and each DataFrame row becomes one
    inner list, in column order.
    """
    return read_data(path).values.tolist()

def get_bigramglossary_list(path):
    """Load a bigram glossary file and return its rows as a list of lists.

    The file is read with ``read_data``. Each element of the ``similar``
    column and each value of the ``word`` column is converted to a tuple
    before the rows are returned in column order.

    Parameters
    ----------
    path : location of the glossary file (JSON Lines).

    Returns
    -------
    list of rows, one inner list per glossary entry.
    """
    glossary = read_data(path)
    # Convert the frame that was just loaded. The previous code passed the
    # unrelated global `thesaurus_list` here, which either raised NameError or
    # silently processed stale data from an earlier cell.
    glossary = listToTuples(glossary, 'similar')
    # Tuples are hashable, so bigrams can be placed in sets when matching.
    glossary['word'] = [tuple(i) for i in glossary['word']]
    return glossary.values.tolist()

In [18]:
def count_matches(a, b):
    """Return how many distinct elements occur in both ``a`` and ``b``."""
    shared = set(a).intersection(set(b))
    return len(shared)

def get_index_glossary(text, glossary=None):
    """Score ``text`` against every glossary entry and keep the five best.

    For each entry, the score is the number of distinct items shared between
    ``text`` and ``entry[1]``, divided by ``len(text)`` and multiplied by 100.
    Texts with zero or one item are divided by 1 instead, which also avoids a
    division by zero.

    Parameters
    ----------
    text : iterable of hashable items (tokens or bigram tuples).
    glossary : optional sequence of entries whose element at position 1 is the
        collection compared against ``text``. When omitted, the notebook-level
        global ``glossary_list`` is used, which matches the original behaviour.

    Returns
    -------
    dict mapping glossary index (numpy integer) to score, ordered from the
    highest score down, with at most five entries.
    """
    if glossary is None:
        # Backward-compatible default: existing callers rely on the global.
        glossary = glossary_list

    # Build the set once instead of once per glossary entry.
    text_items = set(text)
    divisor = len(text) if len(text) > 1 else 1

    res = [
        (len(text_items & set(entry[1])) / divisor) * 100
        for entry in glossary
    ]

    top_5_idx = np.argsort(res)[::-1][:5]
    # The indices are unique, so a direct mapping is equivalent to the former
    # remove/break loop that paired each index with its score.
    return {idx: res[idx] for idx in top_5_idx}



In [19]:
def get_word_matches(df, column, threshold, glossary):
    """Translate per-row match scores into the glossary words they point at.

    Each cell of ``column`` is a dict mapping a glossary index (anything
    ``int()`` accepts) to a score. An index whose score is at least
    ``threshold`` contributes ``glossary[index][0]``; every other index is
    recorded with the placeholder ``'None'``. Placeholders, and any glossary
    word equal to ``'None'``, are then dropped.

    Returns a list with one list of words per row; the frame is not modified.
    """
    proposals = []
    for scores in df[column]:
        picked = {}
        for raw_index, score in scores.items():
            index = int(raw_index)
            # The glossary is only consulted for scores that pass the threshold.
            picked[index] = glossary[index][0] if score >= threshold else 'None'
        proposals.append([word for word in picked.values() if word != 'None'])
    return proposals

In [20]:
def potential_unigrams(path1, threshold):
    """Load the data at ``path1`` and attach unigram glossary proposals.

    Adds two columns to the loaded frame:
    ``matches`` holds the result of ``get_index_glossary`` for each row's
    ``sentiment_unigrams``, and ``proposed_unigrams`` holds the glossary words
    whose score reaches ``threshold``.

    Uses the notebook-level global ``glossary_list`` and prints progress
    messages. Returns the augmented frame.
    """
    frame = read_data(path1)

    print('Finding Matches ....')
    frame['matches'] = frame['sentiment_unigrams'].apply(get_index_glossary)

    print('Finding Proposed Unigrams ...')
    proposed = get_word_matches(frame, 'matches', threshold, glossary_list)
    frame['proposed_unigrams'] = proposed
    return frame

def potential_bigrams(df, threshold):
    """Attach bigram glossary proposals to ``df``.

    The ``sentiment_bigrams`` column is first converted so each bigram is a
    tuple. Two columns are then added: ``matches_bigrams`` holds the result of
    ``get_index_glossary`` for each row, and ``proposed_bigrams`` holds the
    glossary entries whose score reaches ``threshold``.

    Uses the notebook-level global ``glossary_list`` and prints progress
    messages. The frame that was passed in is modified and returned.
    """
    df = listToTuples(df, 'sentiment_bigrams')

    print('Finding matches bigrams....')
    df['matches_bigrams'] = df['sentiment_bigrams'].apply(get_index_glossary)

    print('Finding proposed bigrams ...')
    proposed = get_word_matches(df, 'matches_bigrams', threshold, glossary_list)
    df['proposed_bigrams'] = proposed
    return df

In [21]:
def get_training_instances(df):
    """Build the final training records from a frame scored for uni- and bigrams.

    Steps:
    1. Drop the intermediate ``matches`` and ``matches_bigrams`` columns.
    2. Turn each proposed bigram into a list and flatten the bigrams of a row
       into one list of words.
    3. Concatenate ``unigrams``, ``proposed_bigrams`` and ``proposed_unigrams``
       per row into ``train``, strip ``_pos`` / ``_neg`` suffixes, and render
       the result as a single string.

    Parameters
    ----------
    df : pandas.DataFrame containing the columns named above.

    Returns
    -------
    list of dicts, one per row.

    Raises
    ------
    KeyError if either intermediate column is missing.
    """
    # drop() returns a new frame, so the caller's frame keeps its columns.
    df = df.drop(['matches', 'matches_bigrams'], axis=1)
    df = listOfTuplesToListofList(df, 'proposed_bigrams')
    df = flatten_list(df, 'proposed_bigrams')
    # Adding object columns of lists concatenates the lists row by row.
    df['train'] = df['unigrams'] + df['proposed_bigrams'] + df['proposed_unigrams']
    df = remove_sentiment(df, 'train')
    df['train'] = listToString(df, 'train')
    # Use the full orient name: the abbreviated 'record' is rejected by newer
    # pandas releases.
    return df.to_dict(orient='records')

#### Unigrams

In [22]:
# Rebind the global `glossary_list` (read by get_index_glossary and
# potential_unigrams) to the music unigram glossary, then score the target
# data against it. Threshold 1 is compared with the percentage score.
glossary_list = get_unigramglossary_list('data/glossary/music_unigram_glossary.json')
target_music = potential_unigrams('data/processed/target_music.json', 1)

Finding Matches ....
Finding Proposed Unigrams ...


In [23]:
# Same scoring run with the book unigram glossary bound to the global
# `glossary_list`; the target file is re-read from disk.
glossary_list = get_unigramglossary_list('data/glossary/book_unigram_glossary.json')
target_music_B = potential_unigrams('data/processed/target_music.json', 1)

Finding Matches ....
Finding Proposed Unigrams ...


In [24]:
# Same scoring run with the electronics unigram glossary bound to the global
# `glossary_list`; the target file is re-read from disk.
glossary_list = get_unigramglossary_list('data/glossary/electronics_unigram_glossary.json')
target_music_E = potential_unigrams('data/processed/target_music.json', 1)

Finding Matches ....
Finding Proposed Unigrams ...


In [25]:
# Same scoring run with the pet unigram glossary bound to the global
# `glossary_list`; the target file is re-read from disk.
glossary_list = get_unigramglossary_list('data/glossary/pet_unigram_glossary.json')
target_music_P = potential_unigrams('data/processed/target_music.json', 1)

Finding Matches ....
Finding Proposed Unigrams ...


In [26]:
# Same scoring run with the BE unigram glossary bound to the global
# `glossary_list`; the target file is re-read from disk.
glossary_list = get_unigramglossary_list('data/glossary/BE_unigram_glossary.json')
target_music_BE = potential_unigrams('data/processed/target_music.json', 1)

Finding Matches ....
Finding Proposed Unigrams ...


In [27]:
# Same scoring run with the BP unigram glossary bound to the global
# `glossary_list`; the target file is re-read from disk.
glossary_list = get_unigramglossary_list('data/glossary/BP_unigram_glossary.json')
target_music_BP = potential_unigrams('data/processed/target_music.json', 1)

Finding Matches ....
Finding Proposed Unigrams ...


In [28]:
# Same scoring run with the EP unigram glossary bound to the global
# `glossary_list`; the target file is re-read from disk.
glossary_list = get_unigramglossary_list('data/glossary/EP_unigram_glossary.json')
target_music_EP = potential_unigrams('data/processed/target_music.json', 1)

Finding Matches ....
Finding Proposed Unigrams ...


In [29]:
# Same scoring run with the multisource unigram glossary bound to the global
# `glossary_list`; the target file is re-read from disk.
glossary_list = get_unigramglossary_list('data/glossary/multisource_unigram_glossary.json')
target_music_BEP = potential_unigrams('data/processed/target_music.json', 1)

Finding Matches ....
Finding Proposed Unigrams ...


#### Bigrams

In [33]:
# Load the music bigram glossary with the loader defined in this notebook and
# bind it to `glossary_list`: that is the global get_index_glossary and
# potential_bigrams read, so any other name leaves the previous glossary in use.
glossary_list = get_bigramglossary_list('data/glossary/music_bigram_glossary.json')
target_music = potential_bigrams(target_music, 0.1)

Finding matches bigrams....
Finding proposed bigrams ...


In [34]:
# Load the books bigram glossary with the loader defined in this notebook and
# bind it to `glossary_list`, the global that the bigram scoring code reads.
glossary_list = get_bigramglossary_list('data/glossary/books_bigram_glossary.json')
target_music_B = potential_bigrams(target_music_B, 0.1)

Finding matches bigrams....
Finding proposed bigrams ...


In [35]:
# Load the electronics bigram glossary with the loader defined in this notebook
# and bind it to `glossary_list`, the global that the bigram scoring code reads.
glossary_list = get_bigramglossary_list('data/glossary/electronics_bigram_glossary.json')
target_music_E = potential_bigrams(target_music_E, 0.1)

Finding matches bigrams....
Finding proposed bigrams ...


In [36]:
# Load the pet bigram glossary with the loader defined in this notebook and
# bind it to `glossary_list`, the global that the bigram scoring code reads.
glossary_list = get_bigramglossary_list('data/glossary/pet_bigram_glossary.json')
target_music_P = potential_bigrams(target_music_P, 0.1)

Finding matches bigrams....
Finding proposed bigrams ...


In [37]:
# Load the BE bigram glossary with the loader defined in this notebook and
# bind it to `glossary_list`, the global that the bigram scoring code reads.
glossary_list = get_bigramglossary_list('data/glossary/BE_bigram_glossary.json')
target_music_BE = potential_bigrams(target_music_BE, 0.1)

Finding matches bigrams....
Finding proposed bigrams ...


In [38]:
# Load the BP bigram glossary with the loader defined in this notebook and
# bind it to `glossary_list`, the global that the bigram scoring code reads.
glossary_list = get_bigramglossary_list('data/glossary/BP_bigram_glossary.json')
target_music_BP = potential_bigrams(target_music_BP, 0.1)

Finding matches bigrams....
Finding proposed bigrams ...


In [39]:
# Load the EP bigram glossary with the loader defined in this notebook and
# bind it to `glossary_list`, the global that the bigram scoring code reads.
glossary_list = get_bigramglossary_list('data/glossary/EP_bigram_glossary.json')
target_music_EP = potential_bigrams(target_music_EP, 0.1)

Finding matches bigrams....
Finding proposed bigrams ...


In [40]:
# Load the multisource bigram glossary with the loader defined in this notebook
# and bind it to `glossary_list`, the global that the bigram scoring code reads.
glossary_list = get_bigramglossary_list('data/glossary/multisource_bigram_glossary.json')
target_music_BEP = potential_bigrams(target_music_BEP, 0.1)

Finding matches bigrams....
Finding proposed bigrams ...


#### Appending matches

In [41]:
# Replace the scored DataFrame with its list of training records.
# NOTE: not re-runnable as written. Once the name holds a list, a second call
# fails inside get_training_instances on `.drop`.
target_music_B = get_training_instances(target_music_B)

In [46]:
# Replace each scored DataFrame with its list of training records.
# NOTE: not re-runnable as written. Once a name holds a list, a second call
# fails inside get_training_instances on `.drop`.
target_music_E = get_training_instances(target_music_E)
target_music_P = get_training_instances(target_music_P)

In [48]:
# Replace each scored DataFrame with its list of training records.
# NOTE: not re-runnable as written. The AttributeError recorded for this cell
# ("'list' object has no attribute 'drop'") is what a repeated run produces,
# because the names already hold lists by then.
target_music_BE = get_training_instances(target_music_BE)
target_music_BP = get_training_instances(target_music_BP)
target_music_EP = get_training_instances(target_music_EP)

AttributeError: 'list' object has no attribute 'drop'

In [49]:
# Replace the scored DataFrame with its list of training records.
# NOTE: not re-runnable as written. The AttributeError recorded for this cell
# is what a repeated run produces once the name already holds a list.
target_music_BEP = get_training_instances(target_music_BEP)

AttributeError: 'list' object has no attribute 'drop'

In [45]:
# Replace the scored DataFrame with its list of training records.
# NOTE: not re-runnable as written. Once the name holds a list, a second call
# fails inside get_training_instances on `.drop`.
target_music = get_training_instances(target_music)

In [50]:
# Persist every record list as a JSON Lines file under data/proposed/.
# Single-glossary variants:
write_file('data/proposed/', 'target_music_B.json', target_music_B)
write_file('data/proposed/', 'target_music_E.json', target_music_E)
write_file('data/proposed/', 'target_music_P.json', target_music_P)

# Two-glossary variants:
write_file('data/proposed/', 'target_music_BP.json', target_music_BP)
write_file('data/proposed/', 'target_music_BE.json', target_music_BE)
write_file('data/proposed/', 'target_music_EP.json', target_music_EP)

# Multisource variant and the music-glossary variant:
write_file('data/proposed/', 'target_music_BEP.json', target_music_BEP)
write_file('data/proposed/', 'target_music.json', target_music)

In [51]:
# NOTE(review): this repeats the target_music_B write from the preceding cell
# and overwrites the same file; it looks redundant. Confirm before removing.
write_file('data/proposed/', 'target_music_B.json', target_music_B)

---------

### Test 1

In [148]:
#def get_training_test(df) :   
#    df = df.drop('matches', axis = 1)
#    df['train'] = df['unigrams'] + df['proposed_unigrams']
#    df = remove_sentiment(df, 'train')
#    df['train'] = listToString(df, 'train')
#    proposed_data = df.to_dict(orient='record')
#    return proposed_data 

In [188]:
#threshold = 6
#for i in range(1,8):
#    thesaurus_list = get_unigramthesaurus_list('test/multisource_unigram_glossary_test{}.json'.format(i))
#    target_music_BEP_test = potential_unigrams('data/processed/target_music.json', threshold)
#    target_music_BEP_test = get_training_test(target_music_BEP_test)
#    write_file('test/', 'target_music_BEP_test{}.json'.format(i), target_music_BEP_test)

### Test 2

In [52]:
#threshold = 6
#for i in range(1,8):
#    thesaurus_list = get_unigramthesaurus_list('test/multisource_unigram_glossary_test{}.json'.format(i))
#    target_music_BEP_anothertest = potential_unigrams('data/processed/target_music.json', threshold)
#    target_music_BEP_anothertest = get_training_test(target_music_BEP_anothertest)
#    write_file('test/', 'target_music_BEP_anothertest{}.json'.format(i), target_music_BEP_anothertest)