In [7]:
import pandas as pd
import numpy as np
import nltk
import json 
import os.path

In [8]:
def read_data(path):
    """Read a JSON Lines file (one JSON object per line) into a DataFrame.

    Parameters
    ----------
    path : path or file-like accepted by ``pandas.read_json``.

    Returns
    -------
    pandas.DataFrame built from the parsed lines.
    """
    return pd.read_json(path, lines=True)

def write_file(path, file, text):
    """Write an iterable of JSON-serialisable objects as a JSON Lines file.

    Parameters
    ----------
    path : directory in which to create the file (it is not created here).
    file : file name, joined onto ``path``.
    text : iterable of objects; each is serialised with ``json.dumps`` and
        written on its own line.

    Any existing file at the target location is overwritten.
    """
    pathname = os.path.join(path, file)
    # The context manager guarantees the handle is closed even when an
    # instance fails to serialise part-way through the loop.
    with open(pathname, 'w') as outfile:
        for instance in text:
            outfile.write(json.dumps(instance) + '\n')
    
def flatten_list(df, column):
    """Flatten one nesting level inside every row of ``df[column]``.

    Each row is expected to be an iterable of iterables; the inner iterables
    are concatenated into a single list per row. The column is overwritten
    on ``df`` and the same ``df`` object is returned.
    """
    df[column] = [
        [element for group in row for element in group]
        for row in df[column]
    ]
    return df

def tupleToList(df, column):
    """Return ``df[column]`` with every inner item converted to a list.

    The input is not modified; a new list of lists-of-lists is returned,
    one outer entry per row.
    """
    return [[list(item) for item in row] for row in df[column]]

def listToTuples(df, column):
    """Convert every inner item of ``df[column]`` to a tuple, in place.

    The column is overwritten on ``df`` with a list of tuples per row, and
    the same ``df`` object is returned.
    """
    df[column] = [[tuple(item) for item in row] for row in df[column]]
    return df

def listOfTuplesToListofList(df, column):
    """Convert every inner item of ``df[column]`` to a list, in place.

    The column is overwritten on ``df`` with a list of lists per row, and
    the same ``df`` object is returned.
    """
    df[column] = [[list(item) for item in row] for row in df[column]]
    return df

def listToString(df, column):
    """Render each row of ``df[column]`` (a list of strings) as one string.

    Per row: the items are joined with ``', '``, whitespace-separated tokens
    containing any digit are discarded, the survivors are re-joined with a
    single space, and every comma is then removed. Returns a new list of
    strings; ``df`` is not modified.
    """
    rendered = []
    for tokens in df[column]:
        joined = ', '.join(tokens)
        digit_free = [
            word for word in joined.split()
            if not any(ch.isdigit() for ch in word)
        ]
        rendered.append(' '.join(digit_free).replace(',', ''))
    return rendered

def remove_sentiment(df, column):
    """Strip a trailing ``_pos`` / ``_neg`` marker from every token, in place.

    Tokens ending in either suffix lose their last four characters; all
    other tokens are kept unchanged. The column is overwritten on ``df``
    and the same ``df`` object is returned.
    """
    markers = ('_pos', '_neg')
    df[column] = [
        [token[:-4] if token.endswith(markers) else token for token in row]
        for row in df[column]
    ]
    return df

def get_unigramglossary_list(path):
    """Load a unigram glossary file and return its rows as nested lists.

    The file is read with ``read_data`` and each DataFrame row becomes one
    inner list of cell values, in column order.
    """
    return read_data(path).values.tolist()

def get_bigramglossary_list(path):
    """Load a bigram glossary file and return its rows as nested lists.

    After reading with ``read_data``, every item in the ``'similar'`` column
    and every value in the ``'word'`` column is converted to a tuple, so the
    entries are hashable. Each DataFrame row then becomes one inner list.
    """
    frame = listToTuples(read_data(path), 'similar')
    frame['word'] = list(map(tuple, frame['word']))
    return frame.values.tolist()

In [9]:
def count_matches(a, b):
    """Return how many distinct items occur in both ``a`` and ``b``."""
    shared = set(a).intersection(b)
    return len(shared)

def get_index_glossary(text):
    """Score ``text`` against every glossary entry and keep the top five.

    Reads the module-level ``glossary_list``; for each entry, the second
    element (``entry[1]``) is compared with ``text`` via ``count_matches``.
    The score is the number of shared distinct items divided by
    ``len(text)`` and multiplied by 100. Texts with zero or one item are
    divided by 1 instead.

    Returns
    -------
    dict mapping glossary index (as produced by ``np.argsort``) to its
    score, for up to the five highest-scoring entries, inserted from
    highest to lowest score.
    """
    # Empty and single-item texts are normalised by 1.
    divisor = len(text) if len(text) > 1 else 1

    scores = [
        (count_matches(text, entry[1]) / divisor) * 100
        for entry in glossary_list
    ]

    # Ascending argsort reversed gives best-first order; keep at most five.
    best_indices = np.argsort(scores)[::-1][:5]
    return {index: scores[index] for index in best_indices}



In [10]:
def get_word_matches(df, column, threshold, glossary):
    """Turn per-row ``{glossary index: score}`` dicts into proposed words.

    Parameters
    ----------
    df : mapping/DataFrame whose ``column`` holds one dict per row.
    column : name of the column with the score dicts.
    threshold : minimum score (inclusive) for an entry to be proposed.
    glossary : indexable collection; ``glossary[i][0]`` is the word for
        index ``i``.

    Returns
    -------
    list with one list of proposed words per row, in the dict's order.
    Keys are converted with ``int``. A glossary word equal to the string
    ``'None'`` is dropped, since that value marks below-threshold entries.
    """
    proposals = []
    for scores in df[column]:
        chosen = {}
        for raw_index, score in scores.items():
            index = int(raw_index)
            # 'None' is the placeholder for entries under the threshold.
            chosen[index] = glossary[index][0] if score >= threshold else 'None'
        proposals.append([word for word in chosen.values() if word != 'None'])
    return proposals

In [11]:
def potential_unigrams(path1, threshold):
    """Load a source file and attach unigram matches and proposals.

    Reads ``path1`` with ``read_data``, scores each row's
    ``'sentiment_unigrams'`` with ``get_index_glossary`` (stored in
    ``'matches'``), then stores the glossary words whose score reaches
    ``threshold`` in ``'proposed_unigrams'``. Uses the module-level
    ``glossary_list``, which must be set before calling.
    """
    frame = read_data(path1)

    print('Finding Matches ....')
    frame['matches'] = frame['sentiment_unigrams'].apply(get_index_glossary)

    print('Finding Proposed Unigrams ...')
    proposed = get_word_matches(frame, 'matches', threshold, glossary_list)
    frame['proposed_unigrams'] = proposed
    return frame

def potential_bigrams(df, threshold):
    """Attach bigram matches and proposals to an existing frame.

    Converts the items of ``'sentiment_bigrams'`` to tuples (in place, via
    ``listToTuples``), scores each row with ``get_index_glossary`` (stored
    in ``'matches_bigrams'``), then stores the glossary entries whose score
    reaches ``threshold`` in ``'proposed_bigrams'``. Uses the module-level
    ``glossary_list``, which must be set before calling.
    """
    frame = listToTuples(df, 'sentiment_bigrams')

    print('Finding matches bigrams....')
    frame['matches_bigrams'] = frame['sentiment_bigrams'].apply(get_index_glossary)

    print('Finding proposed bigrams ...')
    proposed = get_word_matches(frame, 'matches_bigrams', threshold, glossary_list)
    frame['proposed_bigrams'] = proposed
    return frame

In [12]:
def get_training_instances(df):
    """Build the final training records from a fully matched frame.

    Expects ``df`` to carry the columns ``'matches'``, ``'matches_bigrams'``,
    ``'unigrams'``, ``'proposed_unigrams'`` and ``'proposed_bigrams'``.
    The two match columns are dropped, proposed bigrams are flattened into
    single words, and a ``'train'`` string is built from the unigrams plus
    both sets of proposals, with sentiment suffixes removed.

    Returns
    -------
    list of dicts, one per row, keyed by column name.
    """
    # drop() returns a new frame, so the caller's DataFrame keeps its columns.
    df = df.drop(['matches', 'matches_bigrams'], axis=1)
    df = listOfTuplesToListofList(df, 'proposed_bigrams')
    df = flatten_list(df, 'proposed_bigrams')
    # Row-wise list concatenation of the three token columns.
    df['train'] = df['unigrams'] + df['proposed_bigrams'] + df['proposed_unigrams']
    df = remove_sentiment(df, 'train')
    df['train'] = listToString(df, 'train')
    # 'records' is the full orient name; the abbreviation 'record' was
    # deprecated and is rejected by pandas 2.x.
    proposed_data = df.to_dict(orient='records')
    return proposed_data

### Unigrams

In [13]:
# Cutoff for the unigram pass: get_word_matches proposes a glossary word only
# when its score from get_index_glossary (a percentage, 0-100) is >= this value.
threshold = 6

In [14]:
# Bind the global `glossary_list` (read implicitly by get_index_glossary) to the
# music unigram glossary, then score the music source and propose unigrams.
glossary_list = get_unigramglossary_list('data/glossary/music_unigram_glossary.json')
source_music = potential_unigrams('data/processed/source_music.json', threshold)

Finding Matches ....
Finding Proposed Unigrams ...


In [15]:
# Bind the global `glossary_list` (read implicitly by get_index_glossary) to the
# book unigram glossary, then score source_B and propose unigrams.
glossary_list = get_unigramglossary_list('data/glossary/book_unigram_glossary.json')
source_B = potential_unigrams('data/processed/source_B.json', threshold)

Finding Matches ....
Finding Proposed Unigrams ...


In [16]:
# Bind the global `glossary_list` (read implicitly by get_index_glossary) to the
# electronics unigram glossary, then score source_E and propose unigrams.
glossary_list = get_unigramglossary_list('data/glossary/electronics_unigram_glossary.json')
source_E = potential_unigrams('data/processed/source_E.json', threshold)

Finding Matches ....
Finding Proposed Unigrams ...


In [17]:
# Bind the global `glossary_list` (read implicitly by get_index_glossary) to the
# pet unigram glossary, then score source_P and propose unigrams.
glossary_list = get_unigramglossary_list('data/glossary/pet_unigram_glossary.json')
source_P = potential_unigrams('data/processed/source_P.json', threshold)

Finding Matches ....
Finding Proposed Unigrams ...


In [18]:
# Bind the global `glossary_list` (read implicitly by get_index_glossary) to the
# EP unigram glossary, then score source_EP and propose unigrams.
glossary_list = get_unigramglossary_list('data/glossary/EP_unigram_glossary.json')
source_EP = potential_unigrams('data/processed/source_EP.json', threshold)

Finding Matches ....
Finding Proposed Unigrams ...


In [19]:
# Bind the global `glossary_list` (read implicitly by get_index_glossary) to the
# BE unigram glossary, then score source_BE and propose unigrams.
glossary_list = get_unigramglossary_list('data/glossary/BE_unigram_glossary.json')
source_BE = potential_unigrams('data/processed/source_BE.json', threshold)

Finding Matches ....
Finding Proposed Unigrams ...


In [20]:
# Bind the global `glossary_list` (read implicitly by get_index_glossary) to the
# BP unigram glossary, then score source_BP and propose unigrams.
glossary_list = get_unigramglossary_list('data/glossary/BP_unigram_glossary.json')
source_BP = potential_unigrams('data/processed/source_BP.json', threshold)

Finding Matches ....
Finding Proposed Unigrams ...


In [21]:
# Bind the global `glossary_list` (read implicitly by get_index_glossary) to the
# multisource unigram glossary, then score source_BEP and propose unigrams.
glossary_list = get_unigramglossary_list('data/glossary/multisource_unigram_glossary.json')
source_BEP = potential_unigrams('data/processed/source_BEP.json', threshold)

Finding Matches ....
Finding Proposed Unigrams ...


### Bigrams

In [22]:
# Re-bind the cutoff for the bigram pass. Scores are percentages (0-100), so for
# rows with up to 1000 bigrams any glossary entry sharing at least one bigram passes.
threshold = 0.1

In [23]:
# Switch the global `glossary_list` to the music bigram glossary, then add bigram
# matches and proposals to the source_music frame from the unigram pass.
glossary_list = get_bigramglossary_list('data/glossary/music_bigram_glossary.json')
source_music = potential_bigrams(source_music, threshold)

Finding matches bigrams....
Finding proposed bigrams ...


In [24]:
# Switch the global `glossary_list` to the books bigram glossary, then add bigram
# matches and proposals to the source_B frame from the unigram pass.
# NOTE(review): this path says 'books_' while the unigram glossary path says
# 'book_' - confirm both files exist under those names.
glossary_list = get_bigramglossary_list('data/glossary/books_bigram_glossary.json')
source_B = potential_bigrams(source_B, threshold)

Finding matches bigrams....
Finding proposed bigrams ...


In [25]:
# Switch the global `glossary_list` to the electronics bigram glossary, then add
# bigram matches and proposals to the source_E frame from the unigram pass.
glossary_list = get_bigramglossary_list('data/glossary/electronics_bigram_glossary.json')
source_E = potential_bigrams(source_E, threshold)

Finding matches bigrams....
Finding proposed bigrams ...


In [26]:
# Switch the global `glossary_list` to the pet bigram glossary, then add bigram
# matches and proposals to the source_P frame from the unigram pass.
glossary_list = get_bigramglossary_list('data/glossary/pet_bigram_glossary.json')
source_P = potential_bigrams(source_P, threshold)

Finding matches bigrams....
Finding proposed bigrams ...


In [27]:
# Switch the global `glossary_list` to the EP bigram glossary, then add bigram
# matches and proposals to the source_EP frame from the unigram pass.
glossary_list = get_bigramglossary_list('data/glossary/EP_bigram_glossary.json')
source_EP = potential_bigrams(source_EP, threshold)

Finding matches bigrams....
Finding proposed bigrams ...


In [28]:
# Switch the global `glossary_list` to the BE bigram glossary, then add bigram
# matches and proposals to the source_BE frame from the unigram pass.
glossary_list = get_bigramglossary_list('data/glossary/BE_bigram_glossary.json')
source_BE = potential_bigrams(source_BE,threshold)

Finding matches bigrams....
Finding proposed bigrams ...


In [29]:
# Switch the global `glossary_list` to the BP bigram glossary, then add bigram
# matches and proposals to the source_BP frame from the unigram pass.
glossary_list = get_bigramglossary_list('data/glossary/BP_bigram_glossary.json')
source_BP = potential_bigrams(source_BP, threshold)

Finding matches bigrams....
Finding proposed bigrams ...


In [30]:
# Switch the global `glossary_list` to the multisource bigram glossary, then add
# bigram matches and proposals to the source_BEP frame from the unigram pass.
glossary_list = get_bigramglossary_list('data/glossary/multisource_bigram_glossary.json')
source_BEP = potential_bigrams(source_BEP, threshold)

Finding matches bigrams....
Finding proposed bigrams ...


### Appending Matches

In [31]:
# Rebinds the name from a DataFrame to the list of per-row dicts returned by
# get_training_instances; re-running this cell alone fails (a list has no .drop).
source_music = get_training_instances(source_music)

In [32]:
# Single-source sets: each name is rebound from a DataFrame to the list of
# per-row dicts returned by get_training_instances (not re-runnable in isolation).
source_B = get_training_instances(source_B)
source_E = get_training_instances(source_E)
source_P = get_training_instances(source_P)

In [33]:
# Two-source sets: each name is rebound from a DataFrame to the list of
# per-row dicts returned by get_training_instances (not re-runnable in isolation).
source_BE = get_training_instances(source_BE)
source_EP = get_training_instances(source_EP)
source_BP = get_training_instances(source_BP)

In [34]:
# Rebinds the name from a DataFrame to the list of per-row dicts returned by
# get_training_instances; re-running this cell alone fails (a list has no .drop).
source_BEP = get_training_instances(source_BEP)

In [35]:
# Persist every proposed dataset as JSON Lines under data/proposed/.
# write_file does not create the directory, so it must already exist.
write_file('data/proposed/', 'source_B.json', source_B)
write_file('data/proposed/', 'source_E.json', source_E)
write_file('data/proposed/', 'source_P.json', source_P)

write_file('data/proposed/', 'source_BE.json', source_BE)
write_file('data/proposed/', 'source_BP.json', source_BP)
write_file('data/proposed/', 'source_EP.json', source_EP)

write_file('data/proposed/', 'source_BEP.json', source_BEP)

In [36]:
# Persist the music dataset as JSON Lines; data/proposed/ must already exist.
write_file('data/proposed/', 'source_music.json', source_music)

------------

### Test 1 

In [198]:
#def get_training_test(df):    
#    df = df.drop('matches', axis = 1)
#    df = df.drop('sentiment_bigrams', axis = 1)
#    df['train'] = df['unigrams'] + df['proposed_unigrams']
#    df = remove_sentiment(df, 'train')
#    df['train'] = listToString(df, 'train')
#    proposed_data = df.to_dict(orient='record')
#    return proposed_data 

In [199]:
#threshold = 6
#for i in range(1,7):
#    glossary_list = get_unigramglossary_list('test/multisource_unigram_glossary_test{}.json'.format(i))
#    source_BEP_test = potential_unigrams('data/processed/source_BEP.json', threshold)
#    source_BEP_test = get_training_test(source_BEP_test)
#    write_file('test/', 'glossary_source_BEP_test{}.json'.format(i), source_BEP_test)

### Test 2

In [205]:
#threshold = 6
#for i in range(1,9):
#    glossary_list = get_unigramglossary_list('test/multisource_unigram_glossary_anothertest{}.json'.format(i))
#    source_BEP_anothertest = potential_unigrams('data/processed/source_BEP.json', threshold)
#    source_BEP_anothertest = get_training_test(source_BEP_anothertest)
#    write_file('test/', 'glossary_source_BEP_anothertest{}.json'.format(i), source_BEP_anothertest)