In [357]:
import numpy as np
import pandas as pd
import nltk
from nltk import bigrams
import scipy.sparse
import json 
import os.path

from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk import bigrams
from collections import Counter, defaultdict
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import chi2,SelectKBest
from sklearn import pipeline
from sklearn.metrics import f1_score,confusion_matrix,accuracy_score

In [358]:
def read_data(path): 
    df = pd.read_json(path, lines = True)
    return df 

def write_file(path, file, text): 
    pathname = os.path.join(path, file)
    outfile = open(pathname, 'w')
    for instance in text:
        outfile.write(json.dumps(instance) + '\n')
    outfile.close()
    
def flatten_list(df, column): 
    lst = []
    for i in df[column]:
        flatten_lst = []
        for j in i: 
            for item in j: 
                flatten_lst.append(item)
        lst.append(flatten_lst)
    df[column] = lst
    return df 

def get_tuples(df, column): 
    lst = []
    for i in df[column]:
        list_tuples = []
        for j in i: 
            list_tuples.append(tuple(j)) 
        lst.append(list_tuples)
    df[column] = lst
    return df 

In [363]:
# Code inspiration from: https://github.com/isabellin105/word-embeddings/blob/master/HW_3.ipynb

In [359]:
def get_reviews(data, text):
    listTokenisedReviews = []
    for review in data[text]: 
        listTokenisedReviews.append(review)
    numTokenisedReviews = len(listTokenisedReviews)
    return listTokenisedReviews, numTokenisedReviews

def get_vocabulary(tokenized, minimum, maximum): 
    #Function. Input: tokenized documents. Returns: a dictionary of vocabulary and word-index loopup dictionary 
    #Function. Takes only frequently appearing words into consideration. 
    #Parameters: (1) tokenized: a list of tokenised strings
    #Parameters: (2)minimum: minimum unigram counts
    #Parameters: (3)maximum: maximum unigram count
    #Output: (1) vocab: a counter where vocab[word] is the count of unigrams's occurrence in all documents 
    #Output: (2) word2ind: a word to index loopup dictionary for words in vobaculary dictionary 
    #Output: (3) idx2word: a index to word loopup dictionary for words in vobaculary dictionary 
    vocab = Counter()
    for sentence in tokenized:
        for word in sentence: 
            vocab[word] += 1
    print('%d vocabs before' % len(vocab))
    for sentence in tokenized:
        for word in sentence: 
            if vocab[word] < minimum or vocab[word] > maximum:
                del vocab[word]
    print('%d vocabs after' % len(vocab))
    
    word2idx = {}
    idx2word = {}
    i = 0
    while i < len(vocab):
        for word in vocab.keys():
            word2idx[word]=i
            idx2word[i]=word
            i += 1
    return vocab, word2idx, idx2word

def get_context(tokenized, vocab, windowsize): 
    #Function. 
    #Input: a list of tokenized strings 
    #Output: counter of word pairs, counting the number of times a word occurs in the context of another word 
    #Function only takes the words that are within the vocabulary into consideration 
    #Function builds three separate counters: word_pair, word1_count and word2_count 
    #Parameters: (1) tokenized: a list of tokenized strings 
    #Parameters: (2) vocab: vocabulary counter 
    #Parameters: (3) windowsize: context window size
    #Return: (1) word_pair: counter where word_pair_count[(word1, word2)] is the count of word2 occurrence in word1 context window 
    #Return: (2) word1_count: counter where word1_count[w] is the number of times word1 occurred in the review 
    #Return: (3) word2_count: conter where word2_count[c] is the number of times word2 occurred in the review 

    word_pair_count = Counter()
    word1_count = Counter()
    word2_count = Counter()

    for sentences in tokenized: 
        for i in np.arange(len(sentences)): 
            if sentences[i] in vocab.keys(): 
                target_index = i 
                windowstart = target_index - windowsize
                windowend = target_index + windowsize 
                if windowstart < 0: 
                    windowstart = 0 
                if windowend > len(sentences) -1: 
                    windowend = len(sentences)-1
                for j in np.arange(windowstart, windowend+1):
                    if j != target_index and sentences[j] in vocab.keys():
                        word_pair_count[(sentences[i], sentences[j])] += 1
                        word1_count[sentences[i]]+=1
                        word2_count[sentences[j]]+=1
    print("There are {} word-context pairs".format(len(word_pair_count)))
    return word_pair_count, word1_count, word2_count

def get_PMI(word_pair_count, word1_count, word2_count, word2idx): 
    #Function, which returns the Positive Pointwise Mututal information of words in a vocabulary 
    #Input: word_pair_count: a Counter where word_pair_count[(w, c)] = count of c's occurences in w's context window
    #Input: word1_count: counter where word1_count[w] = the number of times word1 occured in the documents
    #Input: word2_count: counter where word2_count[w] = the number of times word2 occured in the documents
    #Input: word2idx: a word-index loopup dictionary for word in a vocabulary 
    #Output: Positive Pointwise Mutual Information, a sparse matrix 
    new_data = []
    rows = []
    cols = []
    total_occurences = sum(word_pair_count.values())
    word1_occurences = sum(word1_count.values())
    word2_occurences = sum(word2_count.values())
    
    for (w,c), n in word_pair_count.items():
        p_word1 = word1_count[w]/word1_occurences
        p_word2 = word2_count[c]/word2_occurences
        PMI = np.math.log2((n/total_occurences)/(p_word1*p_word2))
        new_data.append(max(0,PMI))
        rows.append(word2idx[w])
        cols.append(word2idx[c])

    PPMI = csc_matrix((new_data, (rows, cols)))
    return PPMI 

In [365]:
def get_embeddings(PPMI, rank):
    #Function. Returns the left singular vectors as word embeddings. 
    #Function. Using truncated SVD. 
    #Input: PPMI: a sparce matrix of Positive Pointwise Information 
    #Input: rank: number of singular values and vectors to compute 
    #Return: u: left singular vectors from sparce SVD 
    #Returns: s: singular values from sparse SVD 
    u, s, vt = svds(PPMI, k=rank)
    return u, s

def cosine_distances(matrix, vector):
    distances = []
    for v in matrix:
        distances.append(np.dot(vector,v)/(np.linalg.norm(vector)*np.linalg.norm(v)))
    return  distances


def nearest_neighbors(embeddings, word, k, word2idx, idx2word):
    vector = embeddings[word2idx[word]]
    distances = cosine_distances(embeddings, vector)
    nearest_neighbors = []

    idx = np.asarray(distances).argsort()[-(k+1):][::-1]
    for i in idx:
        if i != word2idx[word]:
            nearest_neighbors.append(idx2word[i])
    return nearest_neighbors

In [366]:
def get_cooccurrance_matrix(df, column, maximum, minimum, windowsize): 
    listTokenisedReviews, numTokenisedReviews = get_reviews(df, column)
    vocab, word2idx, idx2word = get_vocabulary(listTokenisedReviews, minimum, maximum)
    word_pair_count, word1_count, word2_count = get_context(listTokenisedReviews, vocab, windowsize)
    PPMI = get_PMI(word_pair_count, word1_count, word2_count, word2idx)
    return word2idx, idx2word, vocab, PPMI

def get_glossary(path, column, maximum, minimum, windowsize, num_neighbors): 
    df = read_data(path)
    df = listToTuples(df)
    word2idx, idx2word, vocab, PPMI = get_cooccurrance_matrix(df, column, maximum, minimum, windowsize)
    rank = 20
    embeddings, _ = get_embeddings(PPMI, rank)
    embeddings /= np.linalg.norm(embeddings, axis=1, keepdims=True) 
    glossary = dict()
    for words in vocab.keys(): 
        glossary[words] = nearest_neighbors(embeddings, words, num_neighbors, word2idx, idx2word)
    glossary = pd.DataFrame(glossary.items(), columns=['word', 'similar'])
    glossary = glossary.to_dict(orient='record')
    return glossary

In [362]:
def listToTuples(df):
    listToTuples = []
    for i in df['sentiment_bigrams']: 
        lst = []
        for j in i: 
            lst.append(tuple(j))
        listToTuples.append(lst)
    df['sentiment_bigrams'] = listToTuples
    return df

#### Unigrams

In [192]:
maximum = 1/10 * 10100
minimum = 1/100 * 10100
windowsize = 3
num_neighbors = 10

music_unigram_glossary = get_glossary('data/processed/source_music.json', 'sentiment_unigrams', 
                                     maximum, minimum, windowsize, num_neighbors)

13742 vocabs before
900 vocabs after
There are 63567 word-context pairs


In [225]:
maximum = 1/10 * 10100
minimum = 1/200 * 10100
windowsize = 3
num_neighbors = 10

book_unigram_glossary = get_glossary('data/processed/source_B.json', 'sentiment_unigrams', 
                                     maximum, minimum, windowsize, num_neighbors)
electronics_unigram_glossary = get_glossary('data/processed/source_E.json', 'sentiment_unigrams', 
                                maximum, minimum, windowsize, num_neighbors)
pet_unigram_glossary = get_glossary('data/processed/source_P.json', 'sentiment_unigrams', 
                                maximum, minimum, windowsize, num_neighbors)

45774 vocabs before
1278 vocabs after
There are 262930 word-context pairs
33371 vocabs before
1060 vocabs after
There are 210311 word-context pairs
28364 vocabs before
835 vocabs after
There are 144368 word-context pairs


In [228]:
maximum = 1/10 * 14100
minimum = 1/200 * 14100
windowsize = 3
num_neighbors = 10

EP_unigram_glossary = get_glossary('data/processed/source_EP.json', 'sentiment_unigrams', 
                                 maximum, minimum, windowsize, num_neighbors)
BP_unigram_glossary = get_glossary('data/processed/source_BP.json', 'sentiment_unigrams', 
                                 maximum, minimum, windowsize, num_neighbors)
BE_unigram_glossary = get_glossary('data/processed/source_BE.json', 'sentiment_unigrams', 
                                 maximum, minimum, windowsize, num_neighbors)

37323 vocabs before
938 vocabs after
There are 239543 word-context pairs
46882 vocabs before
1088 vocabs after
There are 279832 word-context pairs
49689 vocabs before
1238 vocabs after
There are 352390 word-context pairs


In [230]:
maximum = 1/10 * 18100
minimum = 1/200 * 18100
windowsize = 3
num_neighbors = 10
multisource_unigram_glossary = get_glossary('data/processed/source_BEP.json', 'sentiment_unigrams', 
                                            maximum, minimum, windowsize, num_neighbors)

52819 vocabs before
1197 vocabs after
There are 408134 word-context pairs


#### Bigrams

In [207]:
maximum = 1/10 * 2100
minimum = 1/500 * 2100
windowsize = 3
num_neighbors = 10

music_bigram_glossary = get_glossary('data/processed/source_music.json', 'sentiment_bigrams', 
                                     maximum, minimum, windowsize, num_neighbors)

48406 vocabs before
400 vocabs after
There are 1130 word-context pairs


In [232]:
maximum = 1/10 * 10100
minimum = 1/1000 * 10100
windowsize = 3
num_neighbors = 10

books_bigram_glossary = get_glossary('data/processed/source_B.json', 'sentiment_bigrams', 
                                     maximum, minimum, windowsize, num_neighbors)
electronics_bigram_glossary = get_glossary('data/processed/source_E.json', 'sentiment_bigrams', 
                                maximum, minimum, windowsize, num_neighbors)
pet_bigram_glossary = get_glossary('data/processed/source_P.json', 'sentiment_bigrams', 
                                maximum, minimum, windowsize, num_neighbors)

307493 vocabs before
1274 vocabs after
There are 15869 word-context pairs
234342 vocabs before
930 vocabs after
There are 8922 word-context pairs
186085 vocabs before
783 vocabs after
There are 6820 word-context pairs


In [233]:
maximum = 1/10 * 14100
minimum = 1/1000 * 14100
windowsize = 3
num_neighbors = 10

BE_bigram_glossary = get_glossary('data/processed/source_BE.json', 'sentiment_bigrams', 
                                            maximum, minimum, windowsize, num_neighbors)
BP_bigram_glossary = get_glossary('data/processed/source_BP.json', 'sentiment_bigrams', 
                                            maximum, minimum, windowsize, num_neighbors)
EP_bigram_glossary = get_glossary('data/processed/source_BP.json', 'sentiment_bigrams', 
                                            maximum, minimum, windowsize, num_neighbors)

392047 vocabs before
1103 vocabs after
There are 15841 word-context pairs
353230 vocabs before
970 vocabs after
There are 13001 word-context pairs
353230 vocabs before
970 vocabs after
There are 13001 word-context pairs


In [240]:
maximum = 1/10 * 18100
minimum = 1/600 * 18100
windowsize = 3
num_neighbors = 10

multisource_bigram_glossary = get_glossary('data/processed/source_BEP.json', 'sentiment_bigrams', 
                                            maximum, minimum, windowsize, num_neighbors)

455668 vocabs before
492 vocabs after
There are 9214 word-context pairs


In [370]:
write_file('data/glossary/', 'book_unigram_glossary.json', book_unigram_glossary)
write_file('data/glossary/', 'electronics_unigram_glossary.json', electronics_unigram_glossary)
write_file('data/glossary/', 'pet_unigram_glossary.json', pet_unigram_glossary)
write_file('data/glossary/', 'EP_unigram_glossary.json', EP_unigram_glossary)
write_file('data/glossary/', 'BE_unigram_glossary.json', BE_unigram_glossary)
write_file('data/glossary/', 'BP_unigram_glossary.json', BP_unigram_glossary)
write_file('data/glossary/', 'multisource_unigram_glossary.json', multisource_unigram_glossary)

write_file('data/glossary/', 'books_bigram_glossary.json', books_bigram_glossary)
write_file('data/glossary/', 'electronics_bigram_glossary.json', electronics_bigram_glossary)
write_file('data/glossary/', 'pet_bigram_glossary.json', pet_bigram_glossary)
write_file('data/glossary/', 'EP_bigram_glossary.json', EP_bigram_glossary)
write_file('data/glossary/', 'BE_bigram_glossary.json', BE_bigram_glossary)
write_file('data/glossary/', 'BP_bigram_glossary.json', BP_bigram_glossary)
write_file('data/glossary/', 'multisource_bigram_glossary.json', multisource_bigram_glossary)


In [371]:
write_file('data/glossary/', 'music_unigram_glossary.json', music_unigram_glossary)
write_file('data/glossary/', 'music_bigram_glossary.json', music_bigram_glossary)

---------

### Test 1

In [367]:
maximum = 1/10 * 18100
minimum = [1000, 905, 603, 200, 45, 15,  6]
windowsize = 3
num_neighbors = 10
for i in minimum: 
    multisource_unigram_glossary_test = get_glossary('data/processed/source_BEP.json', 'sentiment_unigrams', 
                                            maximum, i, windowsize, num_neighbors)
    write_file('test/', 'multisource_unigram_glossary_test{}.json'.format(i), multisource_unigram_glossary_test)
   

53573 vocabs before
45 vocabs after
There are 1981 word-context pairs
53573 vocabs before
59 vocabs after
There are 3427 word-context pairs
53573 vocabs before
115 vocabs after
There are 12460 word-context pairs
53573 vocabs before
530 vocabs after
There are 163700 word-context pairs
53573 vocabs before
2269 vocabs after
There are 700386 word-context pairs
53573 vocabs before
5502 vocabs after
There are 1169511 word-context pairs
53573 vocabs before
11060 vocabs after
There are 1530815 word-context pairs


---------------

### Test 2 

In [380]:
maximum = 1/10 * 18100
minimum  = 200
windowsize = 3
neigh_size = [1,10, 20, 50, 100, 500, 1000]
for i in neigh_size: 
    multisource_unigram_glossary_anothertest = get_glossary('data/processed/source_BEP.json', 'sentiment_unigrams', 
                                            maximum, minimum, windowsize, i)
    write_file('test/', 'multisource_unigram_glossary_anothertest{}.json'.format(i), multisource_unigram_glossary_anothertest)
    

53573 vocabs before
530 vocabs after
There are 163700 word-context pairs
53573 vocabs before
530 vocabs after
There are 163700 word-context pairs
53573 vocabs before
530 vocabs after
There are 163700 word-context pairs
53573 vocabs before
530 vocabs after
There are 163700 word-context pairs
53573 vocabs before
530 vocabs after
There are 163700 word-context pairs
53573 vocabs before
530 vocabs after
There are 163700 word-context pairs
53573 vocabs before
530 vocabs after
There are 163700 word-context pairs
