## Working with NLTK corpora

### Note: The trained models are already included in the zip file. To use them, skip directly to the "Loading the model for future use" section and load them from there.

In [None]:
# imports
from nltk.corpus import reuters
import gensim
import re
from nltk import tokenize
import itertools
import sys
import numpy as np
import scipy
import os
import pandas as pd

In [None]:
# id's of the documents
#reuters.fileids()

In [None]:
# Total number of documents: the count of ids returned by reuters.fileids()
len(reuters.fileids())

In [None]:
# Every document id in the corpus
documents = reuters.fileids()
# Ids that start with "train" make up the training documents
train_doc = [doc for doc in documents if doc.startswith("train")]
# Ids that start with "test" make up the testing documents
test_doc = [doc for doc in documents if doc.startswith("test")]

In [None]:
# Raw text of the first training document, as returned by reuters.raw()
reuters.raw(train_doc[0])

In [None]:
# All category labels that documents in the corpus can belong to
reuters.categories()
# Documents that belong to a particular category.
# fileids() accepts a category name; categories() expects file ids instead,
# so passing 'acq' to it would be interpreted as a (non-existent) file id.
reuters.fileids('acq')

### Word2Vec Model

The input should be a list of sentences, with each sentence given as a list of tokens.

In [None]:
# Creating the input for the Word2Vec model:
# every training document is split into sentences, and every sentence into a
# list of normalised tokens.
reuters_punct_re = re.compile(r'[,.]')   # punctuation removed from each token
reuters_digits_re = re.compile(r'\d+')   # every run of digits becomes the placeholder NUM

list_allsentences = []
for doc_id in train_doc:
    # splitting each document into sentences
    for sent in tokenize.sent_tokenize(reuters.raw(doc_id)):
        tokens = []
        # str.split() already breaks on any whitespace (newlines included), so
        # no newline stripping is needed -- stripping could glue two words together.
        for word in sent.split():
            word = reuters_punct_re.sub('', word.lower())
            word = reuters_digits_re.sub('NUM', word)
            # optional preprocessing hook, e.g.:
            #    if word not in stopwords:  # removing stopwords
            if word:  # tokens made only of punctuation are empty after cleaning
                tokens.append(word)
        if tokens:  # skip sentences that have no usable token left
            list_allsentences.append(tokens)
print(len(list_allsentences))

In [None]:
# Checking the size of the input.
# NOTE: sys.getsizeof() is shallow -- it measures only the outer list object,
# not the nested sentence lists or the token strings they reference.
sys.getsizeof(list_allsentences)

### Creating the models for Wikipedia scraps

In [None]:
# Reading the data: one string per *.txt file found in the scrape folder
data = []
path = './Wikipedia_Scrap/'
# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the resulting DataFrame reproducible between runs and machines.
files = sorted(f for f in os.listdir(path) if f.endswith(".txt"))
for f in files:
    # Explicit encoding instead of the platform default.
    # NOTE(review): assumes the scraped articles were saved as UTF-8 -- confirm.
    with open(os.path.join(path, f), encoding="utf-8") as myfile:
        data.append(myfile.read())
wiki_df = pd.DataFrame(data)

In [None]:
# Renaming the single column of wiki_df to 'Articles'; the later cells read
# the article text through wiki_df['Articles']
wiki_df.columns = ['Articles']

In [None]:
# Testing: whitespace-split tokens of the first sentence of the first article
tokenize.sent_tokenize(wiki_df['Articles'][0])[0].split()

In [None]:
# Bringing the wiki dataset into the input format for word2vec:
# a list of sentences, each sentence being a list of normalised tokens.
wiki_punct_re = re.compile(r'[,.=?-]')   # punctuation removed from each token
wiki_digits_re = re.compile(r'\d+')      # every run of digits becomes the placeholder NUM

list_wikisentences = []
for docu in wiki_df['Articles']:
    for sent in tokenize.sent_tokenize(docu):
        tokens = []
        # Split on whitespace directly: deleting '\n' first would join the last
        # word of one line with the first word of the next one.
        for word in sent.split():
            word = wiki_punct_re.sub('', word.lower())
            word = wiki_digits_re.sub('NUM', word)
            if word:  # tokens made only of punctuation are empty after cleaning
                tokens.append(word)
        if tokens:  # skip sentences that have no usable token left
            list_wikisentences.append(tokens)
print(len(list_wikisentences))

In [None]:
# Checking the size of the input.
# NOTE: sys.getsizeof() is shallow -- it measures only the outer list object,
# not the nested sentence lists or the token strings they reference.
sys.getsizeof(list_wikisentences)

In [None]:
# Final training set: the Reuters sentences followed by the Wikipedia sentences
training_list = [*list_allsentences, *list_wikisentences]
len(training_list)

#### Word2Vec model with CBOW

In [None]:
# No `sg` flag is passed, so gensim trains with its default algorithm (CBOW).
model = gensim.models.Word2Vec(
    sentences=training_list,
    size=100,      # number of topics / neurons in the training layer
    min_count=10,  # words occurring fewer than 10 times are ignored
)
# Not set here, so left at the library defaults:
#   window: the number of words to look at in the context
#   alpha:  the initial learning rate

#### Word2Vec with Skipgram model

In [None]:
# Same corpus and hyper-parameters as the CBOW model; sg=1 selects skip-gram.
model_sg = gensim.models.Word2Vec(
    sentences=training_list,
    size=100,
    min_count=10,
    sg=1,
)

#### Saving the model

In [None]:
# Saving both trained models (CBOW and skip-gram) to disk.
# NOTE: the target is an absolute, machine-specific directory; adjust the path
# before running this cell on another machine.
model.save('/home/prateek/uva/CS-6501 Text Mining/Workspace/Word2Vec/mymodel1_cbow')
model_sg.save('/home/prateek/uva/CS-6501 Text Mining/Workspace/Word2Vec/mymodel1_sg')

### Loading the model for future use

In [None]:
# The CBOW model, loaded back from the file written by model.save().
# NOTE: absolute, machine-specific path -- adjust it to where the model file lives.
cbow_model = gensim.models.Word2Vec.load('/home/prateek/uva/CS-6501 Text Mining/Workspace/Word2Vec/mymodel1_cbow')

In [None]:
# The skip-gram model, loaded back from the file written by model_sg.save().
# NOTE: absolute, machine-specific path -- adjust it to where the model file lives.
sg_model = gensim.models.Word2Vec.load('/home/prateek/uva/CS-6501 Text Mining/Workspace/Word2Vec/mymodel1_sg')

### Calculating the sentence similarity

#### Function to create vector representation of a sentence 

In [None]:
def avg_feature_vector(words, model, num_features, index2word_set):
    """Average the word vectors of the in-vocabulary tokens of a sentence.

    Args:
        words: the sentence as a list of tokens.
        model: skip-gram or CBOW word2vec model created/loaded above; it is
            indexed as ``model[word]`` to obtain the vector of a word.
        num_features: number of neurons in the training layer, which is the
            length of the returned feature vector.
        index2word_set: names of the words in the vocabulary; tokens that are
            not in it are ignored. Passing a set, e.g.
            ``set(model.index2word)``, performs better than a list for the
            membership test.

    Returns:
        The element-wise mean of the vectors of all matched tokens, or the
        all-zero vector of length ``num_features`` when no token matched.
    """
    # start from a zero vector with one entry per feature
    total = np.zeros((num_features,), dtype="float32")
    matched = 0
    for token in words:
        if token not in index2word_set:
            continue  # out-of-vocabulary tokens do not contribute
        matched += 1
        total = np.add(total, model[token])  # accumulate the word vectors

    if matched == 0:
        return total
    # normalise the sum by the number of words that contributed to it
    return np.divide(total, matched)

#### example

In [None]:
w = "thank you for smoking".split()
w1 = "the quick brown fox jumped over the lazy dog".split()
# The vocabulary and the vectors must come from the same model. The loaded
# skip-gram model (sg_model) is used so that this example also works when the
# training cells were skipped and only the saved models were loaded.
# The vocabulary set is built once and shared by both calls.
sg_vocab = set(sg_model.index2word)
w_feat = avg_feature_vector(w, sg_model, 100, sg_vocab)    # 100 = `size` used for training
w1_feat = avg_feature_vector(w1, sg_model, 100, sg_vocab)

#### Calculating the cosine similarity using the SciPy implementation

In [None]:
# A bare `import scipy` is not guaranteed to load the `spatial` subpackage,
# so import the distance module explicitly before using it.
import scipy.spatial.distance

# cosine() returns the cosine *distance*, so the similarity is 1 - distance.
sen1_sen2_similarity = 1 - scipy.spatial.distance.cosine(w1_feat, w_feat)
sen1_sen2_similarity

#### ---------------------------- Rough Work ------------------------------------------

In [None]:
# def get_cosine(vec1, vec2):
#      intersection = set(vec1.keys()) & set(vec2.keys())
#      numerator = sum([vec1[x] * vec2[x] for x in intersection])

#      sum1 = sum([vec1[x]**2 for x in vec1.keys()])
#      sum2 = sum([vec2[x]**2 for x in vec2.keys()])
#      denominator = math.sqrt(sum1) * math.sqrt(sum2)

#      if not denominator:
#         return 0.0
#      else:
#         return float(numerator) / denominator

In [None]:
 # #get average vector for sentence 1
# sentence_1 = "this is sentence number one"
# sentence_1_avg_vector = avg_feature_vector(sentence_1.split(), model=word2vec_model, num_features=300)

# #get average vector for sentence 2
# sentence_2 = "this is sentence number two"
# sentence_2_avg_vector = avg_feature_vector(sentence_2.split(), model=word2vec_model, num_features=300)



In [None]:
# the following code was picked up from the web (I don't remember the link)
# def collection_stats():
#     # List of documents
#     documents = reuters.fileids()
#     print(str(len(documents)) + " documents")
 
#     train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
#     print(str(len(train_docs)) + " total train documents")
 
#     test_docs = list(filter(lambda doc: doc.startswith("test"), documents))
#     print(str(len(test_docs)) + " total test documents")
 
#     # List of categories
#     categories = reuters.categories()
#     print(str(len(categories)) + " categories")
 
#     # Documents in a category
#     category_docs = reuters.fileids("acq")
 
#     # Words for a document
#     document_id = category_docs[0]
#     document_words = reuters.words(category_docs[0])
#     print(document_words);  
 
#     # Raw document
#     print(reuters.raw(document_id))

In [None]:
# # breaking paragraph into sentences
# # 1st implementation using nltk
# print(tokenize.sent_tokenize(reuters.raw(train_doc[0]))) #document is broken into sentences
# print(tokenize.sent_tokenize(reuters.raw(train_doc[0]))[0].replace('\n ','').split()) # sentences are further broken
# # into list of words

In [None]:
# # 2nd implementation, using itertools
# # taken from 'http://stackoverflow.com/questions/9474395/how-to-break-up-a-paragraph-by-sentences-in-python'
# def get_first_n_sentence(text, n):
#     endsentence = ".?!"
#     sentences = itertools.groupby(text, lambda x: any(x.endswith(punct) for punct in endsentence))
#     for number,(truth, sentence) in enumerate(sentences):
#         if truth:
#             first_n_sentences = previous+''.join(sentence).replace('\n',' ')
#         previous = ''.join(sentence)
#         if number>=2*n: break #

#     return first_n_sentences

In [None]:
# checking
# print(get_first_n_sentence(reuters.raw(train_doc[0]), 1).replace('\n ',''))
# print(get_first_n_sentence(reuters.raw(train_doc[0]), 1).replace('\n ','').split())

### Creating the input for word2vec

In [None]:
# # checking
# # printing a list of sentences for a document
# print(len(train_doc)) # total training documents
# print(len(tokenize.sent_tokenize(reuters.raw(train_doc[0])))) # number of sentences in one document
# for num in range(len(tokenize.sent_tokenize(reuters.raw(train_doc[0])))):
#     print(tokenize.sent_tokenize(reuters.raw(train_doc[0]))[num].replace('\n ','').split()) # each sentence broken into
#     # list of words

In [None]:
# # Preprocessing
# import pandas as pd
# # stopwords list, we can use any available stopwords list 
# swords = pd.read_csv('english.stop.txt', sep='\n', header=None)
# stopwords = set(list(swords[0]))
# l = []
# for word in tokenize.sent_tokenize(reuters.raw(train_doc[0]))[2].replace('\n ','').split():
#     word = re.sub('[,.]','',word)
#     word = re.sub('[\d]+','NUM',word) # converting all digits to num
#     if word not in stopwords:  # removing stopwords
#         l.append(word)  
# print(tokenize.sent_tokenize(reuters.raw(train_doc[0]))[2].replace('\n ','').split())
# print(l)            

In [None]:
#len(train_doc)

In [None]:
# # converting all documents in a list of sentences
# list_allsentences = []
# for doc_id in range(len(train_doc)):
#     sentence_indoc = tokenize.sent_tokenize(reuters.raw(train_doc[doc_id])) # spliting each document in sentences
#     #print(len(sentence_indoc))
#     for sent in sentence_indoc:
#         list_ofwords = sent.replace('\n ','').split()
#         '''code here if preprocessing is required example'''
#         # -->
#         l=[]
#         for word in list_ofwords:
#             word = word.lower()
#             word = re.sub('[,.]','',word)
#             word = re.sub('[\d]+','NUM',word) # converting all digits to num
            
#         #    if word not in stopwords:  # removing stopwords
#             l.append(word)
#         list_allsentences.append(l)
#         #list_allsentences.append(list_ofwords)
# print(len(list_allsentences))

In [None]:
#import sys
#sys.getsizeof(list_allsentences)

In [None]:
#model =  gensim.models.Word2Vec(list_allsentences, min_count=2, size=100)
# size: is the number of topic/ neurons in the training layer
# window: tells the number of words to look in the context
# alpha: is the initial learning rate

In [None]:
#model.most_similar('week', topn=5)

In [None]:
# to estimate memory requirements
# model.estimate_memory()

In [None]:
# model_sg = gensim.models.Word2Vec(list_allsentences, min_count=2, size=100, sg=1)

In [None]:
#model['week']+model['ended']

In [None]:
# # saving the model
# model.save('/home/prateek/uva/CS-6501 Text Mining/Workspace/Word2Vec/mymodel1')
# model_sg.save('/home/prateek/uva/CS-6501 Text Mining/Workspace/Word2Vec/mymodel1_sg')
# # loading the model
# #new_model = gensim.models.Word2Vec.load('/home/prateek/uva/CS-6501 Text Mining/Workspace/Word2Vec/mymodel1')

#### Calculating different similarities

In [None]:
# # to calculate the cosine similarity between 2 terms [-1,1]
# model.similarity('week','ended')

# # to find the most similar terms, based on cosine similarity
# model.most_similar('week')
# # also,
# model.most_similar(positive=['woman', 'king'], negative=['man']) #eg.

# # find n most similar words
# model.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london']) #eg.
# # gives multiple similar words, Find the top-N most similar words, using the multiplicative combination objective 

# # Compute cosine similarity between two sets of words.
# model.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant']) #eg.

In [None]:
# # to get the vector representation of a word; the vectors are numpy arrays
# model['term']
# # to access all terms in the vocabulary
# vocab = list(model.vocab.keys())
# vocab[:10] # first 10 words

In [None]:
# # to find odd one out
# model.doesnt_match("breakfast cereal dinner lunch".split()) #eg

In [None]:
# # finding multiword phrases like "new york"
# bigram_transformer = gensim.models.Phrases(sentences)
# model = Word2Vec(bigram_transformer[sentences], size=100, ...) 

# # add sentences 
# bigram_transformer.add_vocab(new_sentence_stream)
# # or
# # for trigrams
# trigram = Phrases(bigram[sentence_stream])
# sent = [u'the', u'new', u'york', u'times', u'is', u'a', u'newspaper']
# print(trigram[bigram[sent]])
# [u'the', u'new_york_times', u'is', u'a', u'newspaper']

In [None]:
# print(tokenize.sent_tokenize(reuters.raw(test_doc[0]))[0])