In [11]:
import nltk
from nltk.stem import PorterStemmer
from textblob import TextBlob
from nltk.corpus import wordnet as w,stopwords 
import json
import numpy as np
import os
import re
import gensim
import enchant
import random
from nltk.tokenize import RegexpTokenizer
from collections import Counter

In [5]:
# Shared NLP helpers used by the cells below.
dictionary = enchant.Dict("en_US")            # spell-check dictionary for the en_US locale
tokenizer = RegexpTokenizer('[a-zA-Z]+')      # a token is any run of ASCII letters
ps = PorterStemmer()                          # Porter stemmer instance
# NLTK's English stopword list plus two extra words that should be ignored
all_stopwords = stopwords.words('english') + ['ism', 'amazon']

In [6]:
# Load the previously saved Word2Vec model from disk (path is relative to the working directory).
model=gensim.models.Word2Vec.load('../Word2Vec_model/model')

In [6]:
# Spot-check the embeddings: list the words the model ranks closest to 'salary', with their scores.
model.most_similar('salary')

[(u'compensation', 0.8923154473304749),
 (u'hike', 0.7944278717041016),
 (u'pay', 0.783841609954834),
 (u'wages', 0.7133928537368774),
 (u'hikes', 0.7015707492828369),
 (u'salaries', 0.6739672422409058),
 (u'wage', 0.6642293930053711),
 (u'remuneration', 0.6438825726509094),
 (u'increment', 0.6369696855545044),
 (u'raises', 0.6115041971206665)]

In [75]:
# Expand each aspect's word list with the Word2Vec neighbours of every word in it.
# Requires `model` and `aspects` (the latter is built by the "fetching aspects" cell),
# so run those cells first.
aspect_pool = []
for aspect_name in aspects:
    expanded = set(aspects[aspect_name])
    for aspect in aspects[aspect_name]:
        try:
            neighbours = model.most_similar(aspect)
        except KeyError:
            # The word is not in the Word2Vec vocabulary: keep it, but it adds no neighbours.
            continue
        expanded.update(word for word, similarity in neighbours)
    aspect_pool = list(expanded)          # de-duplicated pool for this aspect
    #with open('../Aspects/'+aspect_name,'w') as f:
    #    json.dump(aspect_pool,f)

In [7]:
# Fetch aspects: load every JSON file found under ../Aspects/ into `aspects`,
# keyed by file name. `files` lists the loaded aspect names; later cells iterate
# over it, so it is built explicitly instead of relying on the loop variable
# that os.walk leaves behind.
aspects = {}
files = []
for root, dirs, names in os.walk('../Aspects/'):
    for name in names:
        # Join with the directory actually being walked, not a fixed prefix.
        with open(os.path.join(root, name)) as f:
            aspects[name] = json.load(f)
        files.append(name)

In [8]:
# Fetch all reviews: each folder reported by os.walk is expected to contain a
# JSON file named review.txt directly under ../Reviews/.
# NOTE(review): presumably each review.txt holds a list of lists of review
# strings (np.concatenate below needs sequences to join) - confirm against the data.
data = []
for root, dirs, files_review in os.walk('../Reviews/'):
    for name in dirs:                             # all review folders
        with open('../Reviews/' + name + '/review.txt') as f:
            data.extend(json.load(f))             # the with-block closes the file
# Join the per-entry sequences into one long 1-D array of reviews. Passing the
# list straight to np.concatenate avoids building an intermediate array from
# nested lists of unequal length.
dataset = np.concatenate(data)

In [9]:
# Total reviews: shape of the flattened review array (its single entry is the review count).
dataset.shape

(86989,)

In [21]:
def analyse_aspect(tokens, aspect_map=None, stemmer=None):
    """Count, for every aspect, how many tokens match one of its aspect words.

    Both the tokens and the aspect words are stemmed before they are compared,
    so inflected forms match the same aspect word. A token that matches several
    aspect words of one aspect (for example two words sharing a stem) is
    counted once per matching word.

    Parameters
    ----------
    tokens : iterable of str
        Words of one sentence.
    aspect_map : dict, optional
        Maps aspect name -> list of aspect words. Defaults to the global `aspects`.
    stemmer : object with a ``stem(word)`` method, optional
        Defaults to the global Porter stemmer `ps`.

    Returns
    -------
    dict
        Aspect name -> number of (token, aspect word) matches; 0 when none.
    """
    if aspect_map is None:
        aspect_map = aspects
    if stemmer is None:
        stemmer = ps

    # Stem every token exactly once instead of once per aspect word.
    token_stems = [stemmer.stem(token) for token in tokens]

    aspect_flags = {}
    for aspect_name, aspect_words in aspect_map.items():
        # Multiplicity of each stem among this aspect's words.
        stem_counts = Counter(stemmer.stem(word) for word in aspect_words)
        aspect_flags[aspect_name] = sum(stem_counts[stem] for stem in token_stems)
    return aspect_flags

In [22]:
def _append_sentence(path, sentence):
    """Append `sentence` to the JSON list stored in the file at `path`."""
    with open(path, 'r') as file_ptr:
        stored = json.load(file_ptr)
    stored.append(sentence)
    with open(path, 'w') as file_ptr:
        json.dump(stored, file_ptr)


def analyse_sentiment(aspect_flags, sentence, base_dir='../Dataset_new/'):
    """Classify `sentence` by TextBlob polarity and file it under each matched aspect.

    Polarity buckets (file names):
        'neg'      -1   <= polarity <= -0.5
        'sli_neg'  -0.5 <  polarity <= -0.1
        'sli_pos'   0.1 <  polarity <=  0.5
        'pos'       0.5 <  polarity <=  1
        'neutral'  everything else

    The sentence is appended to ``<base_dir>/<aspect>/<bucket>`` for every
    aspect whose count in `aspect_flags` is positive. When no aspect matched,
    it goes to the general category ``<base_dir>/Aspect7/<bucket>``.

    Parameters
    ----------
    aspect_flags : dict
        Aspect name -> match count, as returned by `analyse_aspect`.
    sentence : str
        The sentence to classify and store.
    base_dir : str, optional
        Root directory of the per-aspect JSON files.

    Raises
    ------
    IOError / ValueError
        If a target file is missing or does not contain valid JSON; the files
        must already exist and hold a JSON list.

    NOTE(review): clear_all() and test_all() work on '../Dataset/', whereas the
    default here is '../Dataset_new/' - confirm which directory is intended.
    """
    polarity = TextBlob(sentence).sentiment[0]
    if -1 <= polarity <= -0.5:
        file_name = 'neg'
    elif -0.5 < polarity <= -0.1:
        file_name = 'sli_neg'
    elif 0.1 < polarity <= 0.5:
        file_name = 'sli_pos'
    elif 0.5 < polarity <= 1:
        file_name = 'pos'
    else:
        file_name = 'neutral'

    matched = [name for name, hits in aspect_flags.items() if hits > 0]
    # Fall back to the general category when the sentence hit no aspect.
    for aspect_name in (matched or ['Aspect7']):
        _append_sentence(os.path.join(base_dir, aspect_name, file_name), sentence)

In [32]:
def preprocess(sentence):
    """Return `sentence` reduced to its non-stopword English words.

    The sentence is split with the global `tokenizer`; words listed in
    `all_stopwords` or rejected by the spell-check `dictionary` are dropped.
    The surviving words are joined with single spaces and returned as one
    string (empty when nothing survives).
    """
    words = tokenizer.tokenize(sentence)
    kept = [word for word in words
            if word not in all_stopwords      # removing stopwords
            and dictionary.check(word)]       # remove non-english words
    return ' '.join(kept)

In [8]:
# Demonstration only: push one hand-written sentence through each pipeline stage.
sample = preprocess('this is a  good company having great infrastructure')
print('Final Sentence : ' + str(sample))
print(' ')
sample_aspect_flag = analyse_aspect(sample.split())
print('Aspects : ' + str(sample_aspect_flag))
print(' ')
sample_tb = TextBlob(sample)
print('Sentiment Value : ' + str(sample_tb.sentiment))

sentence : this is a  good company having great infrastructure
tokenizer : ['this', 'is', 'a', 'good', 'company', 'having', 'great', 'infrastructure']
stopwords : ['good', 'company', 'great', 'infrastructure']
removed all non-english words : ['good', u'compani', 'great', u'infrastructur']
Final Sentence : good compani great infrastructur
 
Aspects : {'Aspect6': 0, 'Aspect5': 1, 'Aspect4': 0, 'Aspect3': 0, 'Aspect2': 0, 'Aspect1': 0}
 
Sentiment Value : Sentiment(polarity=0.75, subjectivity=0.675)


In [14]:
def tokenize_sent(sentence):
    """Process one sentence fragment: clean it, tag its aspects, store it by sentiment.

    Fragments that keep fewer than two words after preprocessing are ignored.
    """
    cleaned = preprocess(sentence)
    words = cleaned.split()
    if len(words) <= 1:
        return
    analyse_sentiment(analyse_aspect(words), cleaned)

In [15]:
def tokenize_review(review):
    """Split one review into fragments and process each with `tokenize_sent`.

    The review is lower-cased and every '.' is turned into ','; the text is
    then split on ' and ', ' but ' or ', '.
    """
    normalised = review.lower().replace('.', ',')
    for fragment in re.split(' and | but |, ', normalised):   # splitting criteria
        tokenize_sent(fragment)

In [34]:
def tokenize_all(dataset):
    """Process every review in `dataset`, printing a 1-based running count as progress."""
    for position, review in enumerate(dataset, 1):
        print(position)
        tokenize_review(review)

In [17]:
# Set the interpreter-wide default string encoding to UTF-8.
# Python 2 only: `reload` is used as a builtin here. The statement order matters.
import sys
sys.getdefaultencoding()        # result is neither stored nor displayed (not the cell's last expression)
reload(sys)                     # NOTE(review): presumably needed so that sys.setdefaultencoding is available again - confirm
sys.setdefaultencoding('utf8')

In [36]:
# Shuffle the reviews in place before they are processed in slices.
# The seed is fixed so that dataset[:25000], dataset[25000:50000], ... select
# the same reviews after a kernel restart. Run this cell exactly once per
# kernel: running it again shuffles the already shuffled array.
random.seed(42)
random.shuffle(dataset)

In [None]:
# First batch: the first 25000 entries of the shuffled review array.
tokenize_all(dataset[:25000])    #executed as batches

In [None]:
# Second batch: entries 25000-49999 of the shuffled review array.
# NOTE(review): no visible cell processes dataset[50000:], although dataset.shape
# reported 86989 reviews - confirm whether the remainder is left out on purpose.
tokenize_all(dataset[25000:50000])    #executed as batches

In [30]:
#clear all files
def clear_all(base_dir='../Dataset/', aspect_names=None):
    """Reset every sentiment file of every aspect to an empty JSON list.

    For each aspect name (plus the general category 'Aspect7') the files
    'pos', 'neg', 'sli_pos', 'sli_neg' and 'neutral' inside
    ``<base_dir>/<aspect>/`` are overwritten with ``[]``. The aspect
    directories must already exist.

    Parameters
    ----------
    base_dir : str, optional
        Root directory holding one sub-directory per aspect.
    aspect_names : iterable of str, optional
        Aspect names to reset; defaults to the global `files`.
    """
    if aspect_names is None:
        aspect_names = files
    for file_name in list(aspect_names) + ['Aspect7']:
        for label in ('pos', 'neg', 'sli_pos', 'sli_neg', 'neutral'):
            with open(os.path.join(base_dir, file_name, label), 'w') as f:
                json.dump([], f)

In [31]:
#clear_all()

In [7]:
def test_all():                                     #to test whether all files are in json format 
    temp_files=list(files)
    temp_files.append('Aspect7')
    for file_name in temp_files:
        print file_name
        with open('../Dataset/'+file_name+'/pos') as f:
            json.load(f)
        print 'pos  success'    
        with open('../Dataset/'+file_name+'/neg') as f:
            json.load(f)
        print 'neg  success'    
        with open('../Dataset/'+file_name+'/sli_pos') as f:
            json.load(f)
        print 'sli_pos  success'    
        with open('../Dataset/'+file_name+'/sli_neg') as f:
            json.load(f)
        print 'sli_neg  success'    
        with open('../Dataset/'+file_name+'/neutral') as f:
            json.load(f)   
        print 'neutral  success'    

In [57]:
#test_all()
