In [18]:
import nltk
from nltk.stem import PorterStemmer
from textblob import TextBlob
from nltk.corpus import wordnet as w,stopwords 
import json
import numpy as np
import os
import re
import gensim
from collections import Counter
from nltk.corpus import stopwords
import enchant
import pickle
from nltk.tokenize import RegexpTokenizer

In [19]:
# Load the saved gensim Word2Vec model from ../Word2Vec_model/model.
# NOTE(review): `model` is not referenced by any other cell visible in this
# notebook excerpt -- confirm it is still needed before keeping this load.
model=gensim.models.Word2Vec.load('../Word2Vec_model/model')

In [20]:
# Shared text-processing helpers: spell-check dictionary, tokenizer, stemmer, stopwords.
dictionary = enchant.Dict("en_US")                  # used to reject non-English words
tokenizer = RegexpTokenizer('[a-zA-Z]+')            # keeps runs of ASCII letters only
ps = PorterStemmer()
all_stopwords = list(stopwords.words('english'))    # private copy of NLTK's English stopword list

In [21]:
def preprocess(data):
    """Turn raw sentence fragments into space-joined strings of stemmed words.

    Every fragment is tokenised into alphabetic words; stopwords and words the
    spell-check dictionary rejects are discarded; the remaining words are
    Porter-stemmed.  A fragment is kept only if more than one word survives.

    data    -- iterable of sentence strings
    returns -- list of cleaned sentences (stems joined by single spaces)
    """
    cleaned = []
    for sentence in data:
        stems = []
        for word in tokenizer.tokenize(sentence):
            if word in all_stopwords:               # stopword: skip it
                continue
            if dictionary.check(word) == True:      # keep only dictionary (English) words
                stems.append(ps.stem(word))
        if len(stems) > 1:                          # single-word / empty leftovers are dropped
            cleaned.append(' '.join(stems))
    return cleaned

In [22]:
#loading and cleaning dataset
def preprocessing(company_name):
    """Load a company's reviews, split them into clauses, clean and save them.

    Reads ``TestReview/<company_name>/review.txt`` (JSON: a list of lists of
    review strings), lower-cases every review, splits it into clause-sized
    fragments, passes the fragments through ``preprocess`` and writes the
    cleaned list to ``Dataset/<company_name>.txt`` as JSON.

    company_name -- name of the sub-directory under ``TestReview/``
    returns      -- the list of cleaned sentences produced by ``preprocess``
    """
    # Clause boundaries: the conjunctions " and " / " but ", a literal ".and ",
    # the end of a sentence (period followed by whitespace) and semicolons with
    # any surrounding whitespace.
    # The period is escaped on purpose: an unescaped "." matches any character
    # and would cut words such as "thousand " or "understand " in two.
    clause_boundary=r' and | but |\.and |\.\s+|\s*;\s*'

    with open('TestReview/'+company_name+'/review.txt') as f:
        nested_reviews=json.load(f)

    # Flatten the list of lists in plain Python; unlike an array conversion this
    # also copes with inner lists of different lengths and with an empty file.
    reviews=[review for group in nested_reviews for review in group]

    sentences=[]
    for review in reviews:
        sentences.extend(re.split(clause_boundary,review.lower()))

    dataset=preprocess(sentences)
    with open('Dataset/'+company_name+'.txt','w') as f:
        json.dump(dataset,f)
    return dataset

In [23]:
# Build the cleaned Samsung dataset so that the following cell can preview it.
# (preprocessing() also rewrites Dataset/Samsung.txt as a side effect.)
dataset=preprocessing('Samsung')

In [24]:
# Preview of the final dataset: the first ten cleaned sentences.
print(dataset[:10])

[u'long work hour', u'get littl time famili process orient', u'end result orient competit', u'want result cost', u'mostli hard', u'buyer bore', u'even need spend money like everi job bore begin', u'becom habit', u'import process slow', u'good compani']


In [25]:
# Load the vocabulary for the naive Bayes unigram features.
# extract_features_unigram_nb iterates over `vocab` and emits one
# 'contains(<word>)' feature per entry.
with open('../NaiveBayes/vocab_unigram') as f:
    vocab=json.load(f)
def extract_features_unigram_nb(document):
    """Build the bag-of-words feature dict for one sentence.

    document -- list of the words of a sentence
    returns  -- dict with one 'contains(<word>)' key per entry of ``vocab``,
                whose value tells whether that word occurs in ``document``
    """
    return dict(('contains(%s)' % word, word in document) for word in vocab)

In [26]:
#fetching aspects
# Each file directly inside ../Aspects/ holds the JSON word list of one aspect;
# the file name is used as the aspect's key.
#
# Only the top level of the directory is read.  Walking the whole tree would
# (a) try to open files of sub-directories under the wrong path and (b) leave
# `files` -- which analyse_aspect() and result() iterate over -- bound to the
# file list of whichever directory happened to be visited last.  The default
# keeps `files` defined (as an empty list) when the directory is missing.
root, dirs, files = next(os.walk('../Aspects/'), ('../Aspects/', [], []))
aspects={}
for name in files:
    with open(os.path.join(root, name)) as f:
        aspects[name]=json.load(f)
                    
def analyse_aspect(tokens):
    """Find, by stem matching, which aspects the tokens of a sentence refer to.

    Each token is stemmed and compared with the stem of every word of every
    aspect listed in ``files``.  An aspect name is recorded once for each
    token that matches it, so the same name can occur several times in the
    result.  When nothing matches, the sentence is assigned to 'Aspect7'.
    The tokens and the resulting list are printed.

    tokens  -- list of the words of a sentence
    returns -- list of aspect names (never empty)
    """
    print('Tokens')
    print(tokens)
    matched = []
    for token in tokens:
        token_stem = ps.stem(token)
        for aspect_name in files:
            # any() stops at the first matching word of this aspect
            if any(ps.stem(aspect_word) == token_stem
                   for aspect_word in aspects[aspect_name]):
                matched.append(aspect_name)
    if not matched:
        matched.append('Aspect7')
    print('List Of Aspects ')
    print(matched)
    print(' ')
    return matched

In [27]:
#aspect classification using naive bayes
_ASPECT_CLASSIFIER_PATH='../AspectNaiveBayesClassifier/naive_bayes_unigram_model'
_aspect_classifier_cache={}      # path -> unpickled classifier; re-run this cell to force a reload

def analyse_aspect_naivebayes(words):
    """Classify the aspect of one sentence with the pickled naive Bayes model.

    The classifier is unpickled on the first call and reused afterwards, so it
    is no longer re-read from disk for every sentence.

    words   -- list of the words of a sentence
    returns -- one-element list holding the predicted aspect label
    """
    if _ASPECT_CLASSIFIER_PATH not in _aspect_classifier_cache:
        # Pickles are binary data: open in 'rb' so the bytes are not altered by
        # text-mode newline/encoding handling.
        # NOTE: pickle.load can execute arbitrary code -- only load model files
        # produced by this project.
        with open(_ASPECT_CLASSIFIER_PATH,'rb') as f:
            _aspect_classifier_cache[_ASPECT_CLASSIFIER_PATH]=pickle.load(f)
    classifier=_aspect_classifier_cache[_ASPECT_CLASSIFIER_PATH]
    # NOTE(review): `extract_features` is not defined in the cells visible here;
    # it must exist (built on the aspect classifier's own vocabulary) before
    # this function is called, otherwise this line raises NameError.
    return [classifier.classify(extract_features(words))]

In [54]:
# Cleaned Samsung dataset consumed by the analysis cells that follow.
# NOTE(review): this repeats the preprocessing('Samsung') call made in an
# earlier cell; one of the two calls is probably redundant.
dataset=preprocessing('Samsung')

In [56]:
#naiveBayes
def _signed_polarity(label,probability):
    """Convert the winning sentiment label and its probability into a signed score."""
    if label=='neutral':
        return 0
    if label=='pos':
        return +0.5+(probability/2)
    if label=='neg':
        return -0.5-(probability/2)
    if label=='sli_pos':
        return probability/2
    if label=='sli_neg':
        return -(probability/2)
    return probability           # unrecognised label: the raw probability is kept


def naiveBayes_unigram(dataset,aspect_analysis):        #first find the polarity and then its aspect
    """Score every sentence with the naive Bayes unigram model and aggregate per aspect.

    For each sentence the most probable sentiment label is converted into a
    signed polarity.  Non-neutral sentences are then assigned to aspects and
    their polarity is added to every aspect returned for them.  Progress is
    printed for each sentence.

    dataset         -- iterable of cleaned sentences (space-separated words)
    aspect_analysis -- 'lexical' to use analyse_aspect(); any other value uses
                       analyse_aspect_naivebayes()
    returns         -- (aspect_polarity, aspect_count): two Counters keyed by
                       aspect name, holding the summed polarity and the number
                       of contributions
    """
    #loading naivebayes classifier
    # Pickles are binary data, hence 'rb'.  NOTE: pickle.load can execute
    # arbitrary code -- only load model files produced by this project.
    with open('../NaiveBayes/naive_bayes_unigram_model','rb') as f:
        classifier=pickle.load(f)
    aspect_polarity=Counter()
    aspect_count=Counter()
    for sentence in dataset:
        print('sentence->  '+sentence)
        words=sentence.split()
        # Build the feature dict once and reuse it for both classifier calls.
        # NOTE(review): assumes the classifier does not modify the dict it is
        # given (true for NLTK's NaiveBayesClassifier) -- confirm.
        features=extract_features_unigram_nb(words)
        print('Polarity= '+classifier.classify(features))
        dist=classifier.prob_classify(features)
        best_probability=0
        best_label=0
        for label in dist.samples():
            probability=dist.prob(label)
            print("  %s: %f" % (label, probability))
            if best_probability<probability:     # strict '<': the first maximum wins on ties
                best_probability=probability
                best_label=label
        polarity=_signed_polarity(best_label,best_probability)
        print('final polarity='+ str(polarity))
        print(' ')
        if best_label!='neutral':
            if aspect_analysis == 'lexical':
                aspects_in_sentence=analyse_aspect(words)
            else:
                aspects_in_sentence=analyse_aspect_naivebayes(words)
            for asp in aspects_in_sentence:
                aspect_polarity[asp]+=polarity
                aspect_count[asp]+=1
            print('aspect_polarity ='+str(aspect_polarity))
            print(' ')
            print('aspect_count ='+str(aspect_count))
        print('---------------------------')
    return aspect_polarity,aspect_count

In [60]:
# displaying the Result
def result(aspect_polarity,aspect_count):
    """Print the mean polarity of every aspect, of the general bucket, and overall.

    For each aspect file in ``files`` the summed polarity is divided by the
    number of contributions (0 when there were none) and printed under the
    aspect's display name (the first word of its word list).  'Aspect7' is
    reported as "General".  The overall polarity is the mean of all these
    per-aspect values, including the zeros.

    aspect_polarity -- Counter: aspect name -> summed polarity
    aspect_count    -- Counter: aspect name -> number of contributions
    returns         -- None (the report is printed)
    """
    total_polarity=0
    for aspect_name in files:
        if aspect_count[aspect_name]!=0:
            polarity=aspect_polarity[aspect_name]/aspect_count[aspect_name]
        else:
            polarity=0
        total_polarity=total_polarity+polarity
        print(aspects[aspect_name][0]+':  '+str(polarity))
    # The general bucket gets its own variable with an explicit zero default;
    # otherwise a sentence-free 'Aspect7' would silently reuse the value of the
    # last aspect printed above (and count it twice in the total).
    if aspect_count['Aspect7']!=0:
        general_polarity=aspect_polarity['Aspect7']/aspect_count['Aspect7']
    else:
        general_polarity=0
    print('General:  '+str(general_polarity))
    total_polarity=total_polarity+general_polarity
    print(' ')
    # Average over every aspect file plus the general bucket (7 with six aspect files).
    total_polarity=total_polarity/(len(files)+1)
    print('Overall Polarity  '+str(total_polarity))

In [None]:
# Test run: score the Samsung dataset, assigning aspects with the lexical
# (word-list) matcher analyse_aspect() rather than the naive Bayes one.
aspect_polarity,aspect_count=naiveBayes_unigram(dataset,'lexical')

sentence->  long work hour
Polarity= sli_neg
  neg: 0.007710
  sli_pos: 0.116827
  sli_neg: 0.528854
  neutral: 0.302411
  pos: 0.044198
final polarity=-0.26442699595
 
Tokens
[u'long', u'work', u'hour']
List Of Aspects 
['Aspect4']
 
aspect_polarity =Counter({'Aspect4': -0.26442699595034647})
 
aspect_count =Counter({'Aspect4': 1})
---------------------------
sentence->  get littl time famili process orient
Polarity= neutral
  neg: 0.002052
  sli_pos: 0.211779
  sli_neg: 0.305912
  neutral: 0.480105
  pos: 0.000151
final polarity=0
 
---------------------------
sentence->  end result orient competit
Polarity= neutral
  neg: 0.001754
  sli_pos: 0.421355
  sli_neg: 0.098114
  neutral: 0.421403
  pos: 0.057374
final polarity=0
 
---------------------------
sentence->  want result cost
Polarity= sli_neg
  neg: 0.069936
  sli_pos: 0.226345
  sli_neg: 0.381004
  neutral: 0.236356
  pos: 0.086359
final polarity=-0.19050206084
 
Tokens
[u'want', u'result', u'cost']
List Of Aspects 
['Aspect7'

In [61]:
# Per-aspect and overall polarity report for the naive Bayes unigram run.
print('Samsung')
result(aspect_polarity, aspect_count)

Samsung
learning:  0
life:  -0.26442699595
infrastructure:  0
management:  0
workplace:  0
salary:  0
General:  0
 
Overall Polarity  -0.0377752851358


In [None]:
#naivebayes unigram 