In [73]:
import nltk
from nltk.stem import PorterStemmer
from textblob import TextBlob
from nltk.corpus import wordnet as w,stopwords 
import json
import numpy as np
import os
import re
import gensim
from collections import Counter
from nltk.corpus import stopwords
import enchant
import pickle
from nltk.tokenize import RegexpTokenizer

In [3]:
#loading word2vec model
model=gensim.models.Word2Vec.load('../Word2Vec_model/model')

In [98]:
#initialising dictionary, tokenizer, stemmer
dictionary = enchant.Dict("en_US")
tokenizer=RegexpTokenizer('[a-zA-Z]+')
ps=PorterStemmer()
all_stopwords=stopwords.words('english')+[]

In [36]:
def preprocess(data):
    dataset=[]
    for sentence in data:
        #print 'sentence'
        #print sentence
        temp=tokenizer.tokenize(sentence)
        #print 'tokenizer'
        #print temp
        remove_stopword=[word for word in temp if word not in all_stopwords]    # removing stopwords
        #print 'stopwords'
        #print remove_stopword
        #only_english=[ps.stem(word) for word in remove_stopword if dictionary.check(word)== True]  # remove non-english words
        only_english=[ps.stem(word) for word in remove_stopword if dictionary.check(word)== True]  # remove non-english words
        #print 'removed all non-english words'
        #print only_english
        if(len(only_english)>1):                  #appends only if length of list >1
            dataset.append(' '.join(only_english))
    return dataset    

In [89]:
#loading and cleaning dataset
def preprocessing(company_name):
    dataset=[]
    sentences=[]
    with open('TestReview/'+company_name+'/review.txt') as f:
        reviews=json.load(f)
    reviews=np.asarray(reviews)
    reviews=np.concatenate(reviews)        #converting it to single list from list of lists
    #print 'Reviews'
    #print reviews[:10]
    for review in reviews:
        review=review.lower()             #converting reviews into lowercase
        review=review.replace('. ',';')                            
        sentences.extend(re.split(' and | but |; | ; |.and ',review))       #splitting criteria
    #print 'sentences'
    #print sentences[:10]
    dataset=preprocess(sentences)
    with open('Dataset/'+company_name+'.txt','w') as f:
        json.dump(dataset,f)
    return dataset

In [66]:
#only for printing
dataset=preprocessing('Samsung')

Reviews
[ u' Long working hours and will get very little time for family. Not very process oriented but only end result oriented. They are very competitive and want results at any cost '
 u" Mostly it's not so hard but it's too. Boring..And if any buyer than it will be a more  boring ..And even we need to spend our own money ...While just like every job it so. Boring at the beginning..And it become habit...And most important the processing is too slow....... "
 u' Good company but turning a little biased towards employees. More focus on hiring IIT grads and also creating huge salary gap between employees. HR policies have improved other than salary normalization. '
 u' i like electronics work technician then communication skills project work like an embedded and pcb designing networking laptop technician i am fresher searching for job '
 u' the work there was a lot of fun and and environment really fast paced . we learned a lot of sales tactics in the environment. the target were easy 

In [67]:
#final dataset 
print dataset[:10]

[u'long work hour', u'get littl time famili process orient', u'end result orient competit', u'want result cost', u'mostli hard', u'buyer bore', u'even need spend money like everi job bore begin', u'becom habit', u'import process slow', u'good compani']


In [77]:
#loading vocab
with open('../NaiveBayes/vocab_unigram') as f:
    vocab=json.load(f)
def extract_features(document):       #features are bag of words. document is a list of words of a sentence 
    features = {}
    for word in vocab:
        features['contains(%s)' % word] = (word in document)
    return features     

In [189]:
#fetching aspects
aspects={}
for root, dirs, files in os.walk('../Aspects/'):
        for name in files:
            with open('../Aspects/'+name) as f:
                aspects[name]=json.load(f) 
                    
def analyse_aspect(tokens,polarity):            # analyses to which aspect a set of tokens of a sentence belong to
    count=0
    print 'Tokens'
    print tokens
    aspects_in_sentence=[]
    for token in tokens:
        temp_token=ps.stem(token)
        for aspect_name in files:                               #each aspect
            for aspect in aspects[aspect_name]:                 #each word in an aspect
                temp_aspect=ps.stem(aspect)
                if(temp_token == temp_aspect):
                    aspects_in_sentence.append(aspect_name)
                    count=1
                    break
    if(count==0):
        aspects_in_sentence.append('Aspect7')
    print 'List Of Aspects '
    print aspects_in_sentence
    print ' '
    return   aspects_in_sentence  

In [190]:
#naiveBayes
def naiveBayes_unigram(company_name):
    #loading naivebayes classifier
    with open('../NaiveBayes/naive_bayes_unigram_model') as f:
        classifier=pickle.load(f)
    dataset=preprocessing(company_name)
    aspect_polarity=Counter()
    aspect_count=Counter()
    for sentence in dataset[:1000]:
        print 'sentence->  '+sentence
        print 'Polarity= '+classifier.classify(extract_features(sentence.split()))
        dist = classifier.prob_classify(extract_features(sentence.split()))
        polarity=0
        flag=0
        for label in dist.samples():
            print("  %s: %f" % (label, dist.prob(label)))
            if(polarity<dist.prob(label)):
                polarity=dist.prob(label)
                flag=label
        if(flag=='neutral'):
            polarity=0
        elif(flag=='pos'):
            polarity=+0.5+(polarity/2)
        elif(flag=='neg'):
            polarity=-0.5-(polarity/2)    
        elif(flag=='sli_pos'):
            polarity=polarity/2
        elif(flag=='sli_neg'):
            polarity=-(polarity/2)   
        print 'final polarity='+ str(polarity)
        print ' '
        if('neutral'!= flag):
            aspects_in_sentence=analyse_aspect(sentence.split(),polarity)
            for asp in aspects_in_sentence:
                aspect_polarity[asp]=aspect_polarity[asp]+polarity
                aspect_count[asp]=aspect_count[asp]+1
            print 'aspect_polarity'
            print aspect_polarity
            print ' '
            print 'aspect_count'
            print aspect_count
        print '---------------------------'

        return aspect_polarity,aspect_count

In [193]:
#for testing
aspect_polarity,aspect_count=naiveBayes_unigram('Samsung')

sentence->  long work hour
Polarity= sli_neg
  neg: 0.007710
  sli_pos: 0.116827
  sli_neg: 0.528854
  neutral: 0.302411
  pos: 0.044198
final polarity=-0.26442699595
 
Tokens
[u'long', u'work', u'hour']
List Of Aspects 
['Aspect4']
 
aspect_polarity
Counter({'Aspect4': -0.2644269959503445})
 
aspect_count
Counter({'Aspect4': 1})
---------------------------
sentence->  get littl time famili process orient
Polarity= neutral
  neg: 0.002052
  sli_pos: 0.211779
  sli_neg: 0.305912
  neutral: 0.480105
  pos: 0.000151
final polarity=0
 
---------------------------
sentence->  end result orient competit
Polarity= neutral
  neg: 0.001754
  sli_pos: 0.421355
  sli_neg: 0.098114
  neutral: 0.421403
  pos: 0.057374
final polarity=0
 
---------------------------
sentence->  want result cost
Polarity= sli_neg
  neg: 0.069936
  sli_pos: 0.226345
  sli_neg: 0.381004
  neutral: 0.236356
  pos: 0.086359
final polarity=-0.19050206084
 
Tokens
[u'want', u'result', u'cost']
List Of Aspects 
['Aspect7']
 

In [195]:
# displaying the Result
print 'Samsung'
total_polarity=0
for aspect_name in files:
    if(aspect_count[aspect_name]!=0):
        polarity=aspect_polarity[aspect_name]/aspect_count[aspect_name]
        total_polarity=total_polarity+polarity
    else:
        polarity=0
    print aspects[aspect_name][0]+':  '+str(polarity)
polarity=aspect_polarity['Aspect7']/aspect_count['Aspect7']
print 'General:  '+str(polarity)

total_polarity=total_polarity+polarity    
print ' '    
total_polarity=total_polarity/7
print 'Overall Polarity  '+str(total_polarity)    

Samsung
learning:  0.45197770307
life:  0.168283243961
infrastructure:  0.519018397696
management:  0.43923547081
workplace:  0.513536492892
salary:  0.375298010899
General:  0.283522230794
 
Overall Polarity  0.392981650018
