In [73]:
import nltk
from nltk.stem import PorterStemmer
from textblob import TextBlob
from nltk.corpus import wordnet as w,stopwords 
import json
import numpy as np
import os
import re
import gensim
from collections import Counter
from nltk.corpus import stopwords
import enchant
import pickle
from nltk.tokenize import RegexpTokenizer

In [3]:
#loading word2vec model
model=gensim.models.Word2Vec.load('../Word2Vec_model/model')

In [7]:
#initialising dictionary, tokenizer, stemmer
dictionary = enchant.Dict("en_US")
tokenizer=RegexpTokenizer('[a-zA-Z]+')
ps=PorterStemmer()
all_stopwords=stopwords.words('english')+[]

In [36]:
def preprocess(data):
    dataset=[]
    for sentence in data:
        #print 'sentence'
        #print sentence
        temp=tokenizer.tokenize(sentence)
        #print 'tokenizer'
        #print temp
        remove_stopword=[word for word in temp if word not in all_stopwords]    # removing stopwords
        #print 'stopwords'
        #print remove_stopword
        #only_english=[ps.stem(word) for word in remove_stopword if dictionary.check(word)== True]  # remove non-english words
        only_english=[ps.stem(word) for word in remove_stopword if dictionary.check(word)== True]  # remove non-english words
        #print 'removed all non-english words'
        #print only_english
        if(len(only_english)>1):                  #appends only if length of list >1
            dataset.append(' '.join(only_english))
    return dataset    

In [89]:
#loading and cleaning dataset
def preprocessing(company_name):
    dataset=[]
    sentences=[]
    with open('TestReview/'+company_name+'/review.txt') as f:
        reviews=json.load(f)
    reviews=np.asarray(reviews)
    reviews=np.concatenate(reviews)        #converting it to single list from list of lists
    #print 'Reviews'
    #print reviews[:10]
    for review in reviews:
        review=review.lower()             #converting reviews into lowercase
        review=review.replace('. ',';')                            
        sentences.extend(re.split(' and | but |; | ; |.and ',review))       #splitting criteria
    #print 'sentences'
    #print sentences[:10]
    dataset=preprocess(sentences)
    with open('Dataset/'+company_name+'.txt','w') as f:
        json.dump(dataset,f)
    return dataset

In [66]:
#only for printing
dataset=preprocessing('Samsung')

Reviews
[ u' Long working hours and will get very little time for family. Not very process oriented but only end result oriented. They are very competitive and want results at any cost '
 u" Mostly it's not so hard but it's too. Boring..And if any buyer than it will be a more  boring ..And even we need to spend our own money ...While just like every job it so. Boring at the beginning..And it become habit...And most important the processing is too slow....... "
 u' Good company but turning a little biased towards employees. More focus on hiring IIT grads and also creating huge salary gap between employees. HR policies have improved other than salary normalization. '
 u' i like electronics work technician then communication skills project work like an embedded and pcb designing networking laptop technician i am fresher searching for job '
 u' the work there was a lot of fun and and environment really fast paced . we learned a lot of sales tactics in the environment. the target were easy 

In [67]:
#final dataset 
print dataset[:10]

[u'long work hour', u'get littl time famili process orient', u'end result orient competit', u'want result cost', u'mostli hard', u'buyer bore', u'even need spend money like everi job bore begin', u'becom habit', u'import process slow', u'good compani']


In [77]:
#loading vocab
with open('../NaiveBayes/vocab_unigram') as f:
    vocab=json.load(f)
def extract_features(document):       #features are bag of words. document is a list of words of a sentence 
    features = {}
    for word in vocab:
        features['contains(%s)' % word] = (word in document)
    return features     

In [None]:
def analyse_aspect(tokens):            # analyses to which aspect a set of tokens of a sentence belong to
    aspect_flags={}
    for aspect_name in files:
        aspect_flags[aspect_name]=0
    for token in tokens:
        temp_token=ps.stem(token)
        for aspect_name in files:
            for aspect in aspects[aspect_name]:
                temp_aspect=ps.stem(aspect)
                if(temp_token == temp_aspect):
                    aspect_flags[aspect_name]=aspect_flags[aspect_name]+1
    return aspect_flags

In [93]:
#naiveBayes
def naiveBayes_unigram(company_name):
    #loading naivebayes classifier
    with open('../NaiveBayes/naive_bayes_unigram_model') as f:
        classifier=pickle.load(f)
    dataset=preprocessing(company_name)
    for sentence in dataset:
        print 'sentence->  '+sentence
        print 'Polarity= '+classifier.classify(extract_features(sentence.split()))
        dist = classifier.prob_classify(extract_features(sentence.split()))
        polarity=0
        flag=0
        for label in dist.samples():
            print("%s: %f" % (label, dist.prob(label)))
            if(polarity<dist.prob(label)):
                polarity=dist.prob(label)
                flag=label
        if(flag=='neutral'):
            polarity=0
        elif(flag=='pos'):
            polarity=+0.5+(polarity/2)
        elif(flag=='neg'):
            polarity=-0.5-(polarity/2)    
        elif(flag=='sli_pos'):
            polarity=polarity/2
        elif(flag=='sli_neg'):
            polarity=-(polarity/2)   
        print 'final polarity='+ str(polarity)
        print ' '
        
        

In [94]:
#only for printing
naiveBayes_unigram('Samsung')

sentence->  long work hour
Polarity= sli_neg
neg: 0.007710
sli_pos: 0.116827
sli_neg: 0.528854
neutral: 0.302411
pos: 0.044198
final polarity=-0.26442699595
 
sentence->  get littl time famili process orient
Polarity= neutral
neg: 0.002052
sli_pos: 0.211779
sli_neg: 0.305912
neutral: 0.480105
pos: 0.000151
final polarity=0
 
sentence->  end result orient competit
Polarity= neutral
neg: 0.001754
sli_pos: 0.421355
sli_neg: 0.098114
neutral: 0.421403
pos: 0.057374
final polarity=0
 
sentence->  want result cost
Polarity= sli_neg
neg: 0.069936
sli_pos: 0.226345
sli_neg: 0.381004
neutral: 0.236356
pos: 0.086359
final polarity=-0.19050206084
 
sentence->  mostli hard
Polarity= sli_neg
neg: 0.000470
sli_pos: 0.135271
sli_neg: 0.842939
neutral: 0.017992
pos: 0.003328
final polarity=-0.421469288537
 
sentence->  buyer bore
Polarity= neg
neg: 0.821084
sli_pos: 0.012063
sli_neg: 0.145867
neutral: 0.020185
pos: 0.000801
final polarity=-0.910541769869
 
sentence->  even need spend money like everi 