In [1]:
import nltk
from nltk.stem import PorterStemmer
from textblob import TextBlob
from nltk.corpus import wordnet as w,stopwords 
import json
import numpy as np
import os
import re
import gensim
from collections import Counter

In [2]:
# Load the Word2Vec model that was previously saved under ../Word2Vec_model/.
# NOTE(review): `model` is not referenced by any other cell visible here --
# confirm it is still needed before paying the load cost.
model = gensim.models.Word2Vec.load(
    '../Word2Vec_model/model'
)

In [3]:
# Fetch the aspect vocabularies: every file found directly inside ../Aspects/
# is parsed as JSON and stored in `aspects` under its file name.
#
# NOTE: the loop variable `files` (the list of aspect file names) is left bound
# at module level on purpose -- analyse_aspect, analyse_sentiment, clear_all and
# test_all all read it as a global.
aspects = {}
for root, dirs, files in os.walk('../Aspects/'):
    for name in files:
        # Build the path from `root` so it always matches the directory that
        # os.walk is currently reporting.
        with open(os.path.join(root, name)) as f:
            aspects[name] = json.load(f)
    # Only the top-level directory is wanted.  Letting os.walk descend would
    # rebind `files` to the listing of the last sub-directory visited and, with
    # the old hard-coded prefix, try to open nested files at the wrong path.
    break

In [4]:
# Single shared Porter stemmer instance; analyse_aspect uses it to compare
# review tokens and aspect terms by stem.
ps = PorterStemmer()

In [5]:
# Fetch all reviews: each folder directly under ../Reviews/ is expected to hold
# a review.txt containing a JSON list, and all of those lists are chained
# together in `data`.
data = []
for root, dirs, files_review in os.walk('../Reviews/'):
    for name in dirs:                             # all review folders
        with open(os.path.join(root, name, 'review.txt')) as f:
            # extend() grows the list in place; `data = data + ...` copied the
            # whole list again for every folder.
            data.extend(json.load(f))
    # Review folders live at the top level only.  Descending further would look
    # for ../Reviews/<nested-folder>/review.txt, which does not exist.
    break

# Join the entries of `data` along their first axis into one flat array.
# Passing the list straight to np.concatenate gives the same result as going
# through np.asarray first, but also works when the entries differ in length
# (a ragged list cannot be turned into a regular array by recent NumPy).
# NOTE(review): presumably every entry of `data` is a list of review strings,
# which makes `dataset` a 1-D array of reviews -- confirm against review.txt.
dataset = np.concatenate(data)               # contains all the reviews

In [6]:
def remove_stopwords(tokens, stop=None):
    """Return the tokens that are not stopwords, in their original order.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one sentence.
    stop : container of str, optional
        Words to drop.  When omitted, the NLTK English stopword list is used.

    Returns
    -------
    list of str
        The tokens that are not in the stopword collection.
    """
    if stop is None:
        # The NLTK list is read once and remembered on the function object;
        # this function is called for every sentence, and rebuilding the set
        # each time means re-reading the corpus for every call.
        stop = getattr(remove_stopwords, '_english', None)
        if stop is None:
            stop = frozenset(stopwords.words('english'))
            remove_stopwords._english = stop
    return [token for token in tokens if token not in stop]

In [7]:
def analyse_aspect(tokens):
    """Count, per aspect, how many (token, aspect term) pairs share a stem.

    Reads the module-level globals `files` (aspect names), `aspects`
    (aspect name -> iterable of terms) and `ps` (the stemmer).

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one sentence.

    Returns
    -------
    dict
        Maps every name in `files` to its match count (0 when nothing matched).
        A term that occurs several times in an aspect list (after stemming)
        counts once per occurrence, for every token with that stem.
    """
    token_stems = [ps.stem(token) for token in tokens]
    if not token_stems:
        # Nothing to match: every aspect stays at zero.
        return dict((aspect_name, 0) for aspect_name in files)

    aspect_flags = {}
    for aspect_name in files:
        # Stem the aspect vocabulary once per call instead of once per token,
        # keeping multiplicities so the totals equal a pairwise comparison.
        stem_counts = Counter(ps.stem(term) for term in aspects[aspect_name])
        # A Counter yields 0 for stems it has never seen.
        aspect_flags[aspect_name] = sum(stem_counts[stem] for stem in token_stems)
    return aspect_flags

In [8]:
def _sentiment_bucket(polarity):
    """Return the name of the class file that a polarity score belongs to."""
    if -1 <= polarity <= -0.5:
        return 'neg'
    if -0.5 < polarity <= -0.1:
        return 'sli_neg'
    if 0.1 < polarity <= 0.5:
        return 'sli_pos'
    if 0.5 < polarity <= 1:
        return 'pos'
    # Everything else, i.e. scores in (-0.1, 0.1] or outside [-1, 1].
    return 'neutral'


def _append_sentence(path, sentence):
    """Append `sentence` to the JSON list stored in the file at `path`."""
    with open(path, 'r') as file_ptr:
        temp_data = json.load(file_ptr)
    temp_data.append(sentence)
    # Serialise BEFORE re-opening the file: mode 'w' truncates immediately, so
    # a failure while encoding would otherwise leave an empty or half-written
    # file that no longer parses as JSON.
    payload = json.dumps(temp_data)
    with open(path, 'w') as file_ptr:
        file_ptr.write(payload)


def analyse_sentiment(aspect_flags, sentence):
    """Classify the sentence by polarity and store it under each matching aspect.

    The polarity is the first element of ``TextBlob(sentence).sentiment``.
    It selects one of the class files 'neg', 'sli_neg', 'neutral', 'sli_pos'
    or 'pos'.  The sentence is appended to ``../Dataset/<aspect>/<class>`` for
    every aspect name in the global `files` whose flag is greater than zero.
    When no aspect matched, it goes to the general category
    ``../Dataset/Aspect7/<class>``.

    Parameters
    ----------
    aspect_flags : dict
        Aspect name -> match count, with an entry for every name in `files`.
    sentence : str
        The sentence to classify and store.

    Raises
    ------
    IOError / ValueError
        If a target file is missing or does not contain valid JSON.
    """
    polarity = TextBlob(sentence).sentiment[0]
    file_name = _sentiment_bucket(polarity)

    matched = [aspect_name for aspect_name in files
               if aspect_flags[aspect_name] > 0]
    if not matched:
        matched = ['Aspect7']                 # general category

    for aspect_name in matched:
        _append_sentence('../Dataset/' + aspect_name + '/' + file_name, sentence)

In [9]:
def tokenize_sent(sentence):
    """Tokenize one sentence, find its aspects and store it by sentiment.

    The sentence is word-tokenized, stopwords are removed, the remaining
    tokens are matched against the aspects, and the untouched sentence is
    then handed to analyse_sentiment together with the aspect counts.
    """
    # TODO: stopword removal is marked "to be reconsidered" by the author.
    flags = analyse_aspect(remove_stopwords(nltk.word_tokenize(sentence)))
    analyse_sentiment(flags, sentence)

In [10]:
def tokenize_review(review):
    """Split one review into sentence fragments and process each of them.

    The review is lower-cased and passed through TextBlob's ``correct()``.
    Full stops are then turned into commas, and the text is split on
    ' and ', ' but ' and ', '.  Every non-blank fragment is handed to
    tokenize_sent.

    Parameters
    ----------
    review : str
        Raw text of a single review.
    """
    # TODO: the correction step is marked "to be corrected" by the author.
    corrected = str(TextBlob(review.lower()).correct())
    corrected = corrected.replace('.', ',')
    for sentence in re.split(' and | but |, ', corrected):    # splitting criteria
        # Adjacent separators (e.g. ". ." or a leading ", ") produce empty or
        # whitespace-only fragments; these carry no content and would only be
        # stored as empty "neutral" sentences in the general category.
        if not sentence.strip():
            continue
        tokenize_sent(sentence)

In [11]:
def tokenize_all(dataset):
    """Process every review in `dataset`, printing a running 1-based count.

    Parameters
    ----------
    dataset : iterable of str
        The reviews to process; each one is passed to tokenize_review.
    """
    for position, review in enumerate(dataset, 1):
        # Progress indicator.  The single-argument, parenthesised form prints
        # the same text under Python 2 and Python 3.
        print(position)
        tokenize_review(review)

In [15]:
# Default encoding to utf8.
# Only Python 2 needs (and allows) this: sys.setdefaultencoding is removed at
# start-up and comes back only after reload(sys).  Python 3 already defaults
# to UTF-8 and has no builtin reload().
import sys
if sys.version_info[0] < 3:
    # reload(sys) also resets the standard streams to the interpreter's
    # originals, which detaches them from the notebook (print output would no
    # longer show up in the cells).  Save them and put them back afterwards.
    _saved_streams = (sys.stdin, sys.stdout, sys.stderr)
    reload(sys)
    sys.setdefaultencoding('utf8')
    sys.stdin, sys.stdout, sys.stderr = _saved_streams

In [None]:
# Run the pipeline over one window of the corpus: reviews 50005 up to (but not
# including) 75000.
batch = dataset[50005:75000]
tokenize_all(batch)

In [19]:
#clear all files
def clear_all():
    """Reset every class file of every aspect folder to an empty JSON list.

    Covers each name in the global `files` plus the general category
    'Aspect7'.  For each of them the five files pos, neg, sli_pos, sli_neg
    and neutral under ../Dataset/<name>/ are overwritten with ``[]``.
    """
    folders = list(files) + ['Aspect7']      # copy, so `files` stays untouched
    for folder in folders:
        for bucket in ('pos', 'neg', 'sli_pos', 'sli_neg', 'neutral'):
            with open('../Dataset/' + folder + '/' + bucket, 'w') as handle:
                json.dump([], handle)

In [22]:
#clear_all()

In [58]:
def test_all():                                     #to test whether all files are in json format 
    temp_files=list(files)
    temp_files.append('Aspect7')
    for file_name in temp_files:
        print file_name
        with open('../Dataset/'+file_name+'/pos') as f:
            json.load(f)
        print 'pos  success'    
        with open('../Dataset/'+file_name+'/neg') as f:
            json.load(f)
        print 'neg  success'    
        with open('../Dataset/'+file_name+'/sli_pos') as f:
            json.load(f)
        print 'sli_pos  success'    
        with open('../Dataset/'+file_name+'/sli_neg') as f:
            json.load(f)
        print 'sli_neg  success'    
        with open('../Dataset/'+file_name+'/neutral') as f:
            json.load(f)   
        print 'neutral  success'    

In [57]:
#test_all()
