In [1]:
import nltk
import tensorflow as tf
import matplotlib
import json
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from random import randrange
from nltk.corpus import stopwords
import string
import spacy
import pandas as pd
import os
import sys

In [2]:
# Tokenise all the words with the help of a tokeniser (text preparation for the model).
from tensorflow.keras.preprocessing.text import Tokenizer
# Module-level tokenizer created with default settings; no `num_words` argument
# (the Keras option that caps the vocabulary size) is passed here.
# NOTE(review): no fit_on_texts(...) call is visible in this part of the notebook.
# The tokenizer presumably needs the training vocabulary before it is used to
# encode text for the model -- confirm where it is fitted or restored.
tokenizer = Tokenizer()

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense, Dropout, SpatialDropout1D #find out how come they using different types of drop outs
from tensorflow.keras.layers import Embedding
from tensorflow import keras
from tensorflow.keras import layers

from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# function to use trained model to get sentiment
def predict_sentiment(text, model=None):
    """Print the sentiment label the trained model predicts for ``text``.

    The model output is rounded to an integer: 1 is reported as good
    sentiment, anything else as bad sentiment.

    Parameters
    ----------
    text : str
        Raw review text to classify.
    model : optional
        An already-loaded Keras model. When omitted, the model saved under
        ``'model'`` is loaded from disk, which is what every call used to do.
        Pass a preloaded model when classifying many texts so the (slow)
        load happens only once.

    Returns
    -------
    None
        The label is printed, not returned.
    """
    if model is None:
        model = keras.models.load_model('model')

    # NOTE(review): relies on the module-level `tokenizer`; it must hold the
    # same vocabulary the model was trained with -- confirm it has been fitted.
    sequence = tokenizer.texts_to_sequences([text])
    # Pad/truncate to the fixed input length of 200 tokens used here.
    sequence = pad_sequences(sequence, maxlen=200)

    prediction = int(model.predict(sequence).round().item())
    if prediction == 1:
        print("Predicted label: Good sentiment")
    else:
        print("Predicted label: Bad sentiment")

# Elasticsearch

Search for reviews based on user input

In [4]:
# Load every review from the file into `data` (a list of dicts).
# The file holds JSON objects placed one after another rather than a single
# JSON array, so the objects are decoded one at a time with raw_decode instead
# of patching the text into an array with string replacement (which breaks on
# blank lines or trailing spaces between records).
# The encoding is given explicitly so decoding does not depend on the platform
# default (JSON text is UTF-8).
with open('reviewSelected100.json', encoding='utf-8') as f:
    raw_text = f.read()

data = []
_decoder = json.JSONDecoder()
_pos = 0
_end = len(raw_text)
while True:
    # raw_decode does not skip leading whitespace, so do it here.
    while _pos < _end and raw_text[_pos].isspace():
        _pos += 1
    if _pos >= _end:
        break
    _record, _pos = _decoder.raw_decode(raw_text, _pos)
    data.append(_record)

In [5]:
# Gather the distinct business ids found across all loaded reviews and report
# how many there are.
unique_businesses = {entry['business_id'] for entry in data}
print(f"Number of businesses: {len(unique_businesses)}")

Number of businesses: 153


In [6]:
from elasticsearch import Elasticsearch
from elasticsearch import helpers

In [7]:
# Connect to the local Elasticsearch node.
# The client takes its node list through `hosts`; `HOST=` / `PORT=` are not
# client parameters (7.x silently ignores them and falls back to the default
# node, 8.x rejects them), so the address is spelled out as a URL instead.
es = Elasticsearch(hosts=["http://localhost:9200"])

In [8]:
# Index definition: settings (sharding, sort order, text analysis) and mappings.
body = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        # Keep documents sorted by review date, oldest first.
        "index": {
            "sort.field": "date",
            "sort.order": "asc",
        },
        # Custom analyzer: standard tokenizer, then lowercase, English
        # stop-word removal and Porter stemming.
        "analysis": {
            "analyzer": {
                "review_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase", "english_stop", "porter_stem"],
                },
            },
            "filter": {
                "english_stop": {
                    "type": "stop",
                    "stopwords": "_english_",
                },
            },
        },
    },
    "mappings": {
        "properties": {
            # Review text is analysed the same way at index and search time.
            "text": {
                "type": "text",
                "analyzer": "review_analyzer",
                "search_analyzer": "review_analyzer",
            },
            "date": {
                "type": "date",
                "format": "yyyy-MM-dd HH:mm:ss",
            },
        },
    },
}

In [9]:
def review_generator(data, index=None):
    """Yield one bulk-index action per review.

    Parameters
    ----------
    data : iterable of dict
        Reviews to index; each must contain a ``'review_id'`` key, which is
        used (as a string) for the document id.
    index : str, optional
        Target index. When omitted, the module-level ``index_name`` is used,
        which must be defined before the generator is consumed.

    Yields
    ------
    dict
        Action with ``_index``, ``_id`` and ``_source`` (the review itself).

    Notes
    -----
    No ``_type`` is emitted: mapping types are deprecated in Elasticsearch 7
    and removed in 8, and the index mapping used here is typeless.
    """
    target_index = index if index is not None else index_name
    for review in data:
        yield {
            "_index": target_index,
            "_id": str(review['review_id']),
            "_source": review,
        }

In [10]:
index_name = "review-index"
if es.indices.exists(index=index_name):
    print("Index already exists")
else:
    es.indices.create(index=index_name, body=body)
    try:
        helpers.bulk(es, review_generator(data))
    except Exception:
        # Do not leave an empty or partially filled index behind: otherwise
        # every re-run would report "Index already exists" and skip loading.
        es.indices.delete(index=index_name)
        raise
    # Make the freshly indexed reviews visible to searches right away instead
    # of waiting for the next periodic refresh.
    es.indices.refresh(index=index_name)
    print("Index created")

Index already exists


In [11]:
def retrieveUniqueReviewsByInput(num_of_reviews, userInput, es, index_name, verbose=False):
    """Search the index for ``userInput`` and sample reviews from distinct businesses.

    All matching reviews are fetched with the scroll API (100 per page), then
    up to ``num_of_reviews`` of them are picked at random such that no two
    picked reviews share a ``business_id``.

    Parameters
    ----------
    num_of_reviews : int
        Maximum number of reviews to return.
    userInput : str
        Query text, passed to Elasticsearch as a ``query_string`` query.
    es : Elasticsearch client
        Must provide ``search``, ``scroll`` and ``clear_scroll``.
    index_name : str
        Index to search.
    verbose : bool, optional
        When True, print every hit and the scroll bookkeeping. Off by default
        because a broad query produces thousands of lines of output.

    Returns
    -------
    list of dict
        The ``_source`` of each selected review. Shorter than
        ``num_of_reviews`` when fewer distinct businesses match the query
        (empty when nothing matches).
    """
    # Local import: only `randrange` is imported from `random` at file level.
    import random

    query = {
        "size": 100,
        "query": {
            "query_string": {
                # Use the caller's text (this was hard-coded to "customer").
                "query": userInput
            }
        }
    }
    # Keep-alive per page; generous enough that slow processing of one page
    # does not expire the scroll context.
    scroll_keep_alive = '1m'

    reviews = []
    res = es.search(index=index_name, body=query, scroll=scroll_keep_alive)
    scroll_id = res.get('_scroll_id')
    try:
        while res['hits']['hits']:
            if verbose:
                print("\nResponse for index:", index_name)
                print("Scroll ID:", scroll_id)
                print("Total Hits:", res['hits']['total']['value'])
            for doc in res['hits']['hits']:
                reviews.append(doc['_source'])
                if verbose:
                    print("\n", doc['_id'], doc['_source']['business_id'],
                          doc['_source']['text'], doc['_source']['date'], doc['_score'])
                    print("DOC COUNT:", len(reviews))
            res = es.scroll(scroll_id=scroll_id, scroll=scroll_keep_alive)
            # The server may hand back a new scroll id; always use the latest.
            scroll_id = res.get('_scroll_id', scroll_id)
    finally:
        if scroll_id is not None:
            try:
                # Release the server-side scroll context instead of waiting
                # for it to time out.
                es.clear_scroll(scroll_id=scroll_id)
            except Exception:
                # Best-effort cleanup only (the context may already have
                # expired); it must not hide the results or the real error.
                pass

    print("\nTOTAL DOC COUNT:", len(reviews))

    # Visit the reviews in random order and keep the first one seen for each
    # business. This yields the same distribution as repeatedly drawing random
    # reviews and rejecting repeats, but always terminates -- including when
    # there are no hits or fewer distinct businesses than requested.
    candidates = list(reviews)
    random.shuffle(candidates)

    unique_reviews = []
    business_set = set()
    for review in candidates:
        if len(unique_reviews) >= num_of_reviews:
            break
        business_id = review['business_id']
        if business_id in business_set:
            continue
        business_set.add(business_id)
        unique_reviews.append(review)

    if len(unique_reviews) < num_of_reviews:
        print("Only", len(unique_reviews), "distinct businesses matched; requested", num_of_reviews)
    return unique_reviews

In [None]:
# Fetch 50 reviews, each from a different business, for the search term "customer".
reviews_input = retrieveUniqueReviewsByInput(
    num_of_reviews=50,
    userInput="customer",
    es=es,
    index_name=index_name,
)