# In [None]:
import csv
import math
import random
import nltk
from nltk.corpus import wordnet
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from nltk import RegexpTokenizer, re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from num2words import num2words
from textblob import TextBlob

# Absolute path of the review CSV that openCSV() reads.
csvFilePath = '/Users/subbu/Documents/GitHub/TMP/sources/Data_smaller.csv'
# Class labels attached to sentences with negative / non-negative polarity.
negativeLabel = -1
positiveLabel = 1
# Zero-based indexes into a parsed CSV row.
columnOfReviewRating = 13
columnOfReviewText = 14
columnOfReviewTitle = 15
# Number of reviews accepted by readFile(), counted before de-duplication.
reviewCounter = 0
# Language code that the result of langdetect's detect() is compared against.
ENGLISH = "en"

# Lower-cased review texts collected by readFile().
listOfReviews = []
# Stopword-free review texts appended by removeStopwordsAndPunctuations().
filteredReviewsList = []

# A token is a run of word characters, optionally followed by one
# apostrophe-like character and letters, so contractions such as "don't" stay
# in one piece. Hyphens and all other punctuation are not matched.
tokenizer = RegexpTokenizer(r"\w+['´`]?[a-zA-Z]*")  # words only; apostrophe suffixes kept, hyphens dropped


def openCSV(path):
    """Read the CSV file at ``path`` and pass its rows to ``readFile``.

    The file is decoded as latin1 and its first line is skipped before the
    remaining lines are parsed as CSV. The file is closed even when reading
    or processing raises.

    :param path: location of the CSV file to read.
    :raises OSError: if the file cannot be opened.
    """
    with open(path, 'r', encoding='latin1') as csvFile:
        # Skip the first line; the default keeps an empty file from raising
        # StopIteration.
        next(csvFile, None)
        readFile(csv.reader(csvFile))


def readFile(reader):
    """Collect the lower-cased text of every English review yielded by ``reader``.

    Accepted texts are added to the global ``listOfReviews`` and counted in the
    global ``reviewCounter``. Afterwards duplicates are removed from
    ``listOfReviews`` while keeping first-seen order; ``reviewCounter`` still
    includes the duplicates.

    :param reader: iterable of rows, each a sequence of column strings
        (for example a ``csv.reader``).
    """
    global reviewCounter, listOfReviews

    for line in reader:
        # csv.reader yields [] for blank lines and short lists for truncated
        # rows; indexing those would raise IndexError.
        if len(line) <= columnOfReviewText:
            continue
        # rating = line[columnOfReviewRating]  # may be useful to verify what our code found out
        reviewText = line[columnOfReviewText]
        if englishLanguage(reviewText):
            listOfReviews.append(reviewText.lower())
            reviewCounter += 1
    # dict keys are unique and keep insertion order
    listOfReviews = list(dict.fromkeys(listOfReviews))


def englishLanguage(reviewText):
    """Tell whether langdetect classifies ``reviewText`` as English.

    :param reviewText: text to classify.
    :return: True if the detected language code equals ``ENGLISH``; False
        otherwise, including when detection raises ``LangDetectException``.
    """
    try:
        return detect(reviewText) == ENGLISH
    except LangDetectException:
        # langdetect could not decide on a language for this text
        return False


# Words that are taken out of the stopword set so they survive filtering
# (mostly negations, plus 'too' and 'very').
importantStopwords = {'wouldn', 'don', "isn't", 'nor', "aren't", "couldn't", 'needn', "shouldn't", 'aren', "shan't",
                      "hadn't", 'haven', 'too', 'couldn', 'didn', "needn't", 'wasn', "mustn't", "doesn't", 'mightn',
                      "wasn't", "weren't", "haven't", 'mustn', "don't", "should've", 'weren', "didn't", 'shouldn',
                      "won't", 'not', 'no', 'hasn', 'ain', "hasn't", "mightn't", "wouldn't", 'doesn', 'hadn',
                      'very'}  # -'but'
# NLTK's English stopwords minus the words listed above.
stopWords = set(stopwords.words('english')).difference(importantStopwords)


def _filterTokens(text):
    """Tokenize ``text`` and return its lower-cased tokens that are not stopwords."""
    kept = []
    for token in tokenizer.tokenize(text):
        lowered = token.lower()
        # Test the lower-cased form: the tokens are emitted lower-cased, so a
        # capitalised stopword such as "The" must be dropped as well.
        if lowered not in stopWords:
            kept.append(lowered)
    return kept


def removeStopwordsAndPunctuations(reviews, firstInput):
    """Strip stopwords and punctuation from one text or from a list of texts.

    Text is split with the module-level ``tokenizer``; tokens found in
    ``stopWords`` are dropped and the rest are lower-cased and joined with
    single spaces.

    :param reviews: a single string when ``firstInput`` is false, otherwise a
        sequence of strings.
    :param firstInput: when true, every entry of ``reviews`` is filtered and
        appended to the global ``filteredReviewsList``.
    :return: ``{'review': text}`` holding the filtered text of ``reviews`` or,
        when ``firstInput`` is true, of its last entry ('' for an empty
        sequence).
    """
    filteredReview = []

    if not firstInput:
        filteredReview = _filterTokens(reviews)
    else:
        for review in reviews:
            filteredReview = _filterTokens(review)
            filteredReviewsList.append(" ".join(filteredReview))
    return {'review': " ".join(filteredReview)}


def nltkTags(nltk_tag):
    """Translate a POS tag into the corresponding WordNet POS constant.

    :param nltk_tag: tag string such as 'JJ', 'VBD', 'NN' or 'RB'.
    :return: ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or
        ``wordnet.ADV`` depending on the tag's first letter, or None when the
        tag starts with none of J, V, N, R.
    """
    prefixToWordnetPos = (
        ('J', wordnet.ADJ),   # JJ
        ('V', wordnet.VERB),  # VBD
        ('N', wordnet.NOUN),  # NN
        ('R', wordnet.ADV),   # RB
    )
    for prefix, wordnetPos in prefixToWordnetPos:
        if nltk_tag.startswith(prefix):
            return wordnetPos
    return None


def lemmatizeSentence(sentence):
    """Lemmatize every word of ``sentence`` using its part-of-speech tag.

    The sentence is tokenized and tagged with NLTK. Words whose tag maps to a
    WordNet POS (see ``nltkTags``) are lemmatized with that POS; all other
    words are kept unchanged.

    :param sentence: text to lemmatize.
    :return: the resulting words joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, posTag in nltk.pos_tag(nltk.word_tokenize(sentence)):
        wordnetPos = nltkTags(posTag)
        if wordnetPos is None:
            lemmas.append(token)
        else:
            lemmas.append(lemmatizer.lemmatize(token, wordnetPos))
    return " ".join(lemmas)


def convertNumtoWords(inputString, myList, index):
    """Spell out the stand-alone numbers in ``inputString`` and store the result.

    ``inputString`` is split at single spaces. Every piece that consists solely
    of decimal digits is replaced by its ``num2words`` spelling. If at least
    one piece was replaced, the re-joined text is written to ``myList[index]``;
    otherwise ``myList`` is left untouched.

    :param inputString: text to convert.
    :param myList: list that receives the converted text.
    :param index: position in ``myList`` to overwrite.
    """
    words = inputString.split(' ')
    converted = False
    for position, word in enumerate(words):
        # isdecimal() rather than isdigit(): isdigit() is also true for
        # characters such as '²', which int() rejects with ValueError.
        if word.isdecimal():
            words[position] = str(num2words(int(word)))
            converted = True
    if converted:
        myList[index] = " ".join(words)


def hasNumbers(inputString):
    """Return True if at least one character of ``inputString`` is a digit."""
    for character in inputString:
        if character.isdigit():
            return True
    return False


def normalizeText():
    """Normalize the collected reviews and store the lemmatized result.

    Steps:
    1. spell out stand-alone numbers in ``listOfReviews`` (in place),
    2. remove stopwords/punctuation, which fills ``filteredReviewsList``,
    3. lemmatize every '.'-separated part of each filtered review,
    4. rebind the global ``filteredReviewsList`` to a one-element list that
       contains the list of lemmatized parts.
    """
    global filteredReviewsList
    for position, review in enumerate(listOfReviews):
        if hasNumbers(review):
            convertNumtoWords(review, listOfReviews, position)
    removeStopwordsAndPunctuations(listOfReviews, True)

    # NOTE(review): the filtered texts are built from tokenizer output, which
    # presumably contains no '.', so this split may never separate anything.
    lemmatizedParts = [
        lemmatizeSentence(part)
        for filteredReview in filteredReviewsList
        for part in filteredReview.split(".")
    ]

    filteredReviewsList.clear()
    # NOTE(review): the result is wrapped in an outer list (a list holding one
    # list of strings); confirm that consumers expect this nesting.
    filteredReviewsList = [lemmatizedParts]
    # print(filteredReviewsList)


def printList(reviews):
    """Print ``reviews`` as a 1-based numbered list framed by separator lines.

    :param reviews: sequence of strings to print.
    """
    print("---------------------------------\n\nReviews:\n")
    for number, review in enumerate(reviews, start=1):
        print(str(number) + ": " + review)
    print("\n---------------------------------")


def startTextMining():
    """Load the reviews from ``csvFilePath`` and normalize them."""
    openCSV(csvFilePath)
    normalizeText()

# Runs the pipeline as soon as this code is executed (there is no __main__ guard).
startTextMining()

# In Training
# Category name -> keywords whose presence in a sentence marks that category.
# NOTE(review): the key "accomodation" is misspelled ("accommodation"); it is
# left as is because it is used as a feature name at runtime.
features = {"service": ["parking", "pool", "garden", "wifi", "aircon", "spa", "gym", "restaurant", "party", "game"],
            "cleanliness": ["room", "lobby", "sheets", "floors", "tables", "dishes", "restaurant"],
            "location": ["airport", "bus", "taxi", "centre", "beach", "sea", "nature", "shop", "busy", "crowded"],
            "accomodation": ["bed", "couch", "chair", "size", "comfort", "toilets", "showers", "noisy", "view"],
            "food": ["vegan", "vegetarian", "variety", "health", "fresh", "hygiene", "buffet", "breakfast", "portion"],
            "staff": ["friendly", "reliable", "polite", "helpful", "speed", "available", "reception"]
            }

# Per-category keyword lists, copied from ``features`` so the two views cannot
# drift apart (the hand-written staff list had "avialable" instead of
# "available", so that keyword never reached the staff rating).
serviceArr = list(features["service"])
accomodationArr = list(features["accomodation"])
staffArr = list(features["staff"])
cleanlinessArr = list(features["cleanliness"])
locationArr = list(features["location"])
foodArr = list(features["food"])

# Position i here corresponds to position i in ratingsOfFeatures.
featureArrays = [serviceArr, accomodationArr, staffArr, cleanlinessArr, locationArr, foodArr]

# (filtered sentence, label) pairs used as classifier input.
labeledList = []
# Every token of every labeled sentence.
allWords = []
# Every category name / keyword found in a sentence, in order of discovery.
ourFeatureWordsAccumulated = []

# Each rating is [sum of polarity values, sum of -1/+1 labels, number of sentences].
serviceRating = [0, 0, 0]
accomodationRating = [0, 0, 0]
staffRating = [0, 0, 0]
cleanlinessRating = [0, 0, 0]
locationRating = [0, 0, 0]
foodRating = [0, 0, 0]

ratingsOfFeatures = [serviceRating, accomodationRating, staffRating, cleanlinessRating, locationRating, foodRating]

# Label every review sentence that mentions a category name or keyword with a
# TextBlob polarity, and accumulate per-category ratings.
for x in range(len(listOfReviews)):
    # Split the review into parts at runs of '?', '!' or '.'.
    splitReview = re.split(r'[?!.]+', listOfReviews[x])
    sumOfPolarities = 0  # NOTE(review): assigned but never read in this loop

    for counter in range(len(splitReview)):
        found = False
        currentPartOfReview = ""

        for feature in features:
            currentPartOfReview = splitReview[counter]

            # Substring test: the category name occurs anywhere in the part.
            if feature in currentPartOfReview:
                found = True
                ourFeatureWordsAccumulated.append(feature)

            for keyword in features.get(feature):

                r = splitReview[counter]  # NOTE(review): assigned but never read in this loop
                # Substring test as well, so "bus" also matches inside "busy".
                if keyword in currentPartOfReview:
                    found = True
                    ourFeatureWordsAccumulated.append(keyword)
        if found:
            # sentiment[0] is the polarity value of TextBlob's sentiment tuple.
            sentiment = TextBlob(currentPartOfReview).sentiment
            filteredSentence = removeStopwordsAndPunctuations(currentPartOfReview, False)
            tokenizedFilteredSentence = tokenizer.tokenize(filteredSentence['review'])

            # Only the most recently matched word decides which ratings change.
            # NOTE(review): if that word is a category name (e.g. "staff") it is
            # in none of the featureArrays, so no rating is updated - confirm
            # whether that is intended.
            lastAdded = ourFeatureWordsAccumulated[len(ourFeatureWordsAccumulated) - 1]
            n = 0
            for array in featureArrays:
                if lastAdded in array:
                    # Replaces the slot with a new [polarity sum, label sum, count]
                    # list; the list object previously stored there is not mutated.
                    ratingsOfFeatures[n] = [ratingsOfFeatures[n][0] + sentiment[0], ratingsOfFeatures[n][1] + (
                        negativeLabel if sentiment[0] < 0 else positiveLabel), ratingsOfFeatures[n][2] + 1]
                n = n + 1
            # A polarity of exactly 0 is labeled positive.
            labeledList.append((filteredSentence, negativeLabel if sentiment[0] < 0 else positiveLabel))

            allWords.extend(tokenizedFilteredSentence)

# Headline printed before each category's value, in ratingsOfFeatures order.
_ratingHeadlines = (
    "\nRating of Service",
    "\nRating of Accommodation",
    "\nRating of Staff",
    "\nRating of Cleanliness",
    "\nRating of Location",
    "\nRating of Food:",
)


def _printAverageRatings(column):
    """Print, per category, ``rating[column]`` divided by its sentence count.

    A count of 0 is replaced by 1 so categories without sentences print 0.
    """
    for position, headline in enumerate(_ratingHeadlines):
        rating = ratingsOfFeatures[position]
        print(headline)
        print(rating[column] / (rating[2] if rating[2] > 0 else 1))


print("\n\nAccording to TextBlob our feature-categories have the following rating (accumulated -1 or +1):\n")
_printAverageRatings(1)

print("\n\n(accumulated floats between -1.0 and 1.0)")
_printAverageRatings(0)

# Shuffle in place so the later slice-based train/test split is random.
random.shuffle(labeledList)
# Frequency of the matched category names / keywords.
freqDist2 = nltk.FreqDist(ourFeatureWordsAccumulated)
# Frequency of all sentence tokens, and its 50 most common (word, count) pairs.
freqDist = nltk.FreqDist(allWords)
wordFeatures = freqDist.most_common(50)


def documentFeatures(sentence, test):
    """Build the boolean feature dict of ``sentence`` for the classifier.

    For every category in ``features`` the dict records whether the category
    name occurs in ``sentence``, followed by one entry per keyword of that
    category. The dict is printed before it is returned.

    :param sentence: text to inspect (containment is tested with ``in``).
    :param test: when true, keywords are looked up in an empty set instead of
        ``sentence``, so every keyword entry is False.
    :return: dict mapping category names and keywords to booleans.
    """
    # Never filled in this function; it only backs the ``test`` mode.
    emptyReview = set()
    keywordSource = emptyReview if test else sentence

    extracted = {}
    for aspect, keywords in features.items():
        extracted[aspect] = aspect in sentence
        for keyword in keywords:
            extracted[keyword] = keyword in keywordSource
    print(extracted)
    return extracted


# One (feature dict, polarity label) pair per labeled sentence.
featureSets = [
    (documentFeatures(singleReview['review'], False), polarity)
    for (singleReview, polarity) in labeledList
]

# The first 80 % (rounded up) train the classifier; the rest is held out.
# NOTE(review): test_set is not evaluated anywhere in the visible code.
setSize = math.ceil(0.8 * len(featureSets))
train_set = featureSets[:setSize]
test_set = featureSets[setSize:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

testString = removeStopwordsAndPunctuations(
    "Small but cozy room. Facilities were clean but the food was not good.", False
)['review']

# NOTE(review): these features are computed but never passed to the classifier
# in the visible code.
classifierTest = documentFeatures(testString, False)

classifier.show_most_informative_features(10)
