### Cleaning the dataset.
    * Remove NA values
    * Compute the overall sentiment of each review and add a new column that labels each record as positive, negative, or neutral.

In [1]:
import nltk
import json
from nltk.tokenize import sent_tokenize, word_tokenize


In [11]:
# Load a word lexicon (one entry per line) from a file into a set and return the set.
def loadLexicon(fname):
    """Read the file *fname* and return its lines as a set of strings.

    Every line is stripped of surrounding whitespace (including the
    line-break character) before being added, so repeated lines collapse
    into a single entry. Blank lines therefore contribute the empty string.

    Args:
        fname: path of the lexicon file to read.

    Returns:
        A set containing each stripped line of the file.
    """
    # The context manager guarantees the handle is closed even if reading fails.
    with open(fname) as lex_conn:
        return {line.strip() for line in lex_conn}


def lexicalAnalysis(sentences, posLex, negLex):
    """Attribute sentence-level sentiment counts to the nouns of each sentence.

    Each sentence is tokenized and POS-tagged with NLTK. Words tagged as
    adjectives ('JJ*') or verbs ('VB*') are looked up (lower-cased) in
    posLex and negLex and counted per sentence. Those per-sentence counts
    are then added to every noun ('NN*' tag, lower-cased, at least three
    characters long) that occurs in the same sentence.

    Args:
        sentences: iterable of sentence strings.
        posLex: container of positive words (membership-tested).
        negLex: container of negative words (membership-tested).

    Returns:
        A dict mapping noun -> {'positive': int, 'negative': int}, summed
        over all sentences in which the noun appears.
    """
    review_totals = {}
    for sentence in sentences:
        sentence_nouns = set()  # distinct nouns seen in this sentence
        pos_hits = 0  # positive adjective/verb hits in this sentence
        neg_hits = 0  # negative adjective/verb hits in this sentence

        for token, tag in nltk.pos_tag(word_tokenize(sentence)):
            lowered = token.lower()

            if tag.startswith('NN'):
                # Very short nouns (fewer than 3 characters) are ignored.
                if len(lowered) >= 3:
                    sentence_nouns.add(lowered)
                continue  # a noun tag can never match the JJ/VB test below

            if tag.startswith(('JJ', 'VB')):
                # A word present in both lexicons counts on both sides.
                if lowered in posLex:
                    pos_hits += 1
                if lowered in negLex:
                    neg_hits += 1

        # Credit the sentence's counts to each noun it contains.
        for noun in sentence_nouns:
            previous = review_totals.get(noun, {})
            review_totals[noun] = {
                'positive': pos_hits + previous.get('positive', 0),
                'negative': neg_hits + previous.get('negative', 0),
            }

    return review_totals


# Lexicons already read from disk, keyed by file name, so that repeated
# parse() calls do not re-read the same word lists.
_LEXICON_CACHE = {}


def _cached_lexicon(fname):
    """Return the lexicon stored in *fname*, reading the file only on first use."""
    if fname not in _LEXICON_CACHE:
        _LEXICON_CACHE[fname] = loadLexicon(fname)
    return _LEXICON_CACHE[fname]


def parse(text):
    """Return the per-noun sentiment counts for a review text.

    The text is split into sentences with NLTK's sent_tokenize and handed
    to lexicalAnalysis together with the positive and negative lexicons
    read from 'positive-words.txt' and 'negative-words.txt'.

    Args:
        text: the review text to analyse.

    Returns:
        Whatever lexicalAnalysis returns for the sentences of *text*.
    """
    # The lexicon files are read once and reused on later calls.
    posLex = _cached_lexicon('positive-words.txt')
    negLex = _cached_lexicon('negative-words.txt')
    sentences = sent_tokenize(text)  # split the review into sentences
    return lexicalAnalysis(sentences, posLex, negLex)


def conclude_sentiment_per_noun(sentiment_map):
    """Reduce per-noun sentiment counts to a single overall label.

    Each noun votes 'positive' when its positive count exceeds its negative
    count and 'negative' when the negative count is larger. A noun whose
    counts are equal (including a noun with no sentiment hits at all)
    carries no evidence either way and does not vote; previously such nouns
    were counted as negative, which biased the result.

    Args:
        sentiment_map: dict mapping noun -> {'positive': int, 'negative': int}.

    Returns:
        'positive', 'negative', or 'neutral' (equal votes, or no votes).
    """
    positive = 0
    negative = 0
    for counts in sentiment_map.values():
        balance = counts['positive'] - counts['negative']
        if balance > 0:
            positive += 1
        elif balance < 0:
            negative += 1
        # balance == 0: the noun is neutral and casts no vote

    if positive > negative:
        return 'positive'
    if positive < negative:
        return 'negative'
    return 'neutral'


def clean_interviews():
    """Filter the interview records, label their sentiment, and save them.

    Reads the list of records from 'tcs_interviews.json', drops every record
    whose 'level_tag', 'exp_tag' or 'offer_tag' equals 'NA', stores the label
    computed from the record's 'text' under the 'sentiment' key, and writes
    the surviving records to 'tcs_interviews_clean.json'.
    """
    # Context managers close both files even if parsing or analysis fails.
    with open('tcs_interviews.json') as interviews_file:
        interviews = json.load(interviews_file)

    cleaned_interviews = []
    for item in interviews:
        # Skip records with any missing ('NA') tag.
        if (item['level_tag'] == 'NA') or (item['exp_tag'] == 'NA') or (item['offer_tag'] == 'NA'):
            continue
        sentiment_map = parse(item['text'])
        item['sentiment'] = conclude_sentiment_per_noun(sentiment_map)
        cleaned_interviews.append(item)

    with open("tcs_interviews_clean.json", "w") as clean_file:
        json.dump(cleaned_interviews, clean_file, indent=6)


# def checkDiff():
#     interviews_file = open('tcs_interviews_clean.json')
#     interviews = json.load(interviews_file)
#     count = 0
#     for interview in interviews:
#         if interview['overall_sentiment'] != interview['per_noun_sentiment']:
#             count += 1

#     print(count) # No. of mismatch sentiments: 2354


# checkDiff()



In [12]:
# Run the full cleaning pass, then report where the result was written.
clean_interviews()
print("Cleaned and saved the data to tcs_interviews_clean.json")

Cleaned and saved the data to tcs_interviews_clean.json
