In [1]:
import pandas as pd 
import json
import numpy as np 
import matplotlib.pyplot as plt 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
swords = set(stopwords.words('english'))
import csv

In [None]:
# Load both raw comment dumps and stack them: data71 rows first, then data70.
df1 = pd.read_json("./data/data70.json")
df = pd.concat([pd.read_json("./data/data71.json"), df1])

# Summary of columns, dtypes and non-null counts for the combined frame.
df.info()

In [2]:
# Each entry is a regex alternation "full name|abbreviation".  The last
# alternative (the abbreviation) is also used to name the output files.
universities = [
    "Singapore Management University|smu",
    "nanyang technological university|ntu",
    "National University of Singapore|nus",
    "Singapore University of Technology and Design|sutd"
]


def _readLines(path):
    """Return the stripped, non-blank lines of the text file at ``path``.

    Blank lines are skipped because every entry is later used as a regex in
    ``str.contains`` and an empty pattern matches every message.
    """
    with open(path, "r") as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line]


keywords = _readLines("./data/keywords.txt")

# Regex alternations used as the "domain" filter on messages.
infoSys = "information systems|it|information technology|data analytics|computer science|com|computer|info systems|info sys"
simplifiedInfoSys = "computer|com|infosys|information systems|analytics|smart city"
# No trailing "|": an empty alternative would match every message.
simplifiedJC = "rank|a-level|a level"

# Search-term lists, one regex per line of the corresponding file.
poly = _readLines("./data/poly.txt")
lowtier = _readLines("./data/lowTierJC.txt")
midtier = _readLines("./data/midTierJC.txt")
hightier = _readLines("./data/highTierJC.txt")


'''
VADER returns four scores for a sentence.
The Positive, Negative and Neutral scores are the proportions of the text that fall into each category, so they add up to 1.
For example, pos = 0.674, neu = 0.326 and neg = 0.0 mean the sentence was rated 67% Positive, 33% Neutral and 0% Negative.
The Compound score is the sum of all the lexicon ratings, normalized to lie
between -1 (most extreme negative) and +1 (most extreme positive).
'''

# Shared analyser, created on the first getScore call and reused afterwards so
# that scoring many comments does not rebuild it for every sentence.
_vaderAnalyser = None


def getScore(sentence): # return dict, e.g. --> {'neg': 0.0, 'neu': 0.326, 'pos': 0.674, 'compound': 0.7351}
    """Return the VADER polarity scores of ``sentence``.

    Args:
        sentence: Text to score.

    Returns:
        The dict produced by ``SentimentIntensityAnalyzer.polarity_scores``,
        with the keys 'neg', 'neu', 'pos' and 'compound'.
    """
    global _vaderAnalyser
    if _vaderAnalyser is None:
        _vaderAnalyser = SentimentIntensityAnalyzer()
    return _vaderAnalyser.polarity_scores(sentence)

def writeFile(filepath, headerArr, uni, startYear, endYear, terms, domain=""):
    """Write per-year, per-term VADER sentiment statistics to a CSV file.

    Loads ./data/data71.json and ./data/data70.json, keeps the messages that
    match ``uni`` (and ``domain`` when one is given) and, for every year in
    ``startYear..endYear`` and every entry of ``terms``, writes one row:
    ``[year, term, mean compound score, number of comments, negative count,
    neutral count, positive count]``.  The mean is 0 when nothing matched.

    Args:
        filepath: Path of the CSV file to create; an existing file is replaced.
        headerArr: Sequence of column names written as the first row.  Pass a
            list: a plain string would be written one character per column.
        uni: Regex (matched case-insensitively) that a message must contain.
        startYear: First year to report (inclusive).
        endYear: Last year to report (inclusive).
        terms: Iterable of regexes; one output row is written per (year, term).
            Pass a list even for a single regex, since iterating a string
            yields its characters.
        domain: Optional extra regex a message must also contain; "" disables
            this filter.
    """
    # Compound-score thresholds: below NEGATIVE_BELOW counts as negative, above
    # POSITIVE_ABOVE as positive, anything in between as neutral.
    NEGATIVE_BELOW = -0.10
    POSITIVE_ABOVE = 0.10

    df2 = pd.read_json("./data/data71.json")
    df1 = pd.read_json("./data/data70.json")
    # Years are matched as substrings of the timestamp text, so both frames get
    # string timestamps (a no-op for a column that already holds strings).
    # NOTE(review): a year that appears anywhere in the timestamp text matches;
    # presumably timestamps start with a four-digit year - verify against data.
    df1["timestamp"] = df1["timestamp"].astype(str)
    df2["timestamp"] = df2["timestamp"].astype(str)
    # Bot posts are removed from data70 only.
    df1 = df1[df1["name"].str.contains("sneakpeek_bot") == False]
    df = pd.concat([df2, df1])

    # The university/domain filter does not depend on the year or the term, so
    # it is applied once here instead of once per (year, term).
    keep = df["message"].str.contains(uni, case=False, na=False)
    if domain != "":
        keep = keep & df["message"].str.contains(domain, case=False, na=False)
    df = df[keep]

    # newline="" is what the csv module expects for files it writes; the
    # with-block closes the file even if scoring raises.
    with open(filepath, "w", newline="") as f:
        csvWriter = csv.writer(f)
        csvWriter.writerow(headerArr)

        for year in range(startYear, endYear + 1):
            dfYear = df[df["timestamp"].str.contains(str(year), na=False)]
            yearMessages = dfYear["message"]

            for term in terms:
                termMask = yearMessages.str.contains(term, case=False, na=False)
                listOfComments = yearMessages[termMask].tolist()
                numOfComments = len(listOfComments)

                score = 0
                neg_count = 0
                pos_count = 0
                neu_count = 0
                for comment in listOfComments:
                    compound = getScore(comment)["compound"]
                    score += compound
                    if compound < NEGATIVE_BELOW:
                        neg_count += 1
                    elif compound > POSITIVE_ABOVE:
                        pos_count += 1
                    else:
                        neu_count += 1

                # Avoid dividing by zero when no comment matched.
                overallScore = score / numOfComments if numOfComments != 0 else 0
                csvWriter.writerow([year, term, overallScore, numOfComments, neg_count, neu_count, pos_count])

def printWeightiestSentences(filepath, headerArr, uni, startYear, endYear, terms, numOfSentences, domain=""):
    """Write the comments with the largest TF-IDF weight per (year, term) to a CSV.

    Loads ./data/data70.json and ./data/data71.json, keeps the messages that
    match ``uni`` (and ``domain`` when one is given) and, for every year in
    ``startYear..endYear`` and every entry of ``terms``, fits a TF-IDF
    vectorizer on the matching comments.  A comment's weight is the sum of its
    TF-IDF row.  Up to ``numOfSentences`` distinct comments are written,
    heaviest first, as ``[year, university label, term, sentence, weight]``.
    When nothing matches, a single row with an empty sentence and weight 0 is
    written instead.

    Args:
        filepath: Path of the CSV file to create; an existing file is replaced.
        headerArr: Sequence of column names written as the first row.
        uni: Regex (matched case-insensitively) that a message must contain.
        startYear: First year to report (inclusive).
        endYear: Last year to report (inclusive).
        terms: Iterable of regexes; results are produced per (year, term).
        numOfSentences: Maximum number of sentences written per (year, term).
        domain: Optional extra regex a message must also contain; "" disables
            this filter.
    """
    # Value of the second CSV column.  It stays "uni|domain" when a domain is
    # given, but the bare ``uni`` regex is used for filtering: filtering on
    # "uni|domain" AND ``domain`` would reduce to the domain filter alone and
    # every university would get the same sentences.
    uniLabel = uni if domain == "" else uni + "|" + domain

    # The data does not change between years, so it is loaded once.
    df1 = pd.read_json("./data/data70.json")
    df2 = pd.read_json("./data/data71.json")
    # Years are matched as substrings of the timestamp text, so both frames get
    # string timestamps (a no-op for a column that already holds strings).
    df1["timestamp"] = df1["timestamp"].astype(str)
    df2["timestamp"] = df2["timestamp"].astype(str)
    # Bot posts are removed from data70 only.
    df1 = df1[df1["name"].str.contains("sneakpeek_bot") == False]
    comments = pd.concat([df1, df2])

    keep = comments["message"].str.contains(uni, case=False, na=False)
    if domain != "":
        keep = keep & comments["message"].str.contains(domain, case=False, na=False)
    comments = comments[keep]

    # Recent scikit-learn releases accept stop words as a list, not a set.
    stopWords = sorted(swords)
    limit = max(numOfSentences, 0)

    # newline="" is what the csv module expects for files it writes; the
    # with-block closes the file even if vectorizing raises.
    with open(filepath, "w", newline="") as f:
        csvWriter = csv.writer(f)
        csvWriter.writerow(headerArr)

        for year in range(startYear, endYear + 1):
            dfYear = comments[comments["timestamp"].str.contains(str(year), na=False)]
            yearMessages = dfYear["message"]

            for term in terms:
                termMask = yearMessages.str.contains(term, case=False, na=False)
                listOfComments = yearMessages[termMask].tolist()

                if len(listOfComments) == 0:
                    csvWriter.writerow([str(year), uniLabel, term, "", 0])
                    continue

                vectorizer = TfidfVectorizer(stop_words=stopWords)
                X = vectorizer.fit_transform(listOfComments)

                # Keyed by comment text, so a repeated comment appears once.
                weights = {}
                for comment, row in zip(listOfComments, X.toarray()):
                    weights[comment] = float(sum(row))

                # sorted() is stable: equally weighted comments keep their
                # original order.
                ranked = sorted(weights.items(), key=lambda item: item[1], reverse=True)

                # Slicing never runs past the number of distinct comments.
                for sentence, weight in ranked[:limit]:
                    csvWriter.writerow([str(year), uniLabel, term, str(sentence), weight])

In [None]:
"""
Diagram of result dataframe
+------+--------+--------+-----+------+----
| Year |   NP   |   SP   | ... | ACJC | ... 
+------+--------+--------+-----+------+----
| 2014 | 0.9872 | -0.023 | ... | 0.33 | ...
"""

In [None]:
#JC Overall
for uniRegex in universities:
    uniName = uniRegex.split("|")[-1]
    pathname = f"./output/{uniName} - JC (Overall).csv"
    # writeFile passes the header to csv.writer.writerow and then writes seven
    # values per row, so the header must be a list of seven names; a plain
    # string would be written one character per column.
    header = ["year", "search term", "sentiment score", "Number of comments", "Negative Comments", "Neutral Comments", "Positive Comments"]
    # simplifiedJC is one regex, so it is wrapped in a list: iterating the
    # string itself would search for each character separately.  A "|" at
    # either end is stripped because an empty alternative matches every message.
    searchTerms = [simplifiedJC.strip("|")]

    writeFile(pathname, header, uniRegex, 2014, 2020, searchTerms, simplifiedInfoSys)

In [None]:
# low tier JC sentiment
for uniRegex in universities:
    uniName = uniRegex.split("|")[-1]
    pathname = f"./output/{uniName} - lowtier.csv"
    # writeFile writes seven values per row, so the header names all seven.
    header = ["year", "search term", "sentiment score", "Number of comments", "Negative Comments", "Neutral Comments", "Positive Comments"]
    start = 2014
    end = 2021
    searchTerms = lowtier
    domain = simplifiedInfoSys

    writeFile(pathname, header, uniRegex, start, end, searchTerms, domain)

In [None]:
# low tier jc weighted sentences
for uniRegex in universities:
    uniName = uniRegex.split("|")[-1]
    # Separate file from the low tier sentiment CSV so neither overwrites the other.
    pathname = f"./output/{uniName} - lowtier Weighted Sentences.csv"
    # printWeightiestSentences writes five values per row:
    # year, university label, search term, sentence, weight.
    header = ["year", "university", "search term", "sentence", "tfidf score"]
    start = 2014
    end = 2021
    searchTerms = lowtier
    domain = simplifiedInfoSys
    print (uniRegex)
    # The 10 is numOfSentences; without it the domain regex would be taken as
    # the sentence count and no domain filter would be applied.
    printWeightiestSentences(pathname, header, uniRegex, start, end, searchTerms, 10, domain)

In [None]:
# mid tier JC
for uniRegex in universities:
    uniName = uniRegex.split("|")[-1]
    pathname = f"./output/{uniName} - midtier.csv"
    # writeFile writes seven values per row, so the header names all seven.
    header = ["year", "search term", "sentiment score", "Number of comments", "Negative Comments", "Neutral Comments", "Positive Comments"]
    start = 2014
    end = 2021
    searchTerms = midtier
    domain = simplifiedInfoSys

    writeFile(pathname, header, uniRegex, start, end, searchTerms, domain)

In [None]:
# high tier JC
for uniRegex in universities:
    uniName = uniRegex.split("|")[-1]
    pathname = f"./output/{uniName} - hightier.csv"
    # writeFile writes seven values per row, so the header names all seven.
    header = ["year", "search term", "sentiment score", "Number of comments", "Negative Comments", "Neutral Comments", "Positive Comments"]
    start = 2014
    end = 2021
    searchTerms = hightier
    domain = simplifiedInfoSys

    writeFile(pathname, header, uniRegex, start, end, searchTerms, domain)

In [None]:
# combined tiers JC: one regex per tier, each matching any search term of that tier
lowtierRegex, midtierRegex, hightierRegex = (
    "|".join(tier) for tier in (lowtier, midtier, hightier)
)
combinedTiers = [lowtierRegex, midtierRegex, hightierRegex]

# Settings shared by every university.
header = ["year", "search term", "sentiment score", "Number of comments", "Negative Comments", "Neutral Comments", "Positive Comments"]
start, end = 2014, 2021
searchTerms = combinedTiers
domain = simplifiedInfoSys

for uniRegex in universities:
    # The abbreviation after the last "|" names the output file.
    uniName = uniRegex.split("|")[-1]
    pathname = f"./output/{uniName} - JC combined tiers.csv"
    print (uniRegex)
    writeFile(pathname, header, uniRegex, start, end, searchTerms, domain)

In [None]:
# combined JC Weighted Sentences
# def printWeightiestSentences(filepath, headerArr, uni, startYear, endYear, terms, numOfSentences, domain=""):
lowtierRegex = "|".join(lowtier)
midtierRegex = "|".join(midtier)
hightierRegex = "|".join(hightier)

combinedTiers = [lowtierRegex, midtierRegex, hightierRegex]

for uniRegex in universities:
    uniName = uniRegex.split("|")[-1]
    pathname = f"./output/{uniName} - JC combined tiers Weighted Sentences.csv"
    # printWeightiestSentences writes five values per row:
    # year, university label, search term, sentence, weight.
    header = ["year", "university", "search term", "sentence", "score"]
    start = 2014
    end = 2021
    searchTerms = combinedTiers
    domain = simplifiedInfoSys
    print (uniRegex)
    printWeightiestSentences(pathname, header, uniRegex, start, end, searchTerms, 10, domain)

In [None]:
# combined tiers JC without infosys
lowtierRegex = "|".join(lowtier)
midtierRegex = "|".join(midtier)
hightierRegex = "|".join(hightier)

combinedTiers = [lowtierRegex, midtierRegex, hightierRegex]

for uniRegex in universities:
    uniName = uniRegex.split("|")[-1]
    # This cell writes the sentiment table, so the file is not named
    # "Weighted Sentences".
    pathname = f"./output/{uniName} - JC combined tiers without infosys.csv"
    header = ["year", "search term", "sentiment score", "Number of comments", "Negative Comments", "Neutral Comments", "Positive Comments"]
    start = 2014
    end = 2021
    searchTerms = combinedTiers
    print (uniRegex)
    # No domain argument: writeFile's default "" disables the domain filter.
    writeFile(pathname, header, uniRegex, start, end, searchTerms)

In [None]:
# combined JC Weighted Sentences without infosys
# def printWeightiestSentences(filepath, headerArr, uni, startYear, endYear, terms, numOfSentences, domain=""):
lowtierRegex = "|".join(lowtier)
midtierRegex = "|".join(midtier)
hightierRegex = "|".join(hightier)

combinedTiers = [lowtierRegex, midtierRegex, hightierRegex]

for uniRegex in universities:
    uniName = uniRegex.split("|")[-1]
    # Own file name, so the CSV written with the infosys filter is not overwritten.
    pathname = f"./output/{uniName} - JC combined tiers Weighted Sentences without infosys.csv"
    # printWeightiestSentences writes five values per row:
    # year, university label, search term, sentence, weight.
    header = ["year", "university", "search term", "sentence", "score"]
    start = 2014
    end = 2021
    searchTerms = combinedTiers
    print (uniRegex)
    # No domain argument: the default "" disables the domain filter.
    printWeightiestSentences(pathname, header, uniRegex, start, end, searchTerms, 10)

In [None]:
# polytechnics: sentiment per year for every polytechnic search term
header = ["year", "search term", "sentiment score", "Number of comments", "Negative Comments", "Neutral Comments", "Positive Comments"]
start, end = 2014, 2021
searchTerms = poly
domain = simplifiedInfoSys

for uniRegex in universities:
    # The abbreviation after the last "|" names the output file.
    uniName = uniRegex.split("|")[-1]
    pathname = f"./output/{uniName} - poly.csv"
    writeFile(pathname, header, uniRegex, start, end, searchTerms, domain)

In [3]:
# polytechnics weighted sentences
for uniRegex in universities:
    uniName = uniRegex.split("|")[-1]
    pathname = f"./output/{uniName} - poly Weighted Sentences.csv"
    # printWeightiestSentences writes five values per row:
    # year, university label, search term, sentence, weight.
    header = ["year", "university", "search term", "sentence", "score"]
    start = 2014
    end = 2021
    searchTerms = poly
    domain = simplifiedInfoSys
    print (uniRegex)
    printWeightiestSentences(pathname, header, uniRegex, start, end, searchTerms, 10, domain)

Singapore Management University|smu
nanyang technological university|ntu
National University of Singapore|nus
Singapore University of Technology and Design|sutd


In [4]:
# keywords: sentiment per year for every keyword search term
header = ["year", "search term", "sentiment score", "Number of comments", "Negative Comments", "Neutral Comments", "Positive Comments"]
start, end = 2014, 2020
searchTerms = keywords
domain = simplifiedInfoSys

for uniRegex in universities:
    # The abbreviation after the last "|" names the output file.
    uniName = uniRegex.split("|")[-1]
    pathname = f"./output/{uniName} - keywords.csv"
    writeFile(pathname, header, uniRegex, start, end, searchTerms, domain)

In [5]:
# keywords weighted sentences
for uniRegex in universities:
    uniName = uniRegex.split("|")[-1]
    pathname = f"./output/{uniName} - keywords Weighted Sentences.csv"
    # printWeightiestSentences writes five values per row:
    # year, university label, search term, sentence, weight.
    header = ["year", "university", "search term", "sentence", "score"]
    start = 2014
    end = 2020
    searchTerms = keywords
    domain = simplifiedInfoSys
    print (uniRegex)
    printWeightiestSentences(pathname, header, uniRegex, start, end, searchTerms, 10, domain)

Singapore Management University|smu
nanyang technological university|ntu
National University of Singapore|nus
Singapore University of Technology and Design|sutd
