In [1]:
import pandas as pd 
import json
import numpy as np 
import matplotlib.pyplot as plt 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
swords = set(stopwords.words('english'))

In [2]:
df = pd.read_json("./data/data71.json")
df1 = pd.read_json("./data/data70.json")

df = pd.concat([df, df1])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82417 entries, 0 to 39928
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   name       82417 non-null  object
 1   message    82417 non-null  object
 2   timestamp  82417 non-null  object
dtypes: object(3)
memory usage: 2.5+ MB


In [3]:
universities = [
    "Singapore Management University|smu",
    "nanyang technological university|ntu",
    "National University of Singapore|nus",
    "Singapore University of Technology and Design|sutd"
]

infoSys = "information systems|it|information technology|data analytics|computer science|com|computer|info systems|info sys"
simplifiedInfoSys = "computer|com|infosys|information systems|analytics"
simplifiedJC = "rank|a-level|a level|"

polyjc = []
with open("./data/polyjc.txt", "r") as f:
    for line in f:
        polyjc.append(line.strip()) 

'''
The Positive, Negative and Neutral scores represent the proportion of text that falls in these categories. 
This means our sentence was rated as 67% Positive, 33% Neutral and 0% Negative. Hence all these should add up to 1.
The Compound score is a metric that calculates the sum of all the lexicon ratings 
which have been normalized between -1(most extreme negative) and +1 (most extreme positive).
'''

def getScore(sentence): # return dict, e.g. --> {'neg': 0.0, 'neu': 0.326, 'pos': 0.674, 'compound': 0.7351}
    analyser = SentimentIntensityAnalyzer()
    return analyser.polarity_scores(sentence)

def writeFile(filepath, header, uni, startYear, endYear, queryRegexArray, domain=""):
    df = pd.read_json("./data/data71.json")
    # df1 = pd.read_json("./data/data70.json")
    # df = pd.concat([df, df1])

    f = open(filepath, "w+")
    f.write(header)
    for year in range(startYear, endYear + 1):
        dfYear = df[df["timestamp"].str.contains(str(year))]
        yearStr = str(year)
        for term in queryRegexArray:
            if domain != "" :
                term += "|" + domain
            listOfComments = dfYear[ dfYear["message"].str.contains(term, case=False) & dfYear["message"].str.contains(uni, case=False)]["message"].values.tolist()
            score = 0
            numOfComments = len(listOfComments) 
            for comment in listOfComments:
                score += getScore(comment)["compound"]
            overallScore = 0
            if numOfComments != 0:
                overallScore = score / numOfComments
            yearStr += "," + str(overallScore)
        f.write(yearStr)
        f.write("\n")
    f.close()

def printWeightiestSentences(filepath, header, uni, startYear, endYear, queryRegexArray, numOfSentences, domain=""):
    df = pd.read_json("./data/data70.json")
    df1 = pd.read_json("./data/data71.json")
    df = pd.concat([df, df1])

    f = open(filepath, "w+")
    f.write(header)

    for year in range(startYear, endYear + 1):
        yearStr = str(year)
        dfYear = df[df["timestamp"].str.contains(yearStr, na=False)]
        for term in queryRegexArray:
            if domain != "":
                term = term + "|" + domain
            listOfComments = dfYear[ dfYear["message"].str.contains(term, case=False, na=False) & dfYear["message"].str.contains(uni, case=False, na=False) ]["message"].values.tolist()

            vectorizer = TfidfVectorizer(stop_words=swords)

            X = vectorizer.fit_transform(listOfComments)

            feature_names = vectorizer.get_feature_names()

            vocab = vectorizer.vocabulary_

            unsorted_result = {}

            for i in range(len(list(X.toarray()))) :
                row = list(list(X.toarray())[i])
                unsorted_result[listOfComments[i]] = sum(row)
            
            result = pd.DataFrame()
            result["sentence"] = unsorted_result.keys()
            result["value"] = unsorted_result.values()
            df = result.sort_values(by=["value"], ascending=False)
            top10sentences = df.nlargest(num, "value")["sentence"].tolist()
            top10values = df.nlargest(num, "value")["value"].tolist()
            for i in range(num):
                line = str(year) + "," + uni + "," + term + "," + top10sentences[i] + "," + str(top10values[i]) + "\n"
                f.write(line)
                
    f.close()

In [None]:
"""
Diagram of result dataframe
+------+--------+--------+-----+------+----
| Year |   NP   |   SP   | ... | ACJC | ... 
+------+--------+--------+-----+------+----
| 2014 | 0.9872 | -0.023 | ... | 0.33 | ...
"""

In [5]:
# poly
for uni in universities:
    uniName = uni.split("|")[-1]
    pathname = f"./output/{uniName} - poly.csv"
    header = "year"
    polys = []
    for sch in polyjc[:5]:
        header += "," + sch.split("|")[-1]
        polys.append(sch)
    header += "\n"

    writeFile(pathname, header, uni, 2014, 2020, polys, simplifiedInfoSys)

In [6]:
#JC
for uni in universities:
    uniName = uni.split("|")[-1]
    pathname = f"./output/{uniName} - jc.csv"
    header = "year"
    schools = []
    for sch in polyjc[5:]:
        header += "," + sch.split("|")[-1]
        schools.append(sch)
    header += "\n"

    writeFile(pathname, header, uni, 2014, 2020, schools, simplifiedInfoSys)

In [8]:
#JC Overall
for uni in universities:
    uniName = uni.split("|")[-1]
    pathname = f"./output/{uniName} - JC (Overall).csv"
    header = "year,score\n"

    writeFile(pathname, header, uni, 2014, 2020, ["jc"], simplifiedInfoSys)

In [None]:
# IS Related Courses - for all uni
timents for all uni (IS).csv", "w+")
f.write("year,smu,ntu,nus,sutd\n")

for year in range(2014, 2022):
    yearStr = str(year)
    for uni in universities:
        listOfComments = df[ df["message"].str.lower().str.contains(uni) & df["timestamp"].str.contains(str(year)) & df["message"].str.lower().str.contains(infoSys) ]["message"].values.tolist()
        score = 0
        numOfComments = len(listOfComments) 
        for comment in listOfComments:
            score += getScore(comment)["compound"]

        if numOfComments == 0:
            overallScore = 0
        else:
            overallScore = score / numOfComments
        yearStr += "," + str(overallScore)
    f.write(yearStr)
    f.write("\n")
f.close()

In [4]:
filepath = "./output/tempWeightedSentences.csv"
header = "year,uni,search term,sentence,tfidf value\n"
uni = "Singapore Management University|smu"
start = 2014
end = 2021
arr = polyjc
num = 10

printWeightiestSentences(filepath, header, uni, start, end, arr, num, simplifiedInfoSys)

KeyError: 'timestamp'