In [1]:
import pandas as pd 
import json
import numpy as np 
import matplotlib.pyplot as plt 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [62]:
# Load the two scraped forum dumps and stack them into a single frame.
# The original row labels are kept, so index values can repeat across the dumps.
df1 = pd.read_json("./data/data71.json")
df = pd.concat([pd.read_json("./data/data70.json"), df1])

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42724 entries, 0 to 42487
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   name       42724 non-null  object
 1   message    42724 non-null  object
 2   timestamp  42724 non-null  object
dtypes: object(3)
memory usage: 1.3+ MB


In [48]:
# Each entry is a regex alternation: the university's full name, then its short
# code. The text after the last "|" doubles as the label used in output file names.
universities = [
    "Singapore Management University|smu",
    "nanyang technological university|ntu",
    "National University of Singapore|nus",
    "Singapore University of Technology and Design|sutd"
]

# Regex alternations used to narrow messages down to a subject area.
# NOTE(review): these are matched as plain substrings, so short alternatives such
# as "it" and "com" also hit inside longer words ("with", "welcome") -- confirm
# that this breadth is intended before relying on the filtered results.
infoSys = "information systems|it|information technology|data analytics|computer science|com|computer|info systems|info sys"
simplifiedInfoSys = "computer|com|infosys|information systems|analytics"
# No trailing "|" here: an empty alternative matches every string, which would
# silently turn this filter into a no-op.
simplifiedJC = "rank|a-level|a level"

# One entry per line of polyjc.txt, with surrounding whitespace removed.
with open("./data/polyjc.txt", "r") as f:
    polyjc = [line.strip() for line in f]

def getScore(sentence):
    """Return the VADER polarity scores for `sentence`.

    The result is a dict, e.g.
    {'neg': 0.0, 'neu': 0.326, 'pos': 0.674, 'compound': 0.7351}.

    A single SentimentIntensityAnalyzer is created on first use and reused on
    later calls, instead of constructing a fresh analyser for every comment.
    """
    analyser = getattr(getScore, "_analyser", None)
    if analyser is None:
        # Cache the analyser on the function object so repeated calls share it.
        analyser = getScore._analyser = SentimentIntensityAnalyzer()
    return analyser.polarity_scores(sentence)

def writeFile(filepath, header, uni, startYear, endYear, queryRegexArray, domain=" "):
    """Write a year-by-term CSV of mean VADER compound scores.

    The two JSON dumps under ./data are re-read on every call. For each year in
    [startYear, endYear] one row is written: the year followed by, for every
    regex in queryRegexArray, the mean compound score of the messages that match
    that regex, `uni` and `domain` (all case-insensitive) and were posted in
    that year. A cell with no matching messages is written as 0.

    Args:
        filepath: CSV path to create or overwrite.
        header: Text written verbatim before the rows (callers supply the
            trailing newline).
        uni: Regex identifying the university.
        startYear: First year to report (inclusive).
        endYear: Last year to report (inclusive).
        queryRegexArray: Regexes, one output column each.
        domain: Regex every message must also match; the default " " only
            requires the message to contain a space.
    """
    df = pd.concat([pd.read_json("./data/data70.json"), pd.read_json("./data/data71.json")])
    messages = df["message"]

    # Timestamps look like "13-06-2015, 03:13 PM#2486". Drop the "#<post number>"
    # suffix so a post numbered e.g. #2015 is not counted as posted in 2015.
    postedOn = df["timestamp"].str.split("#").str[0]

    # Filters that do not depend on the year or the query term are built once.
    sharedMask = (
        messages.str.contains(uni, case=False, na=False)
        & messages.str.contains(domain, case=False, na=False)
    )

    # The context manager closes the file even if scoring raises part-way.
    with open(filepath, "w+") as f:
        f.write(header)
        for year in range(startYear, endYear + 1):
            yearMask = sharedMask & postedOn.str.contains(str(year), na=False)
            yearStr = str(year)
            for term in queryRegexArray:
                termMask = messages.str.contains(term, case=False, na=False)
                listOfComments = messages[yearMask & termMask].tolist()
                numOfComments = len(listOfComments)
                overallScore = 0
                if numOfComments != 0:
                    score = sum(getScore(comment)["compound"] for comment in listOfComments)
                    overallScore = score / numOfComments
                yearStr += "," + str(overallScore)
            f.write(yearStr)
            f.write("\n")

def writeFile2(filepath, header, uni, startYear, endYear, queryRegexArray):
    """Write a year-by-term CSV of mean VADER compound scores (no domain filter).

    The two JSON dumps under ./data are re-read on every call. For each year in
    [startYear, endYear] one row is written: the year followed by, for every
    regex in queryRegexArray, the mean compound score of the messages that match
    both that regex and `uni` (case-insensitive) and were posted in that year.
    A cell with no matching messages is written as 0.0.

    Args:
        filepath: CSV path to create or overwrite.
        header: Text written verbatim before the rows (callers supply the
            trailing newline).
        uni: Regex identifying the university.
        startYear: First year to report (inclusive).
        endYear: Last year to report (inclusive).
        queryRegexArray: Regexes, one output column each.
    """
    df = pd.concat([pd.read_json("./data/data70.json"), pd.read_json("./data/data71.json")])
    messages = df["message"]

    # Timestamps look like "13-06-2015, 03:13 PM#2486". Drop the "#<post number>"
    # suffix so a post numbered e.g. #2015 is not counted as posted in 2015.
    postedOn = df["timestamp"].str.split("#").str[0]

    # The university filter does not depend on the year or term; build it once.
    uniMask = messages.str.contains(uni, case=False, na=False)

    # The context manager closes the file even if scoring raises part-way.
    with open(filepath, "w+") as f:
        f.write(header)
        for year in range(startYear, endYear + 1):
            yearMask = uniMask & postedOn.str.contains(str(year), na=False)
            yearStr = str(year)
            for term in queryRegexArray:
                termMask = messages.str.contains(term, case=False, na=False)
                listOfComments = messages[yearMask & termMask].tolist()
                numOfComments = len(listOfComments)
                if numOfComments == 0:
                    # Keep the float form ("0.0") this function has always written.
                    overallScore = 0.0
                else:
                    score = sum(getScore(comment)["compound"] for comment in listOfComments)
                    overallScore = score / numOfComments
                yearStr += "," + str(overallScore)
            f.write(yearStr)
            f.write("\n")

'''
The Positive, Negative and Neutral scores represent the proportion of text that falls in these categories. 
This means our sentence was rated as 67% Positive, 33% Neutral and 0% Negative. Hence all these should add up to 1.
The Compound score is a metric that calculates the sum of all the lexicon ratings 
which have been normalized between -1(most extreme negative) and +1 (most extreme positive).
'''


'\nThe Positive, Negative and Neutral scores represent the proportion of text that falls in these categories. \nThis means our sentence was rated as 67% Positive, 33% Neutral and 0% Negative. Hence all these should add up to 1.\nThe Compound score is a metric that calculates the sum of all the lexicon ratings \nwhich have been normalized between -1(most extreme negative) and +1 (most extreme positive).\n'

In [4]:
"""
Diagram of result dataframe
+------+--------+--------+-----+------+----
| Year |   NP   |   SP   | ... | ACJC | ... 
+------+--------+--------+-----+------+----
| 2014 | 0.9872 | -0.023 | ... | 0.33 | ...
"""

'\nDiagram of result dataframe\n+------+--------+--------+-----+------+----\n| Year |   NP   |   SP   | ... | ACJC | ... \n+------+--------+--------+-----+------+----\n| 2014 | 0.9872 | -0.023 | ... | 0.33 | ...\n'

In [39]:
# poly
# The first five polyjc entries are treated as the polytechnics; each becomes one
# CSV column labelled with the text after its last "|".
for uni in universities:
    uniName = uni.rsplit("|", 1)[-1]
    schools = list(polyjc[:5])
    header = ",".join(["year"] + [sch.rsplit("|", 1)[-1] for sch in schools]) + "\n"
    pathname = f"./output/{uniName} - poly.csv"

    writeFile(pathname, header, uni, 2014, 2020, schools, simplifiedInfoSys)

In [49]:
#JC
# Every polyjc entry after the first five is treated as a junior college; each
# becomes one CSV column labelled with the text after its last "|".
for uni in universities:
    uniName = uni.rsplit("|", 1)[-1]
    schools = list(polyjc[5:])
    header = ",".join(["year"] + [sch.rsplit("|", 1)[-1] for sch in schools]) + "\n"
    pathname = f"./output/{uniName} - jc.csv"

    writeFile2(pathname, header, uni, 2014, 2020, schools)

In [76]:
#JC Overall
# One CSV per university with a single "score" column for messages matching "jc".
header = "year,score\n"
for uni in universities:
    uniName = uni.rsplit("|", 1)[-1]
    pathname = f"./output/{uniName} - JC (Overall).csv"

    writeFile2(pathname, header, uni, 2014, 2020, ["jc"])

In [30]:
# IS Related Courses - for all uni
# NOTE(review): the line that opened the output file was truncated in this export
# (only `timents for all uni (IS).csv", "w+")` survived). The "./output/sen" prefix
# below is reconstructed from the other cells' output paths -- confirm it.
isMessages = df["message"]

# Timestamps look like "13-06-2015, 03:13 PM#2486". Drop the "#<post number>"
# suffix so a post numbered e.g. #2015 is not counted as posted in 2015.
isPostedOn = df["timestamp"].str.split("#").str[0]

# The subject-area filter is the same for every year and university.
isMask = isMessages.str.contains(infoSys, case=False, na=False)

# The context manager closes the file even if scoring raises part-way.
with open("./output/sentiments for all uni (IS).csv", "w+") as f:
    f.write("year,smu,ntu,nus,sutd\n")

    for year in range(2014, 2022):
        yearStr = str(year)
        yearMask = isMask & isPostedOn.str.contains(str(year), na=False)
        for uni in universities:
            # Match case-insensitively: lowercasing the message and searching with
            # a mixed-case pattern can never match alternatives such as
            # "National University of Singapore".
            uniMask = isMessages.str.contains(uni, case=False, na=False)
            listOfComments = isMessages[yearMask & uniMask].tolist()
            numOfComments = len(listOfComments)

            if numOfComments == 0:
                overallScore = 0
            else:
                score = sum(getScore(comment)["compound"] for comment in listOfComments)
                overallScore = score / numOfComments
            yearStr += "," + str(overallScore)
        f.write(yearStr)
        f.write("\n")

In [72]:
# Spot check: count the messages that mention both "jurong" and "ntu" and whose
# timestamp text contains "2018".
mentionsJurong = df["message"].str.contains("jurong", case=False)
mentionsNtu = df["message"].str.contains("ntu", case=False)
stamped2018 = df["timestamp"].str.contains(str(2018))
listOfComments = df.loc[mentionsJurong & mentionsNtu & stamped2018, "message"].tolist()
print(len(listOfComments))

2


In [74]:
# Show every row whose message contains "junior college" (case-insensitive).
hasJuniorCollege = df["message"].str.contains("junior college", case=False)
df[hasJuniorCollege]

Unnamed: 0,name,message,timestamp
5838,jinglejingle,"Hi, was wondering if institutions for the MOE ...","13-06-2015, 03:13 PM#2486"
5839,IAmZTX,"jinglejingle wrote: Hi, was wondering if inst...","13-06-2015, 05:02 PM#2487"
12776,marigoldhl,"Apollo97 wrote: Nope, nus dont count mt if ur...","11-06-2016, 08:16 PM#8374"
13311,wei1995,firezero10 wrote: Getting into the course is ...,"31-05-2014, 09:19 PM#3059"
15486,hpixxx,"Hi, may i just enquire for the NTU matriculati...","04-06-2017, 09:01 PM#124"
16575,1blursotong,alleycat96 wrote: Really sorry for posting ag...,"31-05-2016, 09:32 PM#7703"
16577,photobuff,1blursotong wrote: Hi! I'm not a poly graduat...,"31-05-2016, 09:41 PM#7705"
16582,Insertnamehere,1blursotong wrote: Hi! I'm not a poly graduat...,"31-05-2016, 10:05 PM#7710"
31061,Jas786,mpcdude wrote: Hey have you all heard the off...,"20-04-2017, 02:35 PM#3964"
33658,Username01,Hello everyone Im stuck btwn choosing NUS Sci ...,"25-04-2016, 03:56 PM#3201"
