In [None]:
import nltk
import numpy
import openpyxl
import pandas as pd
import string

from ipyfilechooser import FileChooser
from nltk.sentiment import SentimentIntensityAnalyzer as sia
from openpyxl.utils.dataframe import dataframe_to_rows as df2r

# Globally disable pandas' chained-assignment (SettingWithCopy) warning for this notebook.
pd.options.mode.chained_assignment = None # suppress warning

In [None]:
# Term dictionaries: column 0 holds the term, column 1 (where used) its weight.
pos_words = pd.read_table('dictionaries/green terminology.txt', header=None)
neg_words = pd.read_table('dictionaries/red terminology.txt', header=None)
neut_words = pd.read_table('dictionaries/white terminology.txt', header=None)

# Positive terms: keep only the rows whose weight is not zero.
pos_rows = [row for row in range(len(pos_words)) if pos_words.iloc[row, 1] != 0]
pos_list = [pos_words.iloc[row, 0] for row in pos_rows]
pos_weight = [pos_words.iloc[row, 1] for row in pos_rows]

# Negative terms: same filtering as for the positive dictionary.
neg_rows = [row for row in range(len(neg_words)) if neg_words.iloc[row, 1] != 0]
neg_list = [neg_words.iloc[row, 0] for row in neg_rows]
neg_weight = [neg_words.iloc[row, 1] for row in neg_rows]

# Neutral terms carry no weight, so every row is kept.
neut_weight = 0
neut_list = [neut_words.iloc[row, 0] for row in range(len(neut_words))]

In [None]:
# Download the NLTK resources named below (quiet=True silences progress output).
nltk.download(['names', 'stopwords', 'state_union', 'averaged_perceptron_tagger', 'vader_lexicon'], quiet=True)

# Characters stripped from comments by clear_punctuation(). The backslash is
# written as an explicit '\\' escape; the value is unchanged, but the literal
# no longer relies on the invalid (deprecated) escape sequence "\,".
punct = '''!()-[]{};:'"\\,<>./?@#$%^&*_~'''
stopwords = nltk.corpus.stopwords.words('english')

# Shared VADER analyzer instance used by get_sentiment_score().
sii = sia()

# Substrings identifying the three comment columns in the spreadsheet headers.
approving = 'strengths of this experience'
critical = 'better learning experience'
additional = 'Nothing further to add'

# Position in comment_type is the "type" index passed around as `t`;
# comment_weighting is indexed the same way.
comment_type = [approving, critical, additional]
comment_weighting = [.975, .975, 1.05] # slightly overweight additional comments

# Small epsilon used to push derived scores just past their cutoffs.
e = .001

medium_cut = .05 # seems to be standard in literature for VADER compound score
high_cut = .12 # guess at reasonable value
adjustment_value = .04 + e # three net positive/negative statements imply high cutoff

# Labels returned by eval_weighting().
very_positive_label = 'Very positive'
very_negative_label = 'Very negative'
positive_label = 'Positive'
negative_label = 'Negative'
neutral_label = 'Neutral'

# Canonical scores: each sits just beyond the cutoff of its label so that
# eval_weighting() maps it back to that label.
very_positive_score = high_cut + e
very_negative_score = -1 * very_positive_score
positive_score = medium_cut + e
negative_score = -1 * positive_score
neutral_score = 0

# Comment length thresholds, measured in whitespace-separated words.
overlong = 300 # words
lengthy = 150
brief = 15

# Shorthand for "missing value".
n = numpy.nan

def eval_weighting(w):
    """Translate a numeric score ``w`` into one of the five sentiment labels.

    The negative bands are checked before the positive ones; a score that
    clears none of the cutoffs is labelled neutral.
    """
    if w < -high_cut:
        return very_negative_label
    if w < -medium_cut:
        return negative_label
    if w > high_cut:
        return very_positive_label
    if w > medium_cut:
        return positive_label
    return neutral_label

def clear_punctuation(s):
    """Return ``s`` with every character listed in ``punct`` removed."""
    return ''.join(ch for ch in s if ch not in punct)

def matches(s, t):
    """Return True when ``s`` and ``t`` are equal ignoring case and surrounding whitespace."""
    left = s.strip().lower()
    right = t.strip().lower()
    return left == right

def clean_data(df, cols): # prepare comments for analysis
    """Normalise the comment columns of ``df`` in place and return ``df``.

    For each column position in ``cols``, every string cell is stripped,
    lower-cased and run through ``clear_punctuation``; a cell that then
    reads ``na`` and every non-string cell (e.g. NaN) becomes ``''``.

    The previous version indexed ``df.iloc[i, j]`` with ``i`` taken from
    ``cols`` and ``j`` ranging over all columns, i.e. it treated the column
    positions as *row* numbers. That blanked unrelated cells and left the
    actual comment columns uncleaned.

    Args:
        df: DataFrame to clean (modified in place).
        cols: iterable of integer column positions to clean.

    Returns:
        The same DataFrame object.
    """
    for col in cols:
        cleaned = []
        for t in df.iloc[:, col]:
            if isinstance(t, str):
                u = clear_punctuation(t.strip().lower())
                cleaned.append('' if matches(u, 'na') else u)
            else:
                cleaned.append('')
        # Replace the whole column at once so a non-object column (e.g. one
        # that is entirely NaN) can take string values without dtype clashes.
        df[df.columns[col]] = cleaned
    return df

def get_col_index(df, s): # find which column holds a string
    """Return the position of the first column whose header contains ``s``.

    Non-string headers (e.g. integer column names) are skipped instead of
    raising ``TypeError``, and the position comes from enumeration so a
    duplicated header still yields a plain integer.

    Args:
        df: DataFrame whose column headers are searched.
        s: substring to look for in the headers.

    Returns:
        The integer column position, or None when no header contains ``s``.
    """
    for position, header in enumerate(df.columns):
        if isinstance(header, str) and s in header:
            return position
    return None

def clean_filename(s):
    """Keep only alphanumerics, dots, underscores, hyphens and spaces from ``s``."""
    kept = []
    for ch in s:
        if ch.isalnum() or ch in '._- ':
            kept.append(ch)
    return ''.join(kept)

def is_approving_comment(i):
    """Return True when ``i`` is the position of the approving entry in ``comment_type``."""
    return i == comment_type.index(approving)

def is_critical_comment(i):
    """Return True when ``i`` is the position of the critical entry in ``comment_type``."""
    return i == comment_type.index(critical)

def is_additional_comment(i):
    """Return True when ``i`` is the position of the additional entry in ``comment_type``."""
    return i == comment_type.index(additional)

def is_overlong_comment(c):
    """Return True when ``c`` has more than ``overlong`` whitespace-separated words."""
    word_count = len(c.split())
    return word_count > overlong

def is_lengthy_comment(c):
    """Return True when ``c`` has more than ``lengthy`` whitespace-separated words."""
    word_count = len(c.split())
    return word_count > lengthy

def is_brief_comment(c):
    """Return True when ``c`` has fewer than ``brief`` whitespace-separated words."""
    word_count = len(c.split())
    return word_count < brief

def is_average_length_comment(c):
    """Return True when ``c`` is neither lengthy nor brief."""
    return not is_lengthy_comment(c) and not is_brief_comment(c)

def is_very_positive_comment(c, t):
    """Return True when the lexical score or the sentiment score of ``c`` rates very positive.

    ``t`` is the comment-type index forwarded to the lexical scorer.
    """
    return (
        is_very_positive_score(get_lexical_semantic_score(c, t))
        or is_very_positive_score(get_sentiment_score(c))
    )

def is_very_negative_comment(c, t):
    """Return True when the lexical score or the sentiment score of ``c`` rates very negative.

    ``t`` is the comment-type index forwarded to the lexical scorer.
    """
    return (
        is_very_negative_score(get_lexical_semantic_score(c, t))
        or is_very_negative_score(get_sentiment_score(c))
    )

def is_positive_comment(c, t):
    """Return True when the lexical score or the sentiment score of ``c`` rates positive.

    ``t`` is the comment-type index forwarded to the lexical scorer.
    """
    return (
        is_positive_score(get_lexical_semantic_score(c, t))
        or is_positive_score(get_sentiment_score(c))
    )

def is_negative_comment(c, t):
    """Return True when the lexical score or the sentiment score of ``c`` rates negative.

    ``t`` is the comment-type index forwarded to the lexical scorer.
    """
    return (
        is_negative_score(get_lexical_semantic_score(c, t))
        or is_negative_score(get_sentiment_score(c))
    )

def is_at_all_positive_comment(c, t):
    """Return True when ``c`` rates either very positive or positive."""
    return is_very_positive_comment(c, t) or is_positive_comment(c, t)

def is_at_all_negative_comment(c, t):
    """Return True when ``c`` rates either very negative or negative."""
    return is_very_negative_comment(c, t) or is_negative_comment(c, t)

def is_neutral_comment(c, t):
    """Return True when ``c`` counts as neutral.

    A comment is neutral when it equals (via ``matches``) any entry of
    ``neut_list``, or when its lexical score or its sentiment score is
    labelled neutral. ``t`` is the comment-type index for the lexical scorer.
    """
    if any(matches(term, c) for term in neut_list):
        return True
    return (
        is_neutral_score(get_lexical_semantic_score(c, t))
        or is_neutral_score(get_sentiment_score(c))
    )

def get_lexical_semantic_score(s, t): # s is the comment, t an indicator of type of comment
    """Score ``s`` from the weighted positive and negative term lists.

    Each positive term found as a substring of ``s`` raises the running
    score (capped at 1); each negative term found lowers it (floored at -1).
    Every step is scaled by the term's weight and by ``comment_weighting[t]``.
    """
    score = 0
    for term, weight in zip(pos_list, pos_weight):
        if term in s:
            score = min(1, score + adjustment_value * weight * comment_weighting[t])
    for term, weight in zip(neg_list, neg_weight):
        if term in s:
            score = max(-1, score - adjustment_value * weight * comment_weighting[t])
    return score

def get_sentiment_score(s):
    """Return the 'compound' entry of the analyzer's polarity scores for ``s``."""
    polarity = sii.polarity_scores(s)
    return polarity['compound']

def get_score(s, s_):
    """Return the canonical score for comment ``s`` of type index ``s_``.

    Predicates are tried in order (very negative, negative, very positive,
    positive) and the first that holds decides the score; otherwise the
    neutral score is returned.
    """
    ladder = (
        (is_very_negative_comment, very_negative_score),
        (is_negative_comment, negative_score),
        (is_very_positive_comment, very_positive_score),
        (is_positive_comment, positive_score),
    )
    for predicate, score in ladder:
        if predicate(s, s_):
            return score
    return neutral_score

def is_very_positive_score(s):
    """Return True when ``eval_weighting`` labels score ``s`` as very positive."""
    return matches(eval_weighting(s), very_positive_label)

def is_very_negative_score(s):
    """Return True when ``eval_weighting`` labels score ``s`` as very negative."""
    return matches(eval_weighting(s), very_negative_label)

def is_positive_score(s):
    """Return True when ``eval_weighting`` labels score ``s`` as positive."""
    return matches(eval_weighting(s), positive_label)

def is_negative_score(s):
    """Return True when ``eval_weighting`` labels score ``s`` as negative."""
    return matches(eval_weighting(s), negative_label)

def is_neutral_score(s):
    """Return True when ``eval_weighting`` labels score ``s`` as neutral."""
    return matches(eval_weighting(s), neutral_label)

In [None]:
# Build a file-chooser widget with 'data' as its argument (presumably the starting
# folder) and show it; a later cell reads the picked path from fc.selected.
fc = FileChooser('data')
display(fc)

In [None]:
# Fail early with a clear message when no file was picked in the chooser,
# instead of letting read_excel raise on a None path.
if not fc.selected:
    raise ValueError('No input file selected: pick a file in the chooser above first.')

df = pd.read_excel(fc.selected)

# Column positions of the three comment columns, in comment_type order.
cols_of_interest = [get_col_index(df, header) for header in comment_type]

# A comment column that cannot be found would otherwise surface later as an
# obscure indexing error; name the missing header(s) right here.
missing_headers = [header for header, position in zip(comment_type, cols_of_interest) if position is None]
if missing_headers:
    raise ValueError('Comment column(s) not found in the spreadsheet: ' + ', '.join(missing_headers))

df = clean_data(df, cols_of_interest)

In [None]:
# for training data
# Also track the position of the column whose header contains 'Rating'.
cols_of_interest.append(get_col_index(df, 'Rating'))

In [None]:
# for test data
# Track the positions of the three quality-average columns, in this order.
for quality_header in (
    'Quality of Interactions during Experience AVG',
    'Quality of Preceptor/Preceptor Team AVG',
    'Quality of Site AVG',
):
    cols_of_interest.append(get_col_index(df, quality_header))

In [None]:
# Work on an independent copy of the selected columns: new columns are added to
# df2 further down, and assigning into a bare slice of df is a chained
# assignment whose effect on df is not well defined.
df2 = df.iloc[:, cols_of_interest].copy()

In [None]:
# debug number of cases
# One counter per case number 0-33; calculate_score() increments cases[j] for
# the case j it selects (0 is its default case).
cases = [0] * 34

In [None]:
def calculate_score(c):
    """Combine the three comments of one response row into a single score.

    Args:
        c: one row of the working table (a pandas Series, or any sequence).
           The entries at the positions given by ``comment_type`` are the
           approving, critical and additional comments; entries that are not
           strings are treated as empty comments.

    Returns:
        One of the canonical scores (``very_negative_score`` ...
        ``very_positive_score``) or the result of ``get_score`` on one of the
        comments; ``neutral_score`` when no case applies.

    Side effects:
        Increments ``cases[j]`` for the selected case number ``j`` (0 when no
        case applies).
    """

    # consider, as needed, individually or in combination
    #  lexical/syntactic
    #  NL-based scores
    #  lengths of approving, critical, and additional comments

    a = ''
    x = ''
    d = ''

    a_ = 0
    x_ = 1
    d_ = 2

    # Iterate over the values positionally: integer indexing such as c[i] on a
    # label-indexed Series relies on a deprecated positional fallback.
    for i, field in enumerate(list(c)):
        if not isinstance(field, str):
            continue
        if is_approving_comment(i):
            a = clear_punctuation(field)
            a_ = i
        elif is_critical_comment(i):
            x = clear_punctuation(field)
            x_ = i
        elif is_additional_comment(i):
            d = clear_punctuation(field)
            d_ = i

    # Critical and additional comments taken together. They are joined with a
    # space so the last word of one and the first word of the other are not
    # fused into a single word when counting length.
    xd = x + ' ' + d

    j = -1

    # logic provided in logic/calculate_scores.xlsm

    # Case 15
    if is_overlong_comment(x):
        j = 15
        r = very_negative_score

    # Case 17
    elif is_brief_comment(d) and is_very_negative_comment(d, d_):
        j = 17
        r = get_score(d, d_) # very_negative_score

    # Case 19
    elif is_average_length_comment(a) and (is_overlong_comment(xd) or (is_lengthy_comment(x) and is_lengthy_comment(d))):
        j = 19
        r = very_negative_score

    elif is_average_length_comment(a) and not is_lengthy_comment(x) and not is_lengthy_comment(d):

        # Case 1
        if is_very_positive_comment(a, a_) and not is_at_all_negative_comment(x, x_) and not is_at_all_negative_comment(d, d_):
            j = 1
            r = get_score(a, a_) # very_positive_score

        # Case 2
        elif is_positive_comment(a, a_) and not is_at_all_negative_comment(x, x_) and is_very_positive_comment(d, d_):
            j = 2
            r = get_score(d, d_) # very_positive_score

        # Case 3
        elif is_positive_comment(a, a_) and not is_at_all_negative_comment(x, x_) and not is_at_all_negative_comment(d, d_):
            j = 3
            r = get_score(a, a_) # positive_score

        # Case 4
        elif not is_at_all_positive_comment(a, a_) and is_very_negative_comment(x, x_) and not is_at_all_positive_comment(d, d_):
            j = 4
            r = get_score(x, x_) # very_negative_score

        # Case 5
        elif not is_at_all_positive_comment(a, a_) and not is_at_all_positive_comment(x, x_) and is_very_negative_comment(d, d_):
            j = 5
            r = get_score(d, d_) # very_negative_score

        # Case 6
        elif not is_at_all_positive_comment(a, a_) and is_negative_comment(x, x_) and is_negative_comment(d, d_):
            j = 6
            r = very_negative_score

        # Case 7
        elif is_at_all_positive_comment(a, a_) and is_very_negative_comment(x, x_) and is_very_negative_comment(d, d_):
            j = 7
            r = negative_score

        # Case 8
        elif is_positive_comment(a, a_) and is_at_all_negative_comment(x, x_) and is_at_all_negative_comment(d, d_):
            j = 8
            r = negative_score

        # Case 9
        elif is_at_all_positive_comment(a, a_) and is_negative_comment(x, x_) and is_at_all_positive_comment(d, d_):
            j = 9
            r = positive_score

        # Case 10
        elif is_neutral_comment(a, a_) and is_neutral_comment(x, x_):
            j = 10
            r = get_score(d, d_)

        # Case 11
        elif is_neutral_comment(a, a_) and is_at_all_negative_comment(x, x_) and is_neutral_comment(d, d_):
            j = 11
            r = get_score(x, x_)

        # Case 12
        elif is_very_positive_comment(a, a_) and is_negative_comment(x, x_) and is_neutral_comment(d, d_):
            j = 12
            r = positive_score

        # Case 13
        elif is_positive_comment(a, a_) and is_neutral_comment(x, x_) and is_very_negative_comment(d, d_):
            j = 13
            r = negative_score

        # Case 14
        elif is_neutral_comment(a, a_) and is_negative_comment(x, x_) and is_very_positive_comment(d, d_):
            j = 14
            r = positive_score

        # Case 21
        elif is_at_all_positive_comment(a, a_) and is_neutral_comment(x, x_) and is_at_all_positive_comment(d, d_):
            j = 21
            r = very_positive_score

    elif is_lengthy_comment(x):

        # Case 20
        if is_overlong_comment(a) and not is_lengthy_comment(d):
            j = 20
            r = positive_score

        # Case 22
        elif is_at_all_positive_comment(a, a_) and is_neutral_comment(x, x_) and is_at_all_positive_comment(d, d_):
            j = 22
            r = positive_score

        # Case 23
        elif is_brief_comment(a) and is_lengthy_comment(d):
            j = 23
            r = negative_score

        # Case 30
        elif is_neutral_comment(a, a_) and is_at_all_negative_comment(x, x_) and is_negative_comment(d, d_):
            j = 30
            r = very_negative_score

    # Case 18
    elif is_lengthy_comment(a) and (is_overlong_comment(xd) or (is_lengthy_comment(x) and is_lengthy_comment(d))):
        j = 18
        r = negative_score

    elif is_neutral_comment(a, a_):

        # Case 27
        if is_neutral_comment(x, x_) and is_at_all_positive_comment(d, d_):
            j = 27
            r = get_score(d, d_)

        # Case 28
        elif is_neutral_comment(x, x_) and is_at_all_negative_comment(d, d_):
            j = 28
            r = get_score(d, d_)

        # Case 29
        elif is_at_all_negative_comment(x, x_) and is_neutral_comment(d, d_):
            j = 29
            r = get_score(x, x_)

        # Case 31
        elif is_negative_comment(x, x_) and not is_lengthy_comment(x) and is_negative_comment(d, d_):
            j = 31
            r = get_score(x, x_) # negative_score, or get_score(d, d_)

        # Case 32
        elif is_negative_comment(x, x_) and is_very_negative_comment(d, d_):
            j = 32
            r = get_score(d, d_) # very_negative_score

        # Case 33
        elif is_very_negative_comment(x, x_) and is_at_all_negative_comment(d, d_):
            j = 33
            r = get_score(x, x_) # very_negative_score

    elif is_neutral_comment(x, x_):

        # Case 24
        # (this inner chain must open with `if`; it previously opened with
        # `elif`, which is a syntax error)
        if is_negative_comment(a, a_) and is_negative_comment(d, d_):
            j = 24
            r = get_score(a, a_) # negative_score

        # Case 25
        elif is_at_all_positive_comment(a, a_) and is_brief_comment(x) and is_neutral_comment(d, d_):
            j = 25
            r = get_score(a, a_)

    # Case 26
    elif is_very_positive_comment(a, a_) and is_negative_comment(x, x_) and is_brief_comment(x) and is_at_all_positive_comment(d, d_):
        j = 26
        r = positive_score

    # Case 16
    elif is_overlong_comment(a) and (is_overlong_comment(xd) or (is_lengthy_comment(x) and is_lengthy_comment(d))):
        j = 16
        r = neutral_score

    # No case matched (including an outer branch whose inner chain found nothing).
    if j < 0:
        j = 0
        r = neutral_score # by default

    cases[j] += 1
    return r

In [None]:
# Score every row of df2, then label each score.
weights = [calculate_score(df2.iloc[row]) for row in range(len(df2))]
evals = [eval_weighting(weight) for weight in weights]

In [None]:
# debug number of cases
# Prints the per-case counters space-separated, in case-number order (index 0 first).
print(*cases)

In [None]:
# Ask for the output name and reduce it to the characters clean_filename() allows.
f = clean_filename(input("What is the name of the output file? "))

In [None]:
# for test data
# Append the positions of the three quality-average columns, in this order.
for quality_header in (
    'Quality of Interactions during Experience AVG',
    'Quality of Preceptor/Preceptor Team AVG',
    'Quality of Site AVG',
):
    cols_of_interest.append(get_col_index(df, quality_header))

In [None]:
#testing
# Print the score (as comment type 1) of every second-column entry mentioning 'bedtime'.
for row in range(len(df2)):
    entry = df2.iloc[row, 1]
    if isinstance(entry, str) and 'bedtime' in entry:
        print(get_score(entry, 1))

In [None]:
# for test data
# Row-wise sum over the last three tracked columns (expected to be the three
# quality-average columns appended for test data).
df2['Overall Quality'] = df.iloc[:, cols_of_interest[-3:]].sum(axis=1)
# Where the sum came out as an empty string, mark only the 'Overall Quality'
# cell as missing. Masking the whole frame (df2[mask] = n) would overwrite
# every column of those rows, comments included.
df2.loc[df2['Overall Quality'].eq(''), 'Overall Quality'] = n # may be missing data

In [None]:
# Attach the numeric score and its label to the working table.
df2['Semantic Value'] = weights
df2['Semantic Evaluation'] = evals

# Copy the table (header row first, no index column) cell by cell into a new
# workbook; openpyxl rows and columns are 1-based.
wb = openpyxl.Workbook()
ws = wb.active

for row_number, row_values in enumerate(df2r(df2, index=False), start=1):
    for col_number, cell_value in enumerate(row_values, start=1):
        ws.cell(row=row_number, column=col_number, value=cell_value)

wb.save(f'data/{f}.xlsx')