In [None]:
import nltk
import pandas
import string
import xlwt

from nltk.sentiment import SentimentIntensityAnalyzer as sia

In [None]:
# NLTK corpora / models fetched before any analysis cell runs.
nltk_resources = ['names', 'stopwords', 'state_union', 'averaged_perceptron_tagger', 'vader_lexicon']
nltk.download(nltk_resources)

# Characters that clear_punctuation() strips from free-text answers.
# Written as a raw string so the backslash is a literal character: in a
# normal string literal `\,` is an invalid escape sequence, which Python
# reports with a warning. The resulting value is unchanged.
punct = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
# English stop-word list from the NLTK 'stopwords' corpus.
# NOTE(review): within the visible cells this list is only referenced by a
# commented-out line in the scoring loop — confirm whether it is still needed.
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
# Thresholds applied to the averaged sentiment score of a response.
medium_cut = .05  # seems to be standard in literature for VADER compound score
high_cut = .2  # guess at reasonable value
# Step added/subtracted per dictionary hit; three net hits (3 * .067 = .201)
# push a single answer's score just past high_cut.
adjustment_value = .067


def eval_weighting(w):
    """Translate a numeric sentiment score into a five-level text label.

    Parameters
    ----------
    w : float
        Sentiment score to classify.

    Returns
    -------
    str
        'very negative' when w < -high_cut, 'negative' when w < -medium_cut,
        'very positive' when w > high_cut, 'positive' when w > medium_cut,
        otherwise 'neutral'. All comparisons are strict, so a score sitting
        exactly on a cut-off falls into the milder category.
    """
    # Negative side first, strongest category before the weaker one.
    if w < -high_cut:
        return 'very negative'
    if w < -medium_cut:
        return 'negative'
    # Positive side, again strongest first.
    if w > high_cut:
        return 'very positive'
    if w > medium_cut:
        return 'positive'
    return 'neutral'

In [None]:
def clear_punctuation(s):
    """Return ``s`` with every character found in ``punct`` removed.

    Characters that are not listed in the module-level ``punct`` string are
    kept in their original order.
    """
    return ''.join(ch for ch in s if ch not in punct)

In [None]:
def clean_data(df, cols):
    """Normalise the free-text cells of the given columns, in place.

    For every row of each column position listed in ``cols``:

    * string cells are lower-cased and passed through ``clear_punctuation``;
      if the result is exactly ``'na'`` the cell becomes an empty string;
    * non-string cells (for example missing values) become an empty string.

    Columns not listed in ``cols`` are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is modified and also returned.
    cols : iterable of int
        Positional indices of the columns to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    def _normalise(value):
        # Anything that is not text carries no comment to analyse.
        if not isinstance(value, str):
            return ''
        cleaned = clear_punctuation(value.lower())
        return '' if cleaned == 'na' else cleaned

    for col in cols:
        # Walk down the rows of this column. The column position must be the
        # second iloc axis; using it as a row index would clean whole rows
        # (including non-text columns) instead of the requested columns.
        label = df.columns[col]
        # Replace the column as a whole so a column that held only missing
        # values can become a text column without per-cell dtype conflicts.
        df[label] = [_normalise(value) for value in df.iloc[:, col]]
    return df

In [None]:
def get_col_index(df, s):
    """Return the position of the first column whose label contains ``s``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are searched, left to right.
    s : str
        Substring to look for inside the column labels.

    Returns
    -------
    int or None
        Zero-based position of the first matching column, or ``None`` when no
        label contains ``s``.
    """
    for position, label in enumerate(df.columns):
        # Non-string labels (e.g. integer headers) cannot contain a substring;
        # skip them instead of raising TypeError on the `in` test.
        if isinstance(label, str) and s in label:
            # Return the enumerated position rather than looking the label up
            # again, so duplicated labels still yield a single integer.
            return position
    return None

In [None]:
# Term dictionaries: each file is read without a header row and the first
# column of every line is taken as one term.
pos_words = pandas.read_table('dictionaries/green terminology.txt', header=None)
neg_words = pandas.read_table('dictionaries/red terminology.txt', header=None)

# Flatten the first column of each table into a plain Python list.
pos_list = [pos_words.iloc[row, 0] for row in range(len(pos_words))]
neg_list = [neg_words.iloc[row, 0] for row in range(len(neg_words))]

In [None]:
# Load the survey export.
df = pandas.read_excel('data/21-22 Student evaluation of site (Qualitative and Quantitative).xlsx')

# Header fragments identifying the free-text columns to analyse; each is
# resolved to a column position by substring match on the header.
column_markers = (
    'strengths of this experience',
    'better learning experience',
    'Nothing further to add',
)
cols_of_interest = [get_col_index(df, marker) for marker in column_markers]

# Normalise the text in those columns before scoring.
df = clean_data(df, cols_of_interest)

In [None]:
# Score every response row: each column of interest gets a score, the scores
# are averaged over the number of columns, and the average is mapped to a
# label by eval_weighting().
sii = sia()
weights = []

# Dictionary hits are accumulated in floating-point steps of adjustment_value,
# so hits that cancel out may leave a tiny residue instead of an exact 0.
# Treat anything this close to zero as "no net dictionary signal".
zero_tolerance = 1e-9

n_cols = len(cols_of_interest)

for i in range(0, len(df)):

    row_score = 0

    for j in cols_of_interest:

        cell = df.iloc[i, j]
        if not isinstance(cell, str):
            # Non-text cells add nothing, but the column still counts in the
            # average below.
            continue

        test_str = clear_punctuation(cell)
        col_score = 0

        # Each positive term found (substring match) raises the score, capped at 1.
        for p in pos_list:
            if p in test_str:
                col_score = min(1, col_score + adjustment_value)

        # Each negative term found lowers the score, floored at -1.
        for n in neg_list:
            if n in test_str:
                col_score = max(-1, col_score - adjustment_value)

        # No net dictionary signal: fall back to the VADER compound score.
        if abs(col_score) < zero_tolerance:
            col_score = sii.polarity_scores(test_str)['compound']

        row_score = row_score + col_score

    weights.append(eval_weighting(row_score / n_cols))

In [None]:
# Attach the per-row sentiment label and write the annotated table out.
# NOTE(review): writing the legacy '.xls' format relies on the xlwt engine,
# which recent pandas releases no longer support — confirm the pandas version
# in use before upgrading.
output_path = 'data/tmp.xls'
df['Semantic value'] = weights
df.to_excel(output_path)