In [5]:
import re
import csv
import pandas as pd

from afinn import Afinn
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import sentiwordnet as swn
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import spacy
nlp = spacy.load('en_core_web_sm', disable=['ner'])

In [None]:
# add the preprocessing script: https://github.com/stephbuon/democracy-lab/issues/75
# create a copy of these property words and use them: https://docs.google.com/spreadsheets/d/1J1UofUSAeaOM0eWo8_1FbwgFHcris2SGZLH4XXXk21A/edit#gid=2100970163

In [26]:
def list_contains(ls, keywords_list):
    filtered_list = []
    
    for string in ls:
        for keyword in keywords_list:
            if keyword in string:
                filtered_list.append(string)
                
    return filtered_list


def import_data(data, sep, col_name):
    hansard = pd.read_csv(data, sep=sep, usecols=[col_name])
    debate_text = hansard[col_name].tolist()
    return debate_text


def grammatical_collocates(ls, keywords_list, **kwargs):
    return_type = kwargs.get('return_type', None)
    
    if type(keywords_list) != list:
        print('Error: keywords_list must be a list')
        return None
        
    regex = re.compile('|'.join(keywords_list))
    
    collocates = []
    
    for string in ls:
        doc = nlp(string)
        
        for token in doc:
            if regex.match(token.text):
                print(token, regex)
                col = str(token.text) + ' ' + str(token.head.text)
                collocates.append(str(token.text) + ' ' + str(token.head.text))
                
                for child in token.children:
                    collocates.append(str(token.text) + ' ' + str(child))
                    
    if return_type == 'ls':
        return collocates
    if return_type == 'df':
        return pd.DataFrame(collocates, columns =['grammatical_collocates'])
    
    
def afinn_sentiment(text):
    return Afinn().score(text)


def textblob_sentiment(text):
    return TextBlob(text).sentiment.polarity


def vader_sentiment(text):
    return SentimentIntensityAnalyzer().polarity_scores(text)


def sentiment_score(df, col_name):
    df['afinn'] = df[col_name].apply(afinn_sentiment)
    df['textblob'] = df[col_name].apply(textblob_sentiment)
    df['vader'] = df[col_name].apply(vader_sentiment)
    df['vader'] = df['vader'].apply(lambda score_dict: score_dict['compound'])    
    return df

def m(fname):
    data = import_data(fname, ',', 'text')
    
    # import keywords_list
    
    data = list_contains(data, keywords_list)
    
    # for keyword in property_keywords:
    
    collocates = grammatical_collocates(data, keyword, return_type='df')
    
    collocates_and_sentiment = sentiment_score(collocates, 'grammatical_collocates')
    
    collocates_and_sentiemnt.to_csv('collocates_sentiment_scores.csv', index=False)

In [6]:
out = import_data('/users/sbuongiorno/hansard_justnine_w_year.csv', ',', 'text')

In [8]:
test = out[:15]

In [10]:
out = list_contains(test, ['respect', ' it '])

In [31]:
save = grammatical_collocates(out, ['he'], return_type='df')

he re.compile('he')
he re.compile('he')


In [32]:
save

Unnamed: 0,grammatical_collocates
0,he begged
1,he begged


In [33]:
test = sentiment_score(save, 'grammatical_collocates')

In [34]:
test

Unnamed: 0,grammatical_collocates,afinn,textblob,vader
0,he begged,0.0,0.0,0.0
1,he begged,0.0,0.0,0.0
