In [90]:
import argparse

import re
import os
import csv 
import pandas as pd

from afinn import Afinn
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import sentiwordnet as swn
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import spacy
nlp = spacy.load('en_core_web_sm', disable=['ner'])

In [102]:
def list_contains(ls, keywords_list):
    """Return the entries of ``ls`` that contain at least one keyword.

    Containment is the plain, case-sensitive ``keyword in string`` test
    (substring match, not a regular expression).

    Args:
        ls: Iterable of strings to filter.
        keywords_list: Iterable of keyword substrings to look for.

    Returns:
        list: The matching entries in their original order. Each entry of
        ``ls`` is kept at most once, no matter how many keywords it contains.
    """
    # Materialize once so a one-shot iterable of keywords can be re-scanned
    # for every string.
    keywords = list(keywords_list)

    # any() stops at the first hit, so a string that matches several keywords
    # is appended once rather than once per matching keyword.
    return [
        string for string in ls
        if any(keyword in string for keyword in keywords)
    ]


def standardize_spelling_df(data, col_name, fpath_replace_list):
    """Standardize spellings in a text column using a CSV replacement table.

    Each usable row of the CSV is read as ``variant, replacement`` (any extra
    columns are ignored). Every whole-word, case-insensitive occurrence of
    ``variant`` in ``data[col_name]`` is replaced by ``replacement``.
    ``variant`` is inserted into a regular expression as-is, so regex
    metacharacters in the file keep their regex meaning.

    Args:
        data: DataFrame holding the text column; the column is overwritten
            in place.
        col_name: Name of the string column to rewrite.
        fpath_replace_list: Path to the CSV replacement table.

    Returns:
        The same ``data`` object, with ``col_name`` rewritten.
    """
    # newline='' is the documented way to open files for the csv module.
    with open(fpath_replace_list, 'r', newline='') as f:
        # Blank or one-column rows cannot describe a replacement; skip them
        # instead of failing with IndexError.
        replace_list = [row for row in csv.reader(f) if len(row) >= 2]

    for replace in replace_list:
        # Pass IGNORECASE as a compile flag: an inline "(?i)" placed after
        # "\b" is rejected by Python 3.11+ (global flags must lead the pattern).
        pattern = re.compile(r'\b' + replace[0] + r'\b', flags=re.IGNORECASE)
        # regex=True is explicit because pandas 2.x defaults str.replace to
        # literal matching.
        data[col_name] = data[col_name].str.replace(pattern, replace[1], regex=True)

    return data


def import_data(data, sep, col_name, **kwargs):
    """Load one text column from a delimited file as a Python list.

    Args:
        data: Path (or file-like object) passed to ``pandas.read_csv``.
        sep: Field delimiter.
        col_name: Name of the column to load; only this column is read.
        **kwargs:
            preprocess_spelling: When truthy, the column is run through
                ``standardize_spelling_df`` before being returned. ``None``,
                ``False`` and ``''`` all mean "do not preprocess".
            fpath_replace_list: Path to the spelling replacement CSV; required
                when ``preprocess_spelling`` is truthy.

    Returns:
        list: The values of ``col_name`` in file order.

    Raises:
        ValueError: If spelling preprocessing is requested without a
            replacement file path.
    """
    preprocess_spelling = kwargs.get('preprocess_spelling', None)
    fpath_replace_list = kwargs.get('fpath_replace_list', None)

    # Truthiness rather than "is not None": the command line supplies '' when
    # the option is omitted, and that must not enable preprocessing.
    if preprocess_spelling and not fpath_replace_list:
        # Fail before the (possibly large) file is read.
        raise ValueError('fpath_replace_list is required when preprocess_spelling is set.')

    hansard = pd.read_csv(data, sep=sep, usecols=[col_name])

    if preprocess_spelling:
        hansard = standardize_spelling_df(hansard, col_name, fpath_replace_list)

    return hansard[col_name].tolist()


def grammatical_collocates(ls, keywords_list, **kwargs):
    """Extract dependency-based collocates of keyword tokens.

    Every string in ``ls`` is parsed with the module-level ``nlp`` pipeline.
    For each token whose text matches a keyword, one ``"<token> <head>"``
    pair is recorded, followed by one ``"<token> <child>"`` pair per
    syntactic child of the token.

    Keywords are joined with ``|`` into one regular expression and tested with
    ``re.match``, so each keyword is a regex anchored at the start of the token
    text (``'he'`` also matches ``'her'``).

    Args:
        ls: Iterable of strings to parse.
        keywords_list: List of keyword patterns. An empty list yields no
            collocates.
        **kwargs:
            return_type: ``'ls'`` for a list of strings (also used when the
                option is omitted) or ``'df'`` for a one-column DataFrame
                named ``grammatical_collocates``.

    Returns:
        list or pandas.DataFrame, depending on ``return_type``.

    Raises:
        TypeError: If ``keywords_list`` is not a list.
        ValueError: If ``return_type`` is neither ``'ls'`` nor ``'df'``.
    """
    return_type = kwargs.get('return_type', None)

    if not isinstance(keywords_list, list):
        raise TypeError('keywords_list must be a list.')

    # Validate before parsing: an unsupported value used to run the whole
    # pipeline and then silently return None.
    if return_type is None:
        return_type = 'ls'
    if return_type not in ('ls', 'df'):
        raise ValueError("return_type must be 'ls' or 'df'.")

    collocates = []

    # An empty alternation compiles to '', which matches every token; treat an
    # empty keyword list as "nothing to look for" instead.
    if keywords_list:
        regex = re.compile('|'.join(keywords_list))

        for string in ls:
            doc = nlp(string)

            for token in doc:
                if regex.match(token.text):
                    collocates.append(str(token.text) + ' ' + str(token.head.text))

                    for child in token.children:
                        collocates.append(str(token.text) + ' ' + str(child))

    if return_type == 'df':
        return pd.DataFrame(collocates, columns=['grammatical_collocates'])
    return collocates
    
    
def afinn_sentiment(text):
    """Return the AFINN sentiment score of ``text``.

    The ``Afinn`` scorer is built on the first call and reused afterwards.
    This function is applied once per row by ``sentiment_score``, so building
    a new scorer for every call repeats the same setup work.

    Args:
        text: The string to score.

    Returns:
        The value of ``Afinn().score(text)``.
    """
    # Cache the scorer on the function object so no extra module-level state
    # is needed.
    scorer = getattr(afinn_sentiment, '_scorer', None)
    if scorer is None:
        scorer = Afinn()
        afinn_sentiment._scorer = scorer
    return scorer.score(text)


def textblob_sentiment(text):
    """Return the TextBlob polarity of ``text``.

    Args:
        text: The string to score.

    Returns:
        The ``sentiment.polarity`` attribute of ``TextBlob(text)``.
    """
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return sentiment.polarity


def vader_sentiment(text):
    """Return the VADER polarity scores of ``text``.

    The ``SentimentIntensityAnalyzer`` is built on the first call and reused
    afterwards. This function is applied once per row by ``sentiment_score``,
    so building a new analyzer for every call repeats the same setup work.

    Args:
        text: The string to score.

    Returns:
        The result of ``polarity_scores(text)``; ``sentiment_score`` reads its
        ``'compound'`` entry.
    """
    # Cache the analyzer on the function object so no extra module-level state
    # is needed.
    analyzer = getattr(vader_sentiment, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        vader_sentiment._analyzer = analyzer
    return analyzer.polarity_scores(text)


def sentiment_score(df, col_name):
    """Add ``afinn``, ``textblob`` and ``vader`` score columns to ``df``.

    Each column is produced by applying the matching ``*_sentiment`` function
    to ``df[col_name]``. The ``vader`` column is first filled with the score
    dictionaries and then reduced to their ``'compound'`` entry.

    Args:
        df: DataFrame to score; it is modified in place.
        col_name: Name of the column holding the text to score.

    Returns:
        The same ``df`` object with the three columns added.
    """
    def _compound(score_dict):
        return score_dict['compound']

    afinn_scores = df[col_name].apply(afinn_sentiment)
    df['afinn'] = afinn_scores

    textblob_scores = df[col_name].apply(textblob_sentiment)
    df['textblob'] = textblob_scores

    vader_dicts = df[col_name].apply(vader_sentiment)
    df['vader'] = vader_dicts
    df['vader'] = df['vader'].apply(_compound)

    return df


class commandline:
    """Namespace for the script's command-line interface."""

    @staticmethod
    def argument_parser(args=None):
        """Parse the command-line options of the collocate-sentiment script.

        Args:
            args: Optional list of argument strings. When ``None`` (the
                default) argparse reads ``sys.argv[1:]``.

        Returns:
            argparse.Namespace with ``data``, ``sep``, ``col_name``,
            ``keywords_list``, ``preprocess_spelling`` and
            ``fpath_replace_list``. The last two are ``None`` when omitted.
        """
        parser = argparse.ArgumentParser(description="For sentiments of grammatical collocates.")

        # Required options take no default: argparse never falls back to it.
        parser.add_argument('--data', help="Name of data.", required=True)
        parser.add_argument('--sep', help="Delimiter.", required=True)
        parser.add_argument('--col_name', help='Name of column from which collocates will be extracted.', required=True)
        parser.add_argument('--keywords_list', help="List of keywords to guide collocate extraction.", required=True)
        # Optional options default to None so the caller's "is not None" check
        # can tell "omitted" from "given"; the old '' default always looked
        # like a supplied value.
        parser.add_argument('--preprocess_spelling', help="For standardizing spelling.", required=False, default=None)
        parser.add_argument('--fpath_replace_list', help="Name of spelling standardization file.", required=False, default=None)

        return parser.parse_args(args)
        


In [122]:
if __name__ == '__main__':
    # Pipeline: parse options -> load text column -> keep rows mentioning a
    # keyword -> extract grammatical collocates -> score sentiment -> write CSV.

    # argparse prints its own usage error and exits on bad or missing
    # arguments, so no try/except is needed here (it never raises IndexError).
    cli = commandline.argument_parser()

    input_file = cli.data
    sep = cli.sep  # TODO: autodetect the delimiter
    col_name = cli.col_name
    keywords_path = cli.keywords_list
    ps = cli.preprocess_spelling
    f = cli.fpath_replace_list

    # Truthiness, not "is not None": an omitted option may arrive as ''.
    if ps and not f:
        raise SystemExit('--fpath_replace_list is required when --preprocess_spelling is given.')

    export_folder = 'collocates_sentiment'
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(export_folder, exist_ok=True)

    # The keywords file is a CSV; keywords are taken from its first column.
    keywords_frame = pd.read_csv(keywords_path)
    kw_col_name = keywords_frame.columns[0]
    keywords_list = keywords_frame[kw_col_name].tolist()

    if ps:
        data = import_data(input_file, sep, col_name, preprocess_spelling=ps, fpath_replace_list=f)
    else:
        data = import_data(input_file, sep, col_name)

    data = list_contains(data, keywords_list)

    # TODO: add a CLI option for the return type.
    data = grammatical_collocates(data, keywords_list, return_type='df')

    data = sentiment_score(data, 'grammatical_collocates')

    data.to_csv(os.path.join(export_folder, 'collocates_sentiment_scores.csv'), index=False)

NameError: name 'commandline' is not defined

In [128]:
!jupyter nbconvert --to script Untitled2.ipynb

[NbConvertApp] Converting notebook Untitled2.ipynb to script
[NbConvertApp] Writing 5557 bytes to Untitled2.py


In [103]:
#out = import_data('/users/sbuongiorno/hansard_justnine_w_year.csv', ',', 'text', preprocess_spelling=True, fpath_replace_list='/users/sbuongiorno/preprocess_propertywords.csv')

In [12]:
#out = list_contains(test, ['respect', ' it '])

In [20]:
#save = grammatical_collocates(out, 'he', return_type='df')

TypeError: keywords_list must be a list.

In [33]:
#test = sentiment_score(save, 'grammatical_collocates')