In [3]:
import argparse

import re
import os
import sys
import csv 
import pandas as pd

from afinn import Afinn
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import sentiwordnet as swn
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import spacy
nlp = spacy.load('en_core_web_sm', disable=['ner'])



In [53]:
def is_notebook():
    """Report whether the code is executing inside a Jupyter kernel.

    Returns:
        bool: True only when the active IPython shell class is named
        'ZMQInteractiveShell' (Jupyter notebook or qtconsole). Terminal
        IPython, any other shell type, and a plain Python interpreter
        (where ``get_ipython`` is not defined) all yield False.
    """
    try:
        shell_name = get_ipython().__class__.__name__
    except NameError:
        # ``get_ipython`` is only injected into the namespace by IPython.
        return False

    return shell_name == 'ZMQInteractiveShell'


#def list_contains(ls, keywords_list):
#    filtered_list = []
    
#    for string in ls:
#        for keyword in keywords_list:
#            if keyword in string:
#                filtered_list.append(string)
                
#    return filtered_list

def data_contains(dic, keywords_list):
    """Keep only the sentences that mention at least one keyword.

    Args:
        dic: Mapping of key -> list of sentences.
        keywords_list: Iterable of keyword strings. They are joined with '|'
            and compiled as one regular expression, so every keyword is
            interpreted as a regex pattern and may match anywhere inside a
            sentence (substring match, not whole-word match).

    Returns:
        dict: key -> list of matching sentences, in their original order.
        Keys without any matching sentence are omitted.
    """
    filtered_dic = {}

    keywords = list(keywords_list)
    if not keywords:
        # An empty alternation compiles to '' and would match every sentence.
        return filtered_dic

    regex = re.compile('|'.join(keywords))

    for key, sentences in dic.items():
        for sentence in sentences:
            # Missing cells read from a CSV arrive as NaN/None; regex.search
            # would raise TypeError on them, so they are skipped.
            if isinstance(sentence, str) and regex.search(sentence):
                filtered_dic.setdefault(key, []).append(sentence)

    return filtered_dic


def standardize_spelling_df(data, text_col, fpath_replace_list):
    """Lower-case a text column and apply whole-word spelling replacements.

    Args:
        data: pandas DataFrame. It is modified in place and also returned.
        text_col: Name of the string column to normalise.
        fpath_replace_list: Path to a CSV file whose rows are
            ``pattern,replacement``. Each pattern is used as a regular
            expression wrapped in ``\\b`` word boundaries and matched
            case-insensitively. Rows with fewer than two fields (for example
            blank lines) are ignored.

    Returns:
        The same DataFrame with ``text_col`` rewritten.
    """
    # newline='' is the mode the csv module expects for reading.
    with open(fpath_replace_list, 'r', newline='') as f:
        replace_list = [row for row in csv.reader(f) if len(row) >= 2]

    text = data[text_col].str.lower()

    for row in replace_list:
        pattern, replacement = row[0], row[1]
        # The case-insensitive flag is passed at compile time: an inline
        # '(?i)' that is not at the very start of the pattern is rejected by
        # Python >= 3.11.
        word_regex = re.compile(r'\b' + pattern + r'\b', flags=re.IGNORECASE)
        # regex=True is explicit because pandas >= 2.0 defaults to literal
        # (non-regex) replacement.
        text = text.str.replace(word_regex, replacement, regex=True)

    data[text_col] = text

    return data


def import_data(fpath, sep, text_col, year_col, **kwargs):
    """Read a delimited file and group its text entries by year.

    Args:
        fpath: Path of the delimited text file to read.
        sep: Field delimiter passed to ``pandas.read_csv``.
        text_col: Name of the column holding the text.
        year_col: Name of the column holding the year.
        **kwargs:
            preprocess_spelling: When truthy, the text column is lower-cased
                and spelling-standardised via ``standardize_spelling_df``.
            fpath_replace_list: Path of the replacement CSV; required when
                ``preprocess_spelling`` is truthy.

    Returns:
        dict: ``str(year)`` -> list of text entries, in file order.

    Raises:
        ValueError: If spelling preprocessing is requested without
            ``fpath_replace_list``.
    """
    preprocess_spelling = kwargs.get('preprocess_spelling', None)
    fpath_replace_list = kwargs.get('fpath_replace_list', None)

    data = pd.read_csv(fpath, sep=sep)

    # .copy() detaches the two-column selection from the full frame so the
    # in-place column assignment done during preprocessing is unambiguous.
    data = data[[text_col, year_col]].copy()

    # Truthiness rather than ``is not None`` so that an explicit False (or an
    # empty CLI default) really disables preprocessing.
    if preprocess_spelling:
        if not fpath_replace_list:
            raise ValueError('fpath_replace_list is required when preprocess_spelling is set.')
        data = standardize_spelling_df(data, text_col, fpath_replace_list)

    year_text_dict = {}

    # Use the caller-supplied column names; rows are grouped directly under
    # the string form of the year.
    for year, text in zip(data[year_col], data[text_col]):
        year_text_dict.setdefault(str(year), []).append(text)

    return year_text_dict


# def grammatical_collocates(ls, keywords_list, **kwargs):
#     return_type = kwargs.get('return_type', None)
    
#     if type(keywords_list) != list:
#         raise TypeError('keywords_list must be a list.')
        
#     regex = re.compile('|'.join(keywords_list))
    
#     collocates = []
    
#     for string in ls:
#         doc = nlp(string)
        
#         for token in doc:
#             if regex.match(token.text):
#                 col = str(token.text) + ' ' + str(token.head.text)
#                 collocates.append(str(token.text) + ' ' + str(token.head.text))
                
#                 for child in token.children:
#                     collocates.append(str(token.text) + ' ' + str(child))
                    
#     if return_type == 'ls':
#         return collocates
#     if return_type == 'df':
#         return pd.DataFrame(collocates, columns =['grammatical_collocates'])
    


def grammatical_collocates(dic, keywords_list, **kwargs):
    """Extract keyword/head and keyword/child pairs from parsed sentences.

    Every sentence is parsed with the module-level spaCy pipeline ``nlp``.
    For each token whose text matches the keyword regex at its start
    (``re.match``), the function records ``"<token> <head>"`` (skipped when
    the head's text equals the token's text) and ``"<token> <child>"`` for
    every child of the token.

    Args:
        dic: Mapping of year -> list of sentences.
        keywords_list: List of keyword patterns, joined with '|' into one
            regular expression.
        **kwargs: Accepted for backward compatibility (e.g. ``return_type``)
            and ignored; a DataFrame is always returned.

    Returns:
        pandas.DataFrame with columns ``grammatical_collocates`` and ``year``,
        one row per pair. The columns are present even when nothing matched,
        so downstream column lookups do not fail on empty input.

    Raises:
        TypeError: If ``keywords_list`` is not a list.
    """
    if not isinstance(keywords_list, list):
        raise TypeError('keywords_list must be a list.')

    regex = re.compile('|'.join(keywords_list))

    # Collect plain Python lists and build the frame once; concatenating a
    # DataFrame per sentence is quadratic in the number of sentences.
    collocates = []
    years = []

    for year, sentences in dic.items():
        for sentence in sentences:
            for token in nlp(sentence):
                if not regex.match(token.text):
                    continue

                if token.text != token.head.text:
                    collocates.append(str(token.text) + ' ' + str(token.head.text))
                    years.append(year)

                for child in token.children:
                    collocates.append(str(token.text) + ' ' + str(child))
                    years.append(year)

    return pd.DataFrame(
        {'grammatical_collocates': collocates, 'year': years},
        columns=['grammatical_collocates', 'year'],
    )

def afinn_sentiment(text):
    """Return the AFINN sentiment score of ``text``.

    The ``Afinn`` scorer is created on first use and cached on the function
    object: constructing it for every row of a DataFrame ``apply`` repeats
    the same setup work for each call.
    """
    scorer = getattr(afinn_sentiment, '_scorer', None)
    if scorer is None:
        scorer = afinn_sentiment._scorer = Afinn()
    return scorer.score(text)


def textblob_sentiment(text):
    """Return the TextBlob polarity of ``text``."""
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return sentiment.polarity


def vader_sentiment(text):
    """Return VADER's polarity-score mapping for ``text``.

    The ``SentimentIntensityAnalyzer`` is created on first use and cached on
    the function object, instead of being rebuilt for every call.
    """
    analyzer = getattr(vader_sentiment, '_analyzer', None)
    if analyzer is None:
        analyzer = vader_sentiment._analyzer = SentimentIntensityAnalyzer()
    return analyzer.polarity_scores(text)


def sentiment_score(df, col_name):
    """Add ``afinn``, ``textblob`` and ``vader`` score columns to ``df``.

    Args:
        df: DataFrame to annotate; it is modified in place and returned.
        col_name: Name of the column holding the text to score.

    Returns:
        The same DataFrame with the three score columns added. ``vader``
        holds only the ``'compound'`` entry of the VADER result.
    """
    texts = df[col_name]

    df['afinn'] = texts.apply(afinn_sentiment)
    df['textblob'] = texts.apply(textblob_sentiment)
    # Pull the compound score out in the same pass instead of storing the
    # full score dict first and mapping over it afterwards.
    df['vader'] = texts.apply(lambda text: vader_sentiment(text)['compound'])

    return df


class commandline: # need to update this to reflect current config 
    """Namespace for the script's command-line interface."""

    @staticmethod
    def argument_parser(args=None):
        """Parse the command-line options of the collocate-sentiment script.

        Declared as a static method so that it works both as
        ``commandline.argument_parser()`` and on an instance.

        Args:
            args: Optional list of argument strings. When None (the default),
                argparse reads ``sys.argv[1:]``.

        Returns:
            argparse.Namespace with attributes ``data``, ``sep``,
            ``col_name``, ``keywords_list``, ``preprocess_spelling`` and
            ``fpath_replace_list``. The two optional attributes default to
            the empty string.
        """
        parser = argparse.ArgumentParser(description="For sentiments of grammatical collocates.")

        # Required options carry no default: argparse never uses one for them.
        parser.add_argument('--data', help="Name of data.", required=True)
        parser.add_argument('--sep', help="Delimiter.", required=True)
        parser.add_argument('--col_name', help='Name of column from which collocates will be extracted.', required=True)
        parser.add_argument('--keywords_list', help="List of keywords to guide collocate extraction.", required=True)
        parser.add_argument('--preprocess_spelling', help="For standardizing spelling.", required=False, default='')
        parser.add_argument('--fpath_replace_list', help="Name of spelling standardization file.", required=False, default='')

        return parser.parse_args(args)
        


## Interactive Lab:

In [6]:
# Interactive entry point: the pipeline below only runs inside a Jupyter kernel.
notebook = is_notebook()

if notebook:
    # The first column of the keyword CSV is used as the keyword list.
    keywords_list = pd.read_csv('propertywords_cleaned_for_collocates.csv')
    kw_col_name = keywords_list.columns[0]
    keywords_list = keywords_list[kw_col_name].tolist()

    # Author's note: usecols is not used here because the text column is str
    # and the year column is int.
    data = import_data('/users/sbuongiorno/rerun_csv_chunk.csv', ',', text_col='text', year_col='year', preprocess_spelling=True, fpath_replace_list='/users/sbuongiorno/preprocess_propertywords.csv')

    # year -> sentences mentioning at least one keyword
    data = data_contains(data, keywords_list)

    # DataFrame of keyword/head and keyword/child pairs per year
    data = grammatical_collocates(data, keywords_list) # add a cli option for return type 

    data = sentiment_score(data, 'grammatical_collocates')

    export_folder = 'collocates_sentiment'

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(export_folder, exist_ok=True)

    data.to_csv(os.path.join(export_folder, 'collocates_sentiment_scores.csv'), index=False)

## CLI:

In [55]:
# TODO: update this CLI section to match the notebook section -- specifically, import_data now takes text_col and year_col instead of col_name.
    
#     if __name__ == '__main__':
#         try:
#             cli = commandline.argument_parser()
        
#             input_file = cli.data
#             sep = cli.sep # fix this so it autodetects
#             col_name = cli.col_name
#             keywords_list = cli.keywords_list
#             ps = cli.preprocess_spelling
#             f = cli.fpath_replace_list
        
#         except IndexError:
#             exit('Check commandline arguments')

#         export_folder = 'collocates_sentiment'

#         if not os.path.exists(export_folder):
#             os.mkdir(export_folder)
        
#         keywords_list = pd.read_csv(keywords_list)
#         kw_col_name = keywords_list.columns[0]
#         keywords_list = keywords_list[kw_col_name].tolist()
    
#         if ps is not None:
#             data = import_data(input_file, sep, col_name, preprocess_spelling=ps, fpath_replace_list=f)
#         else:
#             data = import_data(input_file, sep, col_name)
    
#             data = list_contains(data, keywords_list)
    
#             data = grammatical_collocates(data, keywords_list, return_type='df') # add a cli option for return type 
    
#             data = sentiment_score(data, 'grammatical_collocates')
    
#             data.to_csv(export_folder + '/' + 'collocates_sentiment_scores.csv', index=False)

## sbatch:

In [65]:
# Hard-coded keyword list for the batch run below; it rebinds any
# `keywords_list` loaded earlier from CSV. The entries are passed to
# data_contains() and grammatical_collocates(), where they are joined with '|'
# into a single regular expression -- so each entry behaves as a pattern/stem
# (e.g. 'rent' also matches inside longer words), not as a whole-word match.
# NOTE(review): the vocabulary appears to be property / land-tenure terms -- confirm.
keywords_list = ['absentee', 'adscription', 'agist', 'allod', 'allotment', 'almoign', 'amortized', 'apanage', 'atar', 'attorn', 
                 'blettro', 'bond-land', 'bookland', 'bordage', 'bordar', 'boscage', 'burgage', 'burgery', 'cablicum', 
                 'cammandery', 'cartbote', 'charter-hold', 'charter-land', 'chattel', 'co-feof', 'co-tenancy', 'co-tenant', 
                 'coedcae', 'commonage', 'commonties', 'commonty', 'croft', 'curiality', 'demesn', 'depopulation', 'dispossess',
                 'domanial', 'domesday', 'dreng', 'eject', 'emin', 'enclosure','escuage', 'esplees', 'estover', 'ethel', 'evict',
                 'feu', 'ffridd', 'fiar', 'fief', 'firebote', 'fogg', 'folkland', 'frank-ferm', 'frank-marriage', 'frankalmoign',
                 'franklin', 'frith', 'gavelkind', 'gaveller', 'grasanese', 'gwely', 'haybote', 'herbage', 'holdership', 
                 'homager', 'householdership', 'hypothec', 'inclosure', 'intercommoning', 'joint-tenancy', 'joint-tenant', 
                 'knight-service', 'laen', 'laetic', 'land', 'lease', 'lessee', 'lifehold', 'liferent', 'livier', 'lotment', 
                 'mail-payer', 'majorat', 'manorialize', 'manurance', 'mese', 'mesn', 'metayage', 'metayer', 'mivvy', 'occupance',
                 'occupancy', 'outland', 'pannage', 'parage', 'patrony', 'pendicle', 'perpetual', 'piscary', 'ploughbote', 
                 'poffle', 'pollam', 'pre-emptive', 'property', 'rack-rent', 'radknight', 'radman', 'rent', 'rere-fief', 'roture',
                 'roturier', 'rundale', 'runrig', 'ryoti', 'ryotwar', 'scattald', 'seisin', 'severalty', 'socage', 'sokeman', 
                 'solidate', 'sorning', 'sple', 'squat', 'steelbow', 'sub-fief', 'subaltern', 'subfeu', 'sublessee', 'subsman', 
                 'subtenancy', 'subtenant', 'subvassal', 'suit-hold', 'swinamote', 'tanistic', 'tanistry', 'tariot', 'tenancy',
                 'tenant', 'tenement', 'tenurial', 'termon', 'termor', 'terre-tenant', 'thanage', 'thaneland', 'three-life',
                 'thring', 'turbary', 'udal', 'under-tenancy', 'under-tenant', 'underlessee', 'undertenant', 'undervassal',
                 'unfeued', 'unleased', 'unlet', 'urbarial', 'vassal', 'venville', 'vidame', 'villan', 'villar', 'villein', 
                 'woodmote', 'zemindar']

In [None]:
import pickle

# Batch entry point: the input CSV path is the first command-line argument and
# the scored collocates are pickled next to it as '<input>.pickle'.
if len(sys.argv) < 2:
    sys.exit('Usage: collocates_sentiment.py <input_csv>')

arg1 = sys.argv[1]

data = import_data(arg1, ',', text_col='text', year_col='year', preprocess_spelling=True, fpath_replace_list='/users/sbuongiorno/preprocess_propertywords.csv')

data = data_contains(data, keywords_list)

data = grammatical_collocates(data, keywords_list, return_type='df') # add a cli option for return type 

data = sentiment_score(data, 'grammatical_collocates')

if not data.empty:
    # The context manager guarantees the pickle file is flushed and closed.
    with open(arg1 + '.pickle', 'wb') as handle:
        pickle.dump(data, handle)
else:
    # Nothing to save; sys.exit() is used because the bare exit() helper is
    # only installed by the site module.
    sys.exit()

In [60]:
!jupyter nbconvert --to script collocates_sentiment.ipynb

[NbConvertApp] Converting notebook collocates_sentiment.ipynb to script
[NbConvertApp] Writing 11853 bytes to collocates_sentiment.py
