## Imports

In [1]:
import spacy
import textacy
import pandas as pd
import os
import ruamel.yaml as yaml
import datetime
import logging
import sys

%matplotlib inline
%load_ext autoreload
%autoreload 2

## Change to root directory

In [4]:
NO_CONFIG_ERR_MSG = """No config file found. Root directory is determined by presence of "config.yaml" file."""

# Remember the starting directory so it can be restored if the search fails
original_wd = os.getcwd()

# Maximum number of directory levels to search (the starting directory included)
num_retries = 10
for x in range(0, num_retries):
    try:
        # The directory that holds config.yaml is treated as the project root
        with open("config.yaml", 'r') as stream:
            cfg = yaml.safe_load(stream)
        # Config loaded: stay in this directory and stop searching instead of
        # re-reading the same file on every remaining iteration
        break
    except FileNotFoundError:
        # Not in this directory: move up one level and try again
        os.chdir('../')
else:
    # Every level was tried without success: go back to the starting directory
    # and report the problem. `cfg` is left undefined in this case.
    os.chdir(original_wd)
    print(NO_CONFIG_ERR_MSG)

## Import local code

In [5]:
# Make the current working directory importable so the local `src` package resolves
path = os.getcwd()

if path not in sys.path:
    sys.path.append(path)

from src.convenience_functions.textacy_convenience_functions import (
    load_textacy_corpus,
    entity_statements,
    list_of_entity_statements,
    dask_df_apply,
)
from src.textblob_entity_sentiment import textblob_entity_sentiment

## Create log file

In [6]:
# Minute-resolution timestamp used as the log file name
now = datetime.datetime.now().strftime("%Y-%m-%d %H-%M")

# Create the log directory if it is missing; opening the log file would
# otherwise fail with FileNotFoundError
os.makedirs('logs', exist_ok=True)

logging.basicConfig(filename='logs/{}.txt'.format(now),
                    level=logging.INFO,
                    filemode='w',
                    format='%(asctime)s - %(levelname)s - %(message)s')

## Load Data

In [7]:
# Read the input CSV whose path is given in the config
input_filepath = cfg['input_filepath']
logging.info("""Reading in data from {}""".format(input_filepath))

df = pd.read_csv(input_filepath)

## Dask Multiprocessing of applied textacy docs

Using dask to multiprocess the loading of textacy docs for each text

1. Use dask to create partitioned dataframe

2. To each partition, map an apply that creates textacy docs from the configured text column (`cfg['text_col']`)

3. Concatenate back to original df

In [8]:
# Name of the column that holds the raw text, taken from the config
text_col = cfg['text_col']
logging.info("""Creating textacy Doc objects using the text found in the '{}' column""".format(text_col))

# NOTE(review): the call asks for inplace=True and its result is also assigned
# back to `df`; confirm dask_df_apply returns the frame in that mode
df = dask_df_apply(df, text_col, inplace=True)

ValueError: Metadata inference failed in `lambda`.

Original error is below:
------------------------
FileNotFoundError(2, 'No such file or directory')

Traceback:
---------
  File "C:\Users\afurrier\AppData\Local\Continuum\Anaconda3\lib\site-packages\dask\dataframe\utils.py", line 137, in raise_on_meta_error
    yield
  File "C:\Users\afurrier\AppData\Local\Continuum\Anaconda3\lib\site-packages\dask\dataframe\core.py", line 3327, in _emulate
    return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
  File "C:\Users\afurrier\A.Projects\to-git\entity-sentiments\src\convenience_functions\textacy_convenience_functions.py", line 45, in <lambda>
    lambda df : df[text_col].apply(lambda x : textacy.doc.Doc(x, lang='en'))).compute(get=get)
  File "C:\Users\afurrier\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\core\series.py", line 2551, in apply
    mapped = lib.map_infer(values, f, convert=convert_dtype)
  File "pandas/_libs/src\inference.pyx", line 1521, in pandas._libs.lib.map_infer
  File "C:\Users\afurrier\A.Projects\to-git\entity-sentiments\src\convenience_functions\textacy_convenience_functions.py", line 45, in <lambda>
    lambda df : df[text_col].apply(lambda x : textacy.doc.Doc(x, lang='en'))).compute(get=get)
  File "C:\Users\afurrier\AppData\Local\Continuum\Anaconda3\lib\site-packages\textacy\doc.py", line 114, in __init__
    self._init_from_text(content, metadata, lang)
  File "C:\Users\afurrier\AppData\Local\Continuum\Anaconda3\lib\site-packages\textacy\doc.py", line 132, in _init_from_text
    spacy_lang = cache.load_spacy(lang)
  File "C:\Users\afurrier\AppData\Local\Continuum\Anaconda3\lib\site-packages\cachetools\__init__.py", line 46, in wrapper
    v = func(*args, **kwargs)
  File "C:\Users\afurrier\AppData\Local\Continuum\Anaconda3\lib\site-packages\textacy\cache.py", line 99, in load_spacy
    return spacy.load(name, disable=disable)
  File "C:\Users\afurrier\AppData\Local\Continuum\Anaconda3\lib\site-packages\spacy\__init__.py", line 15, in load
    return util.load_model(name, **overrides)
  File "C:\Users\afurrier\AppData\Local\Continuum\Anaconda3\lib\site-packages\spacy\util.py", line 112, in load_model
    return load_model_from_link(name, **overrides)
  File "C:\Users\afurrier\AppData\Local\Continuum\Anaconda3\lib\site-packages\spacy\util.py", line 126, in load_model_from_link
    cls = import_file(name, path)
  File "C:\Users\afurrier\AppData\Local\Continuum\Anaconda3\lib\site-packages\spacy\compat.py", line 139, in import_file
    spec.loader.exec_module(module)
  File "<frozen importlib._bootstrap_external>", line 674, in exec_module
  File "<frozen importlib._bootstrap_external>", line 780, in get_code
  File "<frozen importlib._bootstrap_external>", line 832, in get_data


## Extracting Entity Text, Counts and Sentiments

#### For each selected entity, return the number of occurrences of the entity as well as the mean, min and max of the sentiments of the sentences that contain it

In [7]:
# Log which descriptive statistics and which entities this run covers
keep_stats = cfg['sentiment_descriptive_stats']
logging.info("""Extracting the following descriptive stats for entity sentiments: {} """.format(keep_stats))

entities = cfg['entities']
logging.info("""Extracting the sentiments for the following entities: {} """.format(entities))

# One result per configured entity, each computed from the 'textacy_doc' column of df
entity_frames = [
    textblob_entity_sentiment(df=df,
                              textacy_col='textacy_doc',
                              entity=entity,
                              inplace=False,
                              keep_stats=keep_stats)
    for entity in entities
]

# Place the per-entity results side by side in a single frame
sentiments = pd.concat(entity_frames, axis=1)

#### Concat sentiment features and original df

In [8]:
# Put the sentiment features next to the original columns, then drop the
# 'textacy_doc' column from the combined frame
combined = pd.concat([df, sentiments], axis=1)
texts_with_sentiment_info = combined.drop(labels=['textacy_doc'], axis=1)

In [9]:
# Show the column labels of the combined frame
texts_with_sentiment_info.columns

Index(['text', 'sentiment_label', 'characters_polarity_count',
       'characters_polarity_mean', 'characters_polarity_min',
       'characters_polarity_25%', 'characters_polarity_50%',
       'characters_polarity_75%', 'characters_polarity_max',
       'plot_polarity_count', 'plot_polarity_mean', 'plot_polarity_min',
       'plot_polarity_25%', 'plot_polarity_50%', 'plot_polarity_75%',
       'plot_polarity_max', 'hero_polarity_count', 'hero_polarity_mean',
       'hero_polarity_min', 'hero_polarity_25%', 'hero_polarity_50%',
       'hero_polarity_75%', 'hero_polarity_max', 'villain_polarity_count',
       'villain_polarity_mean', 'villain_polarity_min', 'villain_polarity_25%',
       'villain_polarity_50%', 'villain_polarity_75%', 'villain_polarity_max'],
      dtype='object')

## Export features

In [10]:
# Minute-resolution timestamp used as the output file name
now = datetime.datetime.now().strftime("%Y-%m-%d %H-%M")
archive_output_path = 'output/{}.csv'.format(now)

# Create the output directory if it is missing; to_csv does not create
# missing parent directories
os.makedirs(os.path.dirname(archive_output_path), exist_ok=True)

# Build the message once so the log entry and the printed line always match
export_msg = """Outputting sentiments to {}""".format(archive_output_path)
logging.info(export_msg)
texts_with_sentiment_info.to_csv(archive_output_path, index=False)
print(export_msg)