## Imports

In [1]:
import spacy
import textacy
import pandas as pd
import os
import ruamel.yaml as yaml
import datetime
import logging
import sys

%matplotlib inline
%load_ext autoreload
%autoreload 2

## Change to root directory

In [2]:
NO_CONFIG_ERR_MSG = """No config file found. Root directory is determined by presence of "config.yaml" file."""        

original_wd = os.getcwd()

# Number of times to move back in directory
num_retries = 10
for x in range(0, num_retries):
    # try to load config file    
    try:
        with open("config.yaml", 'r') as stream:
            cfg = yaml.safe_load(stream)
    # If not found move back one directory level
    except FileNotFoundError:
        os.chdir('../')
        # If reached the max number of directory levels change to original wd and print error msg
        if x+1 == num_retries:
            os.chdir(original_wd)
            print(NO_CONFIG_ERR_MSG)            

## Import local code

In [3]:
# ## Add current wd to path for localimports
path = os.getcwd()

if path not in sys.path:
    sys.path.append(path) 

from src.convenience_functions.textacy_convenience_functions import load_textacy_corpus
from src.convenience_functions.textacy_convenience_functions import entity_statements
from src.convenience_functions.textacy_convenience_functions import list_of_entity_statements
from src.convenience_functions.textacy_convenience_functions import dask_df_apply
from src.textblob_entity_sentiment import textblob_entity_sentiment

## Create log file

In [4]:
now = datetime.datetime.now().strftime("%Y-%m-%d %H-%M")
logging.basicConfig(filename='logs/{}.txt'.format(now), 
                    level=logging.INFO,
                    filemode='w',
                    format='%(asctime)s - %(levelname)s - %(message)s')

## Load Data

In [5]:
logging.info("""Reading in data from {}""".format(cfg['input_filepath']))


df = pd.read_csv(cfg['input_filepath'])

## Dask Multiprocessing of applied textacy docs

Using dask to multiprocess the loading of textacy docs for each text

1. Use dask to create partitioned dataframe

2. To each partition map an apply that creates textacy docs from the Policy_Text column

3. Concatenate back to original df

In [6]:
logging.info("""Creating textacy Doc objects using the text found in the '{}' column""".format(cfg['text_col']))

df = dask_df_apply(df, cfg['text_col'], inplace=True)

In [8]:
def textblob_entity_sentiment(df,
                              textacy_col,
                              entity,
                              inplace=True,
                              subjectivity=False,
                              keep_stats=['count', 'mean', 'min', '25%', '50%', '75%', 'max']):
    """
    Pull the descriptive sentiment stats of text sentence with a specified entity in it.

    Parameters
    ----------
    df : DataFrame
        Dataframe which holds the text
    textacy_col : str
        The name to give to the column with the textacy doc objects
    entity : str
        The entity to search the textacy Doc object for
    inplace : bool
        Whether to return the entire df with the sentiment info or the sentiment info alone
        Default is False
    subjectivity : bool
        Whether to include the subjectivity of the sentiment. Defaults to False.
    keep_stats : list
        A list of the summary statistics to keep. Default is all returned by pandas DataFrame.describe() method

    Returns
    -------
    DataFrame
        Either the dataframe passed as arg with the sentiment info as trailing columns
        or the sentiment descriptive stats by itself
    """
    sentiment_rows = []
    for text in df[textacy_col].values:
        text_entities = list(entity_statements(text, entity))

         # Iterate through all sentences and get sentiment analysis
        entity_sentiment_info = [textblob.TextBlob(sentence).sentiment_assessments
                                for
                                sentence
                                in
                                text_entities]

        # After taking sentiments, turn into a dataframe and describe
        try:
            # Indices and columns to keep
            #keep_stats = ['count', 'mean', 'min', '25%', '50%', '75%', 'max']
            keep_cols = ['polarity']

            # If subjectivity is set to true, values for it will also be captured
            if subjectivity:
                keep_cols.append('subjectivity')

            # Describe those columns
            summary_stats = pd.DataFrame(entity_sentiment_info).describe().loc[keep_stats, keep_cols]

            # Add row to list
            sentiment_rows.append(pivot_df_to_row(summary_stats))

        # If there's nothing to describe
        except ValueError as e:
            # Create a summary stats with nulls
            summary_stats = pd.DataFrame(index=keep_stats, columns=keep_cols)

            # Add to list of rows
            sentiment_rows.append(pivot_df_to_row(summary_stats))
    # Concatenate All rows together into one dataframe
    sentiment_df = pd.concat(sentiment_rows).add_prefix(entity+'_')

    if not inplace:
        return sentiment_df.reset_index(drop=True)
    else:
        # Return original df with new sentiment attached
        return pd.concat([df, sentiment_df], axis=1)

In [35]:
vaderSentiment.__version__

NameError: name 'vaderSentiment' is not defined

In [16]:
def vader_entity_sentiment(df,
                              textacy_col,
                              entity,
                              inplace=True,
                              vader_sent_types=['neg', 'neu', 'pos', 'compound'],
                              keep_stats=['count', 'mean', 'min', '25%', '50%', '75%', 'max']):
    """
    Pull the descriptive sentiment stats of text sentence with a specified entity in it.

    Parameters
    ----------
    df : DataFrame
        Dataframe which holds the text
    textacy_col : str
        The name to give to the column with the textacy doc objects
    entity : str
        The entity to search the textacy Doc object for
    inplace : bool
        Whether to return the entire df with the sentiment info or the sentiment info alone
        Default is False
    vader_sent_types : list
        The type of sentiment to extract. neg: negative, pos: positive, neu: neutral, compound is 
        comination of all three types of all 
    keep_stats : list
        A list of the summary statistics to keep. Default is all returned by pandas DataFrame.describe() method

    Returns
    -------
    DataFrame
        Either the dataframe passed as arg with the sentiment info as trailing columns
        or the sentiment descriptive stats by itself
    """
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

    vader_analyzer = SentimentIntensityAnalyzer()

    sentiment_rows = []
    for text in df[textacy_col].values:
        text_entities = list(entity_statements(text, entity)) 


         # Iterate through all sentences and get sentiment analysis
        entity_sentiment_info = [vader_analyzer.polarity_scores(sentence)
                                for
                                sentence
                                in
                                text_entities]

        # After taking sentiments, turn into a dataframe and describe
        try:
            # Indices and columns to keep
            keep_stats = keep_stats
            keep_cols = sentiment_types

            # Describe those columns
            summary_stats = pd.DataFrame(entity_sentiment_info).describe().loc[keep_stats, keep_cols]

            # Add row to list
            sentiment_rows.append(pivot_df_to_row(summary_stats))

        # If there's nothing to describe
        except ValueError as e:
            # Create a summary stats with nulls
            summary_stats = pd.DataFrame(index=keep_stats, columns=keep_cols)

            # Add to list of rows
            sentiment_rows.append(pivot_df_to_row(summary_stats))
    # Concatenate All rows together into one dataframe
    sentiment_df = pd.concat(sentiment_rows).add_prefix(entity+'_')

    if not inplace:
        return sentiment_df.reset_index(drop=True)
    else:
        # Return original df with new sentiment attached
        return pd.concat([df, sentiment_df], axis=1)
    
    
    
    
    

# from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# vader_analyzer = SentimentIntensityAnalyzer()

# sentiment_rows = []
# for text in df[textacy_col].values:
#     text_entities = list(entity_statements(text, entity)) 
    

#      # Iterate through all sentences and get sentiment analysis
#     entity_sentiment_info = [vader_analyzer.polarity_scores(sentence)
#                             for
#                             sentence
#                             in
#                             text_entities]

#     # After taking sentiments, turn into a dataframe and describe
#     try:
#         # Indices and columns to keep
#         keep_stats = ['count', 'mean', 'min', 'max']
#         keep_cols = ['neg', 'neu', 'pos', 'compound'']

#         # Describe those columns
#         summary_stats = pd.DataFrame(entity_sentiment_info).describe().loc[keep_stats, keep_cols]

#         # Add row to list
#         sentiment_rows.append(pivot_df_to_row(summary_stats))

#     # If there's nothing to describe
#     except ValueError as e:
#         # Create a summary stats with nulls
#         summary_stats = pd.DataFrame(index=keep_stats, columns=keep_cols)

#         # Add to list of rows
#         sentiment_rows.append(pivot_df_to_row(summary_stats))
# # Concatenate All rows together into one dataframe
# sentiment_df = pd.concat(sentiment_rows).add_prefix(entity+'_')

# if not inplace:
#     return sentiment_df.reset_index(drop=True)
# else:
#     # Return original df with new sentiment attached
#     return pd.concat([df, sentiment_df], axis=1)

In [13]:
df.text.values[0]

'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  '

In [38]:
import vaderSentiment
vaderSentiment.__version__

AttributeError: module 'vaderSentiment' has no attribute '__version__'

In [15]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

vader_analyzer = SentimentIntensityAnalyzer()

for sentence in df.text.values[:5]:
    vs = vader_analyzer.polarity_scores(sentence)
    print("{:-<65} {}".format(sentence, str(vs)))

A very, very, very slow-moving, aimless movie about a distressed, drifting young man.   {'neg': 0.219, 'neu': 0.781, 'pos': 0.0, 'compound': -0.4215}
Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.   {'neg': 0.222, 'neu': 0.778, 'pos': 0.0, 'compound': -0.5507}
Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.   {'neg': 0.25, 'neu': 0.667, 'pos': 0.083, 'compound': -0.7178}
Very little music or anything to speak of.  --------------------- {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.   {'neg': 0.0, 'neu': 0.819, 'pos': 0.181, 'compound': 0.6369}


## Extracting Entity Text, Counts and Sentiments

#### For each entity selected, return the count of entity occurence as well as mean, min and max of sentiments of sentences that contain said entity

In [29]:
from src.convenience_functions.pandas_functions import pivot_df_to_row
from src.convenience_functions.pandas_functions import null_column_report
sents = vader_entity_sentiment(df=df,
                       textacy_col='textacy_doc', 
                       entity='characters', 
                       inplace=False,
                       sentiment_types = ['neg', 'neu', 'pos', 'compound'],
                       keep_stats=cfg['sentiment_descriptive_stats'])

In [30]:
null_column_report(sents)

Column:
characters_neg_count
Total Nulls:
717
Percent Null:
0.96

Column:
characters_neg_mean
Total Nulls:
717
Percent Null:
0.96

Column:
characters_neg_min
Total Nulls:
717
Percent Null:
0.96

Column:
characters_neg_25%
Total Nulls:
717
Percent Null:
0.96

Column:
characters_neg_50%
Total Nulls:
717
Percent Null:
0.96

Column:
characters_neg_75%
Total Nulls:
717
Percent Null:
0.96

Column:
characters_neg_max
Total Nulls:
717
Percent Null:
0.96

Column:
characters_neu_count
Total Nulls:
717
Percent Null:
0.96

Column:
characters_neu_mean
Total Nulls:
717
Percent Null:
0.96

Column:
characters_neu_min
Total Nulls:
717
Percent Null:
0.96

Column:
characters_neu_25%
Total Nulls:
717
Percent Null:
0.96

Column:
characters_neu_50%
Total Nulls:
717
Percent Null:
0.96

Column:
characters_neu_75%
Total Nulls:
717
Percent Null:
0.96

Column:
characters_neu_max
Total Nulls:
717
Percent Null:
0.96

Column:
characters_pos_count
Total Nulls:
717
Percent Null:
0.96

Column:
characters_pos_mean
Tota

In [31]:
logging.info("""Extracting the following descriptive stats for entity sentiments: {} """.format(cfg['sentiment_descriptive_stats']))

logging.info("""Extracting the sentiments for the following entities: {} """.format(cfg['entities']))

sentiments = [vader_entity_sentiment(df=df, 
                                        textacy_col='textacy_doc', 
                                        entity=entity, 
                                        inplace=False,
                                        sentiment_types = ['neg', 'neu', 'pos', 'compound'],
                                        keep_stats=cfg['sentiment_descriptive_stats']) 
              for entity
              in cfg['entities']]
# Concat to single df
sentiments = pd.concat(sentiments, axis=1)

#### Concat sentiment features and original df

In [32]:
texts_with_sentiment_info = pd.concat([df, sentiments], axis=1).drop(labels=['textacy_doc'], axis=1)

In [33]:
texts_with_sentiment_info.columns

Index(['text', 'sentiment_label', 'characters_neg_count',
       'characters_neg_mean', 'characters_neg_min', 'characters_neg_25%',
       'characters_neg_50%', 'characters_neg_75%', 'characters_neg_max',
       'characters_neu_count',
       ...
       'villain_pos_50%', 'villain_pos_75%', 'villain_pos_max',
       'villain_compound_count', 'villain_compound_mean',
       'villain_compound_min', 'villain_compound_25%', 'villain_compound_50%',
       'villain_compound_75%', 'villain_compound_max'],
      dtype='object', length=114)

In [34]:
len(texts_with_sentiment_info.columns)

114

## Export features

In [10]:
now = datetime.datetime.now().strftime("%Y-%m-%d %H-%M")
archive_output_path = 'output/{}.csv'.format(now)
logging.info("""Outputting sentiments to {}""".format(archive_output_path))
texts_with_sentiment_info.to_csv(archive_output_path, index=False)
print("""Outputting sentiments to {}""".format(archive_output_path))