# Sentiment detection with spaCy

## Preparations
Various settings:

In [1]:
# Imported here (not in the import cell below) so that this settings cell
# still works when the notebook is run top to bottom on a fresh kernel.
import os

# Path of the database file holding the feedback texts. The default is the
# original hard-coded location; set the environment variable TELLERS_DB_PATH
# to run the notebook against a different file without editing it.
tellers_db_path = os.environ.get('TELLERS_DB_PATH', '/tmp/tellers.db')

# Path of the lexicon CSV file (presumably the lexicon used for the lemma
# matching later in the notebook). The default is the original,
# machine-specific location; override it with TELLERS_LEXICON_CSV_PATH.
lexicon_csv_path = os.environ.get(
    'TELLERS_LEXICON_CSV_PATH',
    '/home/roskakori/workspace/sel_thesis/tellersdwh/lexicon_de.csv')

Import various modules we are going to need:

In [2]:
# Python standard library
import re
import sqlite3
from contextlib import closing

# SpaCy
import spacy

## Read feedbacks from database

Connect to the database with feedback texts:

In [3]:
# Open the SQLite database configured in the settings cell instead of
# repeating the path literal, so the setting is the single source of truth.
connection = sqlite3.connect(tellers_db_path)

Class to store a feedback and the related question:

In [4]:
class Feedback:
    """A single feedback text together with the question it answers.

    Attributes:
        question_id: identifier of the related question.
        question: text of the related question.
        feedback_id: identifier of the feedback.
        feedback: text of the feedback; later cells rewrite this attribute
            while cleaning up the text.
    """

    def __init__(self, question_id: int, question: str, feedback_id: int, feedback: str) -> None:
        # Question part.
        self.question_id, self.question = question_id, question
        # Feedback part.
        self.feedback_id, self.feedback = feedback_id, feedback

Read the feedback documents, where each feedback can consist of multiple sentences:

In [5]:
# Query yielding one row per feedback in the column order
# (question_id, question text, feedback_id, feedback text), restricted to
# feedbacks of the source named 'tellers' with feedback_time from
# 2017-10-01 onwards.
select_feedback_sql = """
    select
        que.question_id,
        que.text,
        fdb.feedback_id,
        fdb.text
    from
        feedback as fdb
        join source as src on
            src.source_id = fdb.source_id
        join question as que on
            que.question_id = fdb.question_id
    where 1 = 1
        and fdb.feedback_time >= '2017-10-01'
        and src.name = 'tellers'
"""

# The column order of the query matches the positional parameters of
# Feedback, so each row can be unpacked directly into the constructor.
with closing(connection.cursor()) as cursor:
    cursor.execute(select_feedback_sql)
    feedbacks = [Feedback(*row) for row in cursor]
print('found {} feedback documents'.format(len(feedbacks)))

found 298 feedback documents


## Cleanup texts for further processing
Replace certain abbreviations that would confuse spaCy when detecting sentence boundaries:

In [6]:
# Abbreviations (written without their trailing dot) and the full word each
# one is expanded to.
# NOTE: the constant keeps its original (misspelled) name because other cells
# may refer to it.
ABBREVIATOION_TO_EXPANDED_MAP = {
    'ca': 'circa',
    'ev': 'eventuell',
    'max': 'maximal',
    'vlt': 'vielleicht',
}

# One precompiled (pattern, replacement) pair per abbreviation.
#
# The pattern is anchored with the zero-width word boundary r'\b' instead of
# r'\W': r'\W' consumed the character in front of the abbreviation (turning
# "kostet ca. 5" into "kostetcirca  5") and could never match an abbreviation
# at the very beginning of a text. The abbreviation's trailing dot is replaced
# by a blank so it no longer looks like the end of a sentence.
_ABBREVIATION_PATTERNS = [
    (re.compile(r'\b' + re.escape(abbreviation) + r'\.'), expanded + ' ')
    for abbreviation, expanded in ABBREVIATOION_TO_EXPANDED_MAP.items()
]

for feedback in feedbacks:
    for pattern, replacement in _ABBREVIATION_PATTERNS:
        feedback.feedback = pattern.sub(replacement, feedback.feedback)

Build a map of emojis (both western and eastern) to a distinct text form:

In [7]:
_EMOJI_PREFIX = 'emoji__'
_EMOJI_TO_NAME_MAP = {
    # Western
    ':)': 'slight_smile',
    ':-)': 'slight_smile',
    '=)': 'slight_smile',
    ':(': 'slight_frown',
    ':-(': 'slight_frown',
    ':D': 'smile',
    ':-D': 'smile',
    ':P': 'stuck_out_tongue',
    ':-P': 'stuck_out_tongue',
    ';)': 'wink',
    ';-)': 'wink',
    # Eastern
    '^^': 'slight_smile',
    '^_^': 'slight_smile',
}
_EMOJI_TO_TEXT_MAP = {
    emoji: ' ' + _EMOJI_PREFIX + name + ' '
    for emoji, name in _EMOJI_TO_NAME_MAP.items()
}

Replace emojis by text:

In [8]:
# Substitute every known emoji in each feedback text by its text form,
# applying the replacements in the order of _EMOJI_TO_TEXT_MAP.
for feedback in feedbacks:
    text = feedback.feedback
    for emoji, emoji_text in _EMOJI_TO_TEXT_MAP.items():
        text = text.replace(emoji, emoji_text)
    feedback.feedback = text

## Split into sentences
Define a class to hold a single opinion:

In [9]:
class Opinion:
    """One sentence of a feedback plus the slots for its analysis result.

    Attributes:
        feedback_id: identifier of the feedback the sentence belongs to.
        sentence_nr: number of the sentence within its feedback.
        text: the sentence itself, stored exactly as passed in.
        topic: initially None.
        rating: initially None.
    """

    def __init__(self, feedback_id, sentence_nr, text):
        # Nothing has been detected yet for a fresh opinion.
        self.topic = self.rating = None
        self.feedback_id = feedback_id
        self.sentence_nr = sentence_nr
        self.text = text

Split the feedbacks into sentences and assign each sentence to an opinion:

In [10]:
# Load the spaCy pipeline registered under the name 'de'; the next cell uses
# it to split each feedback into sentences.
# NOTE(review): the bare 'de' name relies on spaCy 2.x shortcut links; spaCy 3
# expects a full package name such as 'de_core_news_sm' - confirm against the
# installed spaCy version.
nlp = spacy.load('de')

In [11]:
# Create one Opinion per sentence that spaCy detects in a feedback. Sentences
# are numbered from 1 within each feedback, and the sentence object yielded
# by spaCy is stored unchanged.
opinions = []
for feedback in feedbacks:
    document = nlp(feedback.feedback)
    for sentence_nr, sentence in enumerate(document.sents, start=1):
        opinions.append(Opinion(feedback.feedback_id, sentence_nr, sentence))
print('found', len(opinions), 'opinions')

found 481 opinions


Now we need a lexicon to match the lemmas.