### Imports and constants

In [None]:
import spacy
import re
import datetime
import pickle

from spacy_spanish_lemmatizer import SpacyCustomLemmatizer


# =================== CONSTANTS ===================
# Single-character separators. DASH and COLON are used by get_corpus to split
# a raw line into timestamp / sender / message; SPACE and DOT are not
# referenced in the visible part of this file.
SPACE = ' '
COLON = ':'
DASH = '-'
DOT = '.'

# Minimum number of processed words a unit must keep to be retained by
# pre_processing.
UNIT_THRESHOLD = 5
# Maximum gap, in seconds, between two consecutive messages from the same
# sender for get_corpus to merge them into a single unit.
TIME_THRESHOLD = 600
# Processed words shorter than this many characters are discarded.
MIN_WORD_LENGTH = 3

# Model name handed to spacy.load() in pre_processing.
LANGUAGE = 'es'

### UNIT CLASS

In [None]:
import datetime

class Unit:
    """A chat message, or a run of merged messages, from a single sender.

    Attributes:
        dt: ``datetime.datetime`` parsed from the raw timestamp passed to the
            constructor, or whatever was last given to ``set_dt``.
        sender: Sender name exactly as passed in.
        message: Message text; grows when ``add_text`` is called.
    """

    def __init__(self, dt, sender, message):
        """Build a unit from a raw ``'M/D/YY, HH:MM'`` timestamp string.

        Raises:
            ValueError: If ``dt`` does not match the expected layout.
        """
        self.dt = self.get_datetime(dt)
        self.sender = sender
        self.message = message

    def same_sender(self, sender):
        """Return True when ``sender`` equals this unit's sender."""
        return self.sender == sender

    def add_text(self, text):
        """Append ``text`` to the message, separated by ``'. '``."""
        if not self.message:
            # Nothing to separate from; avoid a leading '. '.
            self.message = text
            return
        # Join the existing message with the new text so the separator is
        # actually inserted between them.
        self.message = '. '.join([self.message, text])

    def set_dt(self, dt):
        """Replace the stored timestamp with ``dt`` (a ``datetime``)."""
        self.dt = dt

    def timedelta(self, dt):
        """Return the absolute distance to ``dt`` in whole seconds.

        ``total_seconds()`` is used rather than ``.seconds`` because the
        latter drops the days component, which would make timestamps that
        are days apart look only seconds apart.
        """
        return int(abs(dt - self.dt).total_seconds())

    def get_datetime(self, dt):
        """Parse a ``'M/D/YY, HH:MM'`` string into a ``datetime.datetime``.

        Two-digit years are taken to be in the 2000s; years that already
        carry the century are used unchanged.

        Raises:
            ValueError: If the string does not split into the expected
                date and time fields or a field is not an integer.
        """
        date, time = dt.split(',')
        month, day, year = date.split('/')
        hour, minutes = time.split(':')
        year = int(year)
        if year < 100:
            year += 2000
        return datetime.datetime(year, int(month), int(day), int(hour), int(minutes))

### GET_CORPUS AND PRE_PROCESSING

In [None]:
# Ordered [pattern, replacement] pairs. get_corpus applies them in this order
# with re.sub to the lower-cased "sender: message" part of every line, so they
# also affect the sender name. None of the patterns is anchored to word
# boundaries, so they match inside longer words as well.
reg = [
    # Collapse a run of three or more identical word characters into one.
    [r'(\w)\1{2,}',r'\1'],
    # Delete "qu" followed by any single word character.
    # NOTE(review): presumably aimed at "que"/"qué"-style fillers — confirm.
    [r'qu\w',''],
    # Replace "no" plus the following word character with just "no".
    [r'no\w','no'],
    # Delete "jaj" followed by one or more "j"/"a" characters.
    ["jaj[ja]+",""],
    # Expand the abbreviation "mñna".
    ["mñna", "mañana"],
    # Expand "cba"; the replacement is capitalised although the input text
    # has already been lower-cased.
    ["cba", "Cordoba"],
    # Reduce "si" followed by one or more "s"/"i" characters to "si".
    ["si[si]+", "si"]
]
# Lookup table applied by pre_processing to each lower-cased lemma.
# A non-empty value substitutes the word; an empty value effectively removes
# it, because the empty string then fails the MIN_WORD_LENGTH check.
replace = {
    # Abbreviations expanded to a full word.
    "hno": "hermano",
    "hdp": "insulto",
    "dpto": "departamento",

    # Words mapped to '' and therefore dropped.
    'a':'',
    'y':'',
    'o':'',
    'dale':'',
    'hola':'',
    'ok':'',
    'oka':'',
    'ver':'',
    'estar':'',
    'decir':'',
    'pasar':'',
    'che':'',
    'ya':'',
    'yaa':'',
    'eia':'',
    'meno':'',
    'message':'',
    'deleted':'',
    'you':'',
    # NOTE(review): lookups are done per token, so this multi-word key only
    # matches if a single token's lemma equals it — confirm it is reachable.
    '<media omitted>':''
}

In [None]:
def get_corpus(corpus_filepath):
    """Parse a chat export file into a list of ``Unit`` objects.

    Each line is expected to look like ``date, time - sender: message``.
    The whole text is lower-cased, the ``reg`` substitutions are applied to
    the ``sender: message`` part, and consecutive messages from the same
    sender that are less than ``TIME_THRESHOLD`` seconds apart are merged
    into one unit. Lines that do not match the expected layout are skipped.

    Args:
        corpus_filepath: Path of the text file to read.

    Returns:
        List of ``Unit`` objects in file order.
    """
    # Decode explicitly so accented characters do not depend on the platform
    # default encoding.
    # NOTE(review): assumes the export is UTF-8 — confirm for your data.
    with open(corpus_filepath, encoding='utf-8') as f:
        lines = f.read().lower().split('\n')

    corpus = []
    for line in lines:
        # Split on the first dash only, so dashes inside the message text do
        # not cut the message short.
        header_and_body = line.split(DASH, 1)
        if len(header_and_body) < 2:
            continue
        stamp, body = header_and_body

        for pattern, substitute in reg:
            body = re.sub(pattern, substitute, body)

        if len(stamp.split(',')) < 2:
            continue
        # Likewise split on the first colon only, keeping colons that belong
        # to the message (URLs, times, ...).
        sender_and_text = body.split(COLON, 1)
        if len(sender_and_text) < 2:
            continue
        sender, text = sender_and_text

        try:
            unit = Unit(stamp, sender, text)
        except ValueError:
            # The line passed the loose checks above but its prefix is not a
            # real timestamp (e.g. a continuation line of a multi-line
            # message); skip it like any other non-matching line.
            continue

        previous = corpus[-1] if corpus else None
        if (previous is not None
                and previous.same_sender(unit.sender)
                and previous.timedelta(unit.dt) < TIME_THRESHOLD):
            # Same sender shortly after: extend the previous unit and slide
            # its timestamp forward so the window is measured from the most
            # recent message.
            previous.add_text(unit.message)
            previous.set_dt(unit.dt)
        else:
            corpus.append(unit)
    return corpus

In [None]:
def pre_processing(corpus_filepath):
    """Tokenise, lemmatise and filter the units of a chat export.

    For every unit returned by ``get_corpus`` the message is run through the
    spaCy pipeline (with the Spanish lemmatizer added after the tagger).
    Non-alphabetic tokens and stop words are dropped, remaining tokens are
    replaced by their lower-cased lemma, passed through the ``replace``
    table, and discarded if shorter than ``MIN_WORD_LENGTH``. Units that end
    up with fewer than ``UNIT_THRESHOLD`` words are skipped.

    Args:
        corpus_filepath: Path of the chat export file.

    Returns:
        Tuple ``(messages, original_messages)``: ``messages`` is a list of
        word lists, and ``original_messages`` holds the corresponding
        unprocessed message strings at the same indices.
    """
    corpus = get_corpus(corpus_filepath)
    messages = []
    original_messages = []
    nlp = spacy.load(LANGUAGE)
    # Add lemmatizer from https://github.com/pablodms/spacy-spanish-lemmatizer
    lemmatizer = SpacyCustomLemmatizer()
    nlp.add_pipe(lemmatizer, name="lemmatizer", after="tagger")
    print('Number of units {}'.format(len(corpus)))
    for unit in corpus:
        doc = nlp(unit.message)
        process_words = []
        for token in doc:
            # Drop non-alphabetic tokens and stop words. isalpha must be
            # *called*; the bare method object is always truthy, which would
            # make the check a no-op.
            if not token.text.isalpha() or token.is_stop:
                continue
            word = token.lemma_.lower()
            # Remove or replace specific words.
            word = replace.get(word, word)
            # Remove words shorter than MIN_WORD_LENGTH characters (this also
            # discards words mapped to '' above).
            if len(word) < MIN_WORD_LENGTH:
                continue
            process_words.append(word)
        # Skip units which have fewer than UNIT_THRESHOLD words.
        if len(process_words) < UNIT_THRESHOLD:
            continue
        original_messages.append(unit.message)
        messages.append(process_words)
    return messages, original_messages