In [None]:
import pandas as pd
# Display the installed pandas version as this cell's output.
pd.__version__

### Data Preprocessing 

This notebook preprocesses the initial data set, prepares the data for sentiment analysis, and sets up the features used by the machine learning model. In particular:

* The response time for each message is quantified, using the time stamps provided for each message. 

* The text of messages is cleaned up: web addresses are replaced with a `<URL>` placeholder, and new lines and other formatting artifacts in the text (such as `▪` bullets and `<NUM>` tokens) are replaced with spaces.

* The length of conversations is quantified as the number of messages sent. 

* Messages are numbered based on their order in the conversation.

In [None]:
# Load the gzip-compressed pickle of chat messages into a DataFrame.
# NOTE: unpickling can execute code stored in the file, so only load
# archives that come from a trusted source.
df = pd.read_pickle('chat_messages.pkl.gz', compression='gzip')

In [None]:
import dill

# Snapshot the raw frame to disk. The context manager guarantees the file is
# flushed and closed even if serialization fails part-way.
with open('df_raw.pkd', 'wb') as raw_file:
    dill.dump(df, raw_file)

# To restore the snapshot instead of re-reading the original archive:
# with open('df_raw.pkd', 'rb') as raw_file:
#     df = dill.load(raw_file)

### Response Time

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class ResponseTime(BaseEstimator, TransformerMixin):
    '''Compute the response delay (in whole seconds) of every message.

    The delay of a message is the time elapsed since the previous row,
    provided both rows share the same ``channel_id``; otherwise it is 0.
    Rows are compared in their existing order, so the frame is expected to
    be sorted by conversation and then by time before it is passed in.
    '''

    def convert_timedelta(self, duration):
        '''Split a timedelta into ``(days, hours, minutes, seconds)``.

        Parameters
        ----------
        duration : object exposing ``days`` and ``seconds`` attributes
            (e.g. ``datetime.timedelta`` or ``pandas.Timedelta``).

        Returns
        -------
        tuple of int
            ``(days, hours, minutes, seconds)``.
        '''
        days, seconds = duration.days, duration.seconds
        hours = seconds // 3600
        minutes = (seconds % 3600) // 60
        seconds = seconds % 60
        # Must be a tuple literal: tuple() takes a single iterable, so
        # tuple(days, hours, minutes, seconds) raises TypeError.
        return (days, hours, minutes, seconds)

    def fit(self, X, y=None):
        '''No-op fit so the class satisfies the scikit-learn transformer API.'''
        return self

    def transform(self, X):
        '''Scikit-learn entry point; delegates to :meth:`transformer`.'''
        return self.transformer(X)

    def transformer(self, X):
        '''Add a ``response_time`` column to ``X`` in place and return ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``channel_id`` and ``created_at`` columns.

        Returns
        -------
        pandas.DataFrame
            The same object that was passed in, with an integer
            ``response_time`` column (seconds; sub-second parts dropped).
            The first message of each run of equal ``channel_id`` values,
            and any row whose delay cannot be computed (missing timestamp),
            gets 0.
        '''
        # Positional, vectorized comparison with the previous row. Unlike a
        # label-based loop over range(len(X)), this works for any index.
        timestamps = pd.to_datetime(X['created_at'])
        elapsed = timestamps.diff()
        # days*86400 + seconds keeps whole seconds only (microseconds dropped).
        whole_seconds = elapsed.dt.days * 86400 + elapsed.dt.seconds
        same_channel = X['channel_id'].eq(X['channel_id'].shift())

        X['response_time'] = (whole_seconds
                              .where(same_channel, 0)
                              .fillna(0)
                              .astype('int64'))
        return X

In [None]:
rt = ResponseTime()
# transformer() adds the response_time column to df in place; preview a few
# rows rather than rendering the entire frame as cell output.
rt.transformer(df).head()

### Cleaning Text

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import string
import re

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

class CleanText(BaseEstimator, TransformerMixin):
    '''Clean raw message text held in a pandas Series of strings.

    ``transform`` applies ``remove_https`` followed by ``remove_spaces``.
    The remaining helpers (``remove_punct``, ``to_lower``, ``remove_num``)
    work on a single string and can be applied separately when needed.
    '''

    def remove_punct(self, message):
        '''Replace every ASCII punctuation character with a space.'''
        punct = string.punctuation
        trans = str.maketrans(punct, len(punct)*' ')
        return message.translate(trans)

    def to_lower(self, message):
        '''Return the message in lower case.'''
        return message.lower()

    def remove_num(self, message):
        '''Replace every run of digits with a single space.

        Returns the cleaned string (not the transformer itself), so the
        method can be used with ``Series.apply`` like the other helpers.
        '''
        return re.sub(r'\d+', ' ', message)

    def remove_https(self, message):
        '''Replace each web address (``www.…`` / ``http(s)://…``) with ``<URL>``.'''
        # One pass is enough: after substitution no address is left to match.
        return re.sub(r'((www\.[\S]+)|(https?://[\S]+))', '<URL>', message)

    def remove_spaces(self, message):
        '''Turn filler symbols into spaces and collapse whitespace runs.

        ``▪``, ``<NUM>``, ``+`` and ``-`` are each replaced with a space;
        afterwards any run of whitespace (including new lines and carriage
        returns) is collapsed into a single space.
        '''
        message = re.sub(r'▪|<NUM>|[+\-]', ' ', message)
        # Collapse last so the replacements above cannot leave double spaces.
        message = re.sub(r'[\s]+', ' ', message)
        return message

    def fit(self, X, y=None):
        '''No-op fit; the transformer is stateless.'''
        return self

    def transform(self, X):
        '''Return a new Series with URLs tokenized and whitespace normalized.

        Parameters
        ----------
        X : pandas.Series of str
            Raw message texts.
        '''
        clean_X = (X
                   .apply(self.remove_https)
                   .apply(self.remove_spaces))
        return clean_X

In [None]:
# Instantiate the cleaner and store the cleaned text next to the raw message.
ct = CleanText()
df['clean_message'] = df['message'].pipe(ct.transform)

### Conversation Length

In [None]:
# convo_length: total number of messages in the row's conversation (channel).
df['convo_length'] = df['channel_id'].pipe(
    lambda channel_ids: channel_ids.map(channel_ids.value_counts())
)
# convo_num: 1-based position of the message within its conversation,
# following the current row order of the frame.
df['convo_num'] = df.groupby(['channel_id']).cumcount().add(1)

In [None]:
# Persist the preprocessed frame; the context manager closes the file handle
# so the data is fully flushed to disk.
with open('df_preprocessed.pkd', 'wb') as preprocessed_file:
    dill.dump(df, preprocessed_file)