In [183]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text  import CountVectorizer
import emoji
import regex
from nltk.tokenize import RegexpTokenizer
from string import punctuation
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
import re
from scipy.sparse import hstack
from sklearn.base import BaseEstimator, TransformerMixin
from scipy import sparse
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from nltk.corpus import wordnet
import nltk
from nltk.corpus import stopwords
import os

In [117]:
# Module-level helpers shared by the counter functions defined further down:
# the NLTK English stop-word list and a tokenizer that keeps runs of word characters.
english_stopwords = stopwords.words('english')
word_tokenizer = RegexpTokenizer(r'\w+')

In [166]:
class EmoticonsCounter():
    """Counts occurrences of emoticons, grouped by mood, in conversation turns.

    `mood_emoticons` is a sequence of emoticon groups (one group per mood);
    every count vector produced here has one entry per group, in that order.
    """

    def __init__(self, mood_emoticons):
        self.mood_emoticons = mood_emoticons

    def count_substrings(self, sentence, substrings):
        """Return the summed `str.count` of every entry of `substrings` in `sentence`."""
        return sum(sentence.count(piece) for piece in substrings)

    def get_emo_count_row(self, row):
        """Return a list with one emoticon count per mood group for the text `row`."""
        return [self.count_substrings(row, group) for group in self.mood_emoticons]

    def get_emo_count_1turn(self, turn):
        """Apply `get_emo_count_row` to every text of the Series `turn`."""
        return turn.apply(self.get_emo_count_row)

    def get_emo_count_per_turn(self, data):
        """Return a CSR matrix of shape (rows, columns * moods) for the frame `data`.

        Counts are laid out column by column: all mood counts of the first
        column, then all mood counts of the second column, and so on.
        """
        n_rows, n_turns = data.shape[0], data.shape[1]
        counts = np.zeros((n_rows, n_turns, len(self.mood_emoticons)))
        for turn_idx, turn_name in enumerate(data.columns):
            per_row = self.get_emo_count_1turn(data[turn_name]).values
            counts[:, turn_idx, :] += np.stack(per_row)
        return sparse.csr_matrix(counts.reshape(n_rows, -1))

In [119]:
# Emoticon vocabularies, one list per mood. These are the groups passed to
# EmoticonsCounter, so the order of the lists there fixes the feature order.
happy_emoticons = ':) ;) =) :] :P :-P :D ;D :> :3 :-) ;-) :^) :o) :~) ;^) ;o) :-D :-> XD xD'.split()
sad_emoticons = ':( =( :-( :^( :o( :-<'.split(' ')
# The backslash is escaped explicitly: a bare "\ " is an invalid escape sequence
# that newer Python versions warn about. The resulting list is unchanged.
angry_emoticons = '>:S >:{ >: x-@ :@ :-@ :-/ :-\\ :/'.split(' ')
afraid_suprised_emoticons = ':-o :-O o_O O_o :$'.split(' ')
sleepy_emoticons = '-_- ~_~'.split(' ')

In [120]:
def filter_emoticons(row):
    """Return every western-style emoticon found in the text `row`.

    An emoticon is matched as: one "eyes" character, an optional "nose"
    character, then one "mouth" character.
    """
    eyes_cls, nose_cls, mouth_cls = (
        re.escape(chars) for chars in (r":;8BX=>-~", r"-~'^_", r")(/\|DP<-~")
    )
    pattern1 = "[%s][%s]?[%s]" % (eyes_cls, nose_cls, mouth_cls)
    return re.findall(pattern1, row)

def counter_emoticons(row):
    """Return how many emoticons `filter_emoticons` finds in `row`."""
    found = filter_emoticons(row)
    return len(found)

def filter_emojis(row):
    """Return the characters of `row` that are emojis, in order of appearance.

    The lookup table is resolved at call time so the function works across
    releases of the `emoji` package: newer releases expose `EMOJI_DATA`,
    older ones `UNICODE_EMOJI`, which is either a flat mapping or nested by
    language code (the English table is used in that case).
    """
    emoji_table = getattr(emoji, 'EMOJI_DATA', None)
    if emoji_table is None:
        legacy_table = emoji.UNICODE_EMOJI
        # A language-nested table has an 'en' key; a flat table does not.
        emoji_table = legacy_table.get('en', legacy_table)
    return [c for c in row if c in emoji_table]

def counter_emojis(row):
    """Return the number of emoji characters `filter_emojis` finds in `row`."""
    found = filter_emojis(row)
    return len(found)

def filter_upperletters(row):
    """Return the upper-case characters of `row`, in order of appearance."""
    uppercase = []
    for char in row:
        if char.isupper():
            uppercase.append(char)
    return uppercase

def counter_upperletters(row):
    """Return the number of upper-case characters in `row`."""
    uppercase = filter_upperletters(row)
    return len(uppercase)

def counter_upperletters_per_word(row):
    """Return upper-case characters per word token of `row` (0 when there are no tokens)."""
    n_words = len(word_tokenizer.tokenize(row))
    if not n_words > 0:
        return 0
    return len(filter_upperletters(row)) / n_words

def filter_punctuations(row):
    """Return the sentence-terminating characters ('.', '?', '!') found in `row`."""
    terminals = ".?!"
    kept = []
    for char in row:
        if char in terminals:
            kept.append(char)
    return kept

def counter_punctuations(row):
    """Return how many '.', '?' and '!' characters `row` contains."""
    terminals_found = filter_punctuations(row)
    return len(terminals_found)

def counter_words(row):
    """Return the number of word tokens `word_tokenizer` extracts from `row`."""
    tokens = word_tokenizer.tokenize(row)
    return len(tokens)

def counter_chars(row):
    """Return the length of `row` in characters."""
    n_chars = len(row)
    return n_chars

def counter_avg_word_length(row):
    """Return the mean length of the word tokens in `row` (0 when there are none)."""
    words = word_tokenizer.tokenize(row)
    if not len(words) > 0:
        return 0
    total_length = sum(map(len, words))
    return total_length / len(words)

def counter_stopwords(row):
    """Return how many whitespace-separated words of `row` are in `english_stopwords`.

    The comparison is exact (case-sensitive, punctuation kept attached).
    """
    return sum(1 for word in row.split() if word in english_stopwords)
    
def count_on_rows(corpus, count_function):
    """Apply `count_function` to every item of `corpus`.

    Returns a float array with `corpus.shape[0]` entries holding the results
    in iteration order.
    """
    n_rows = corpus.shape[0]
    totals = np.zeros((n_rows))
    for position, text in enumerate(corpus):
        totals[position] = totals[position] + count_function(text)
    return totals

def count_per_turns(data, count_function):
    """Apply `count_function` to every text of turn1, turn2 and turn3.

    Returns a float array of shape (3, n_rows): one row per turn column.
    """
    columns = ['turn1', 'turn2', 'turn3']
    results = np.zeros((len(columns), data.shape[0]))
    for turn_idx in range(len(columns)):
        results[turn_idx, :] = count_on_rows(data[columns[turn_idx]], count_function)
    return results

def get_most_frequent_thing_per_turns(data, frequence_function, top=30):
    """Run `frequence_function(column, top)` on each of turn1, turn2 and turn3.

    Returns a dict keyed by turn name holding whatever the function returns.
    """
    return {
        column: frequence_function(data[column], top)
        for column in ['turn1', 'turn2', 'turn3']
    }

def get_most_frequent_thing_per_turns_per_label(data, label, frequence_function, top=30):
    """Like `get_most_frequent_thing_per_turns`, restricted to rows whose `label` column equals `label`.

    Returns a dict keyed by turn name with `frequence_function(column, top)`
    evaluated on the matching rows only.
    """
    subset = data[data.label==label]
    return {
        column: frequence_function(subset[column], top)
        for column in ['turn1', 'turn2', 'turn3']
    }

def count_emojis(corpus, top_n=30):
    """Return the `top_n` most frequent emoji characters in `corpus`.

    Parameters
    ----------
    corpus : iterable of str
        Texts to scan character by character.
    top_n : int
        Maximum number of (emoji, count) pairs to return.

    Returns
    -------
    list of (str, int)
        Pairs sorted by descending count; emojis with equal counts are
        ordered by code point.
    """
    from collections import Counter

    # A single Counter replaces repeated np.append calls, which copied the
    # whole accumulated array for every row.
    tally = Counter()
    for row in corpus:
        tally.update(filter_emojis(row))
    ranked = sorted(tally.items(), key=lambda pair: (-pair[1], pair[0]))
    return ranked[:top_n]

def count_emojis_by_label(data, concrete_label, top_n=30):
    """Return the `top_n` most frequent emojis among rows labelled `concrete_label`.

    The three turns of each matching row are joined with spaces and the
    result is passed to `count_emojis`.
    """
    data = data[data['label'] == concrete_label]
    # Every turn is cast to str (turn3 previously was not), so a missing value
    # in any turn no longer turns the joined text into NaN.
    all_in_one = (data['turn1'].map(str) + ' '
                  + data['turn2'].map(str) + ' '
                  + data['turn3'].map(str))
    return count_emojis(all_in_one, top_n)

# def count_emoticons(corpus, top_n=10):
#     ## Source:
#     ## https://stackoverflow.com/questions/20582030/detect-emoticon-in-a-sentence-using-regex-python
#     eyes, noses, mouths = r":;8BX=>-~", r"-~'^_", r")(/\|DP<-~"
#     pattern1 = "[%s][%s]?[%s]" % tuple(map(re.escape, [eyes, noses, mouths]))
#     ## ------
#     emoticons = np.array([])
#     for row in corpus:
#         emoticons = np.append(emoticons, re.findall(pattern1, row))
#     unique, counts = np.unique(emoticons, return_counts=True)
#     emoticons_count_dict = dict(zip(unique, counts))
#     emoticons_count_dict = sorted(emoticons_count_dict.items(), key=lambda x:x[1], reverse=True)
      
#     return emoticons_count_dict[:top_n]

# def count_emoticons_by_label(data, concrete_label, top_n=30):
#     data = data[data['label'] == concrete_label]
#     all_in_one = data['turn1'].map(str) + ' ' + data['turn2'].map(str) + ' ' + data['turn3']
#     return count_emoticons(all_in_one, top_n)

In [121]:
##https://drive.google.com/file/d/0B1yuv8YaUVlZZ1RzMFJmc1ZsQmM/view
# Contraction -> expansion table used for token replacement.
# Keys are lower-case; each key appears exactly once.
APPO = {
"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
# "i'd" used to be listed twice ("I would", then "I had"); only the last entry
# ever took effect, so that effective value is the one kept here.
"i'd" : "I had",
"i'll" : "I will",
"i'm" : "I am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "I have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"that's" : "that is",
"there's" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
"'re": " are",
"wasn't": "was not",
# Was " will", which silently dropped the pronoun.
"we'll": "we will",
"tryin'":"trying",
}

In [122]:
# Replacement table: the contraction expansions plus a few chat abbreviations.
replaces = {
    **APPO,
    'ur': 'you are',
    'u': 'you',
    'r': 'are',
    'yr': 'your',
}


In [123]:
from IPython.display import display

In [124]:
data_path = 'D:\\Machine Learning\\Datasets\\EmoContext'

# The three splits share one layout: tab-separated, indexed by the `id` column.
train_data, dev_data, test = (
    pd.read_csv(data_path + split_file, sep='\t', index_col='id')
    for split_file in ('\\train.txt', '\\dev.txt', '\\testwithoutlabels.txt')
)

In [125]:
# Every column but the last is input text; the last column is the target.
X, y = train_data.iloc[:, :-1], train_data.iloc[:, -1]

In [126]:
# Show a short preview only: rendering up to 1000 rows bloats the notebook
# without adding information.
display(X.head(10))
    

Unnamed: 0_level_0,turn1,turn2,turn3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Don't worry I'm girl,hmm how do I know if you are,What's ur name?
1,When did I?,saw many times i think -_-,No. I never saw you
2,By,by Google Chrome,Where you live
3,U r ridiculous,I might be ridiculous but I am telling the truth.,U little disgusting whore
4,Just for time pass,wt do u do 4 a living then,Maybe
5,I'm a dog person,youre so rude,Whaaaat why
6,So whatsup,Nothing much. Sitting sipping and watching TV....,What are you watching on tv?
7,Ok,ok im back!!,"So, how are u"
8,Really?,really really really really really,Y saying so many times...i can hear you
9,Bay,in the bay,😘 love you


In [167]:
# One emoticon group per mood; the group order fixes the feature order.
emo_counter = EmoticonsCounter(mood_emoticons=[
    happy_emoticons,
    sad_emoticons,
    angry_emoticons,
    afraid_suprised_emoticons,
    sleepy_emoticons,
])

# Smoke test on the first 25 rows.
emo_counter.get_emo_count_per_turn(X.iloc[:25])

<25x15 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

TF-IDF, emoticons, emojis, punctuation, best features with LightGBM,
logistic regression, chi-squared test, apostrophe expansion used

In [127]:
def filter_text(data, filter_txt):
    """Return a copy of `data` in which every text is replaced by the joined output of `filter_txt`.

    `filter_txt(text)` must return an iterable of strings; the pieces are
    concatenated without a separator. `data` itself is left untouched.
    """
    filtered = data.copy()
    for position, text in enumerate(data):
        filtered.iloc[position] = ''.join(filter_txt(text))
    return filtered

def filter_text_setFilt(data, set_of):
    """Return a copy of `data` in which every text keeps only the characters contained in `set_of`."""
    filtered = data.copy()
    for position, text in enumerate(data):
        kept_chars = filter(lambda char: char in set_of, text)
        filtered.iloc[position] = ''.join(kept_chars)
    return filtered

def filter_text_allturns_3SetFilters(data, setFilters):
    """Filter the first three columns of `data`, each with its own allowed-character set.

    Column `i` keeps only the characters contained in `setFilters[i]`.
    Returns a filtered copy; `data` is not modified.
    """
    filtered_data = data.copy()
    n_rows = data.shape[0]
    for turn_idx in range(3):
        for row_idx in range(n_rows):
            kept_chars = [
                char for char in data.iat[row_idx, turn_idx]
                if char in setFilters[turn_idx]
            ]
            filtered_data.iat[row_idx, turn_idx] = ''.join(kept_chars)
    return filtered_data

def filter_text_allturns(data, set_of):
    """Filter the first three columns of `data` with one shared allowed-character collection.

    Every text keeps only the characters contained in `set_of`.
    Returns a filtered copy; `data` is not modified.
    """
    filtered_data = data.copy()
    n_rows = data.shape[0]
    for turn_idx in range(3):
        for row_idx in range(n_rows):
            kept_chars = [char for char in data.iat[row_idx, turn_idx] if char in set_of]
            filtered_data.iat[row_idx, turn_idx] = ''.join(kept_chars)
    return filtered_data
    

In [128]:
def getVectorized(corpus):
    """Fit a single-character CountVectorizer on `corpus` and return its count matrix."""
    char_vectorizer = CountVectorizer(analyzer = 'char',ngram_range=(1,1))
    char_counts = char_vectorizer.fit_transform(corpus)
    return char_counts

def getVectorized3Turns(data):
    """Vectorise turn1, turn2 and turn3 independently and stack the matrices side by side.

    Each turn gets its own freshly fitted character vectorizer (see `getVectorized`).
    """
    per_turn = [getVectorized(data[turn]) for turn in ('turn1', 'turn2', 'turn3')]
    return hstack(per_turn)
    
    

In [130]:
def features_counters(data, counter_funcs):
    """Build a sparse feature matrix from per-text counter functions.

    For each function in `counter_funcs`, one column per column of `data`
    is produced (the function applied to every text of that column). The
    per-function blocks are stacked horizontally in the order given.
    """
    blocks = []
    for counter in counter_funcs:
        per_turn = [
            data[turn].map(lambda x: counter(x)).values
            for turn in data.columns
        ]
        blocks.append(sparse.csr_matrix(np.array(per_turn).T))
    return hstack(blocks)

In [149]:
class FeaturesBuilder(BaseEstimator, TransformerMixin):
    """Character-count features restricted to label-specific character sets.

    For each function in `filtersFunctions`, `fit` collects, per turn and per
    label, the items that function ranks as most frequent, reduces every text
    to those characters and fits one character CountVectorizer per turn.
    `transform` applies the same character sets and fitted vectorizers to new
    data. With `punctuationCount=True`, counts of '!', '?', '.' and ',' per
    turn are appended.

    Parameters
    ----------
    filtersFunctions : list of callables, optional
        Each callable has the signature `f(corpus, top)` and returns
        `(item, count)` pairs (e.g. `count_emojis`). `None` means no
        filter-based features.
    punctuationCount : bool
        Whether to append punctuation count features.

    Attributes
    ----------
    building_blocks : list of features_building_block
        One entry per filter function, holding its character sets and fitted
        vectorizers. Rebuilt on every `fit`.
    featuresSet : list
        Training feature matrices, one per filter function. Rebuilt on every `fit`.
    punctuation_vectorizers_ : list or None
        Per-turn vectorizers for punctuation, fitted in `fit`.
    ft : list
        Individual feature matrices produced by the last `transform` call.
    """

    def __init__(self, filtersFunctions=None, punctuationCount=False):
        # None replaces the former shared mutable default list.
        self.filtersFunctions = [] if filtersFunctions is None else filtersFunctions
        self.punctuationCount = punctuationCount
        self.building_blocks = []
        self.featuresSet = []

    def get_filtered_data(self, data, filters):
        """Return a copy of `data` keeping only the characters allowed by `filters`.

        A sequence with more than one element is treated as one character set
        per turn; a single set (or a sequence of at most one element) is
        applied to all turns.
        """
        # A plain set cannot be indexed per turn, so it always means "shared".
        shared = isinstance(filters, (set, frozenset)) or len(filters) <= 1
        if shared:
            return filter_text_allturns(data, filters)
        return filter_text_allturns_3SetFilters(data, filters)

    def getVectorized(self, corpus):
        """Fit a single-character CountVectorizer on `corpus` and return the counts."""
        cv = CountVectorizer(analyzer = 'char',ngram_range=(1,1))
        return cv.fit_transform(corpus)

    def getVectorized3Turns(self, data):
        """Fit one character vectorizer per turn.

        Returns `(features, vectorizers)`: the horizontally stacked count
        matrices of turn1..turn3 and the list of fitted vectorizers.
        """
        turns = []
        cvs = []
        for label in ['turn1', 'turn2', 'turn3']:
            cv = CountVectorizer(analyzer = 'char',ngram_range=(1,1))
            cv.fit(data[label])
            turns.append(cv.transform(data[label]))
            cvs.append(cv)
        return hstack(turns), cvs

    def get_filters(self, data, y, filter_func, separatedFilters=True):
        """Collect the characters `filter_func` ranks as most frequent for each label.

        Parameters
        ----------
        data : DataFrame with columns turn1, turn2, turn3.
        y : array-like of labels, one per row of `data`.
        filter_func : callable `f(corpus, top)` returning `(item, count)` pairs.
        separatedFilters : bool
            True returns `[set_turn1, set_turn2, set_turn3]`; False returns a
            single set merged over all turns.
        """
        # Labels are attached to the data passed in, rather than read from the
        # notebook-global training frame, so the result depends only on the arguments.
        labels = pd.Series(np.asarray(y))
        labelled = data.copy()
        labelled['label'] = labels.values
        top_filters = [
            get_most_frequent_thing_per_turns_per_label(labelled, label, filter_func)
            for label in labels.unique()
        ]

        turn_names = ['turn1', 'turn2', 'turn3']
        if separatedFilters:
            filters = [set() for _ in turn_names]
            for ranking in top_filters:
                for turn_set, turn in zip(filters, turn_names):
                    turn_set.update(item[0] for item in ranking[turn])
        else:
            filters = set()
            for ranking in top_filters:
                for turn in turn_names:
                    filters.update(item[0] for item in ranking[turn])

        return filters

    def get_features_label_based(self, data, filter_func, separatedFilters=True, y=None):
        """Learn filters from `data` and return its filtered character-count features.

        When `y` is omitted, the labels are read from `data['label']`.
        """
        if y is None:
            if 'label' not in data.columns:
                raise ValueError("pass `y` or provide a 'label' column in `data`")
            y = data['label']
        texts = data[['turn1', 'turn2', 'turn3']]
        filters = self.get_filters(texts, y, filter_func, separatedFilters)
        features, _ = self.getVectorized3Turns(self.get_filtered_data(texts, filters))
        return features

    def fit(self, X, y=None):
        """Learn the character sets and fit the per-turn vectorizers on `X`.

        `y` is required whenever `filtersFunctions` is non-empty.
        """
        filter_functions = list(self.filtersFunctions or [])
        if filter_functions and y is None:
            raise ValueError("`y` is required to learn label-based filters")

        # Reset state so that fitting twice does not duplicate feature blocks.
        self.building_blocks = []
        self.featuresSet = []
        for filtFunc in filter_functions:
            filters = self.get_filters(X, y, filtFunc)
            filtered_data = self.get_filtered_data(X, filters)
            features, vectorizers = self.getVectorized3Turns(filtered_data)
            self.building_blocks.append(features_building_block(filters, vectorizers))
            self.featuresSet.append(features)

        # The punctuation vocabulary is learned here, once, so that transform()
        # yields the same columns for every data set.
        self.punctuation_vectorizers_ = None
        if self.punctuationCount:
            punctuation_only = filter_text_allturns(X, ['!','?','.',','])
            _, self.punctuation_vectorizers_ = self.getVectorized3Turns(punctuation_only)

        return self

    def transform(self, X):
        """Return the stacked sparse feature matrix of `X` using the fitted state."""
        features = []
        for build_block in self.building_blocks:
            filtered_data = self.get_filtered_data(X, build_block.filters)
            for idx, cv in enumerate(build_block.vectorizers):
                features.append(cv.transform(filtered_data.iloc[:,idx]))
        if self.punctuationCount:
            punctuation_vectorizers = getattr(self, 'punctuation_vectorizers_', None)
            if punctuation_vectorizers is None:
                raise RuntimeError("FeaturesBuilder must be fitted before transform()")
            filtered_data = filter_text_allturns(X, ['!','?','.',','])
            feats = hstack([
                cv.transform(filtered_data[turn])
                for cv, turn in zip(punctuation_vectorizers, ['turn1', 'turn2', 'turn3'])
            ])
            features.append(feats)
        self.ft = features
        return hstack(features)
            
class features_building_block:
    """Pairs a set of character filters with the vectorizers fitted on the filtered text."""

    def __init__(self, filters, vectorizers):
        self.filters, self.vectorizers = filters, vectorizers
        
        

In [132]:
# Smoke test: upper-case letter counts per turn for the first 30 rows.
features_counters(X.iloc[:30,:],[counter_upperletters])

<30x3 sparse matrix of type '<class 'numpy.int64'>'
	with 67 stored elements in Compressed Sparse Row format>

In [133]:
def features_3turns_setFilters(data, filters):
    """Filter the three turns of `data` by allowed characters and vectorise the result.

    `filters` is either a sequence with one character set per turn, or a
    single collection applied to all turns. A plain set is always treated as
    shared (it cannot be indexed per turn). Note that `getVectorized3Turns`
    fits new vectorizers on the data it is given.
    """
    shared = isinstance(filters, (set, frozenset)) or len(filters) <= 1
    if shared:
        filtered_data = filter_text_allturns(data, filters)
    else:
        filtered_data = filter_text_allturns_3SetFilters(data, filters)

    return getVectorized3Turns(filtered_data)

def get_filters(data, filter_function, separatedFilters=True):
    """Collect the items `filter_function` ranks as most frequent for each label.

    Parameters
    ----------
    data : DataFrame
        Frame with turn1..turn3. If it has a `label` column it is used
        directly; otherwise the notebook-level `train_data` frame supplies
        texts and labels (the frame this function originally read).
    filter_function : callable
        `f(corpus, top)` returning `(item, count)` pairs, e.g. `count_emojis`.
    separatedFilters : bool
        True returns `[set_turn1, set_turn2, set_turn3]`; False returns one
        set merged over all turns.
    """
    labelled = data if 'label' in data.columns else train_data
    top_filters = [
        get_most_frequent_thing_per_turns_per_label(labelled, label, filter_function)
        for label in labelled['label'].unique()
    ]

    turn_names = ['turn1', 'turn2', 'turn3']
    if separatedFilters:
        filters = [set() for _ in turn_names]
        for ranking in top_filters:
            for turn_set, turn in zip(filters, turn_names):
                turn_set.update(item[0] for item in ranking[turn])
    else:
        filters = set()
        for ranking in top_filters:
            for turn in turn_names:
                filters.update(item[0] for item in ranking[turn])

    return filters

def get_features_label_based(data, filter_func,separatedFilters=True):
    """Learn label-based character filters from `data` and return its filtered count features."""
    return features_3turns_setFilters(
        data,
        get_filters(data, filter_func, separatedFilters),
    )

def get_features_for_test_set_label_based(train_data, test_data, filters):
    """Return filtered character-count features for `test_data`.

    `filters` is either a ready-made filter (as returned by `get_filters`) or
    a ranking function, in which case the filters are first learned from
    `train_data`. The previous body ignored `filters` and referenced names
    that were never defined.

    NOTE(review): `features_3turns_setFilters` fits fresh vectorizers on
    `test_data`, so the columns are not guaranteed to line up with features
    built from the training data.
    """
    if callable(filters):
        filters = get_filters(train_data, filters)

    return features_3turns_setFilters(test_data, filters)
        
    
    

In [134]:
nrc_filepath = 'D:\\Machine Learning\\Sentiment Lexicons\\NRC-Sentiment-Emotion-Lexicons\\NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'
# Long format: one (word, emotion, association) row per line, tab-separated, no header.
NRC_lexicon = pd.read_csv(
    nrc_filepath,
    sep='\t',
    names=["word", "emotion", "association"],
)

In [18]:
# Distinct lexicon words, in order of first appearance.
words = NRC_lexicon['word'].unique()

array(['aback', 'abacus', 'abandon', ..., 'zoological', 'zoology', 'zoom'],
      dtype=object)

In [56]:
words = NRC_lexicon.word.unique()
n_emotions = NRC_lexicon.emotion.nunique()

# Group the long-format lexicon once instead of scanning the whole frame for
# every word; row order inside each group is preserved.
associations_by_word = {
    word: group.values
    for word, group in NRC_lexicon.groupby('word', sort=False)['association']
}

# One row per word with a single 1 at the position of its first associated emotion.
# NOTE(review): np.argmax keeps only the first emotion with a positive
# association; confirm that dropping a word's additional emotions is intended.
nrc_word_lexicon = np.zeros((len(words), n_emotions))
for i, word in enumerate(words):
    if i % 1000 == 0:
        print('=======' + str(i) + '======')
    word_values = associations_by_word.get(word)
    if word_values is not None and np.sum(word_values) > 0:
        nrc_word_lexicon[i, np.argmax(word_values)] = 1
nrc_word_lexicon



array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [115]:
# Single-column frame holding the lexicon words.
words_df = pd.DataFrame(data=words, columns=['word'])

In [117]:
# One column per emotion, named after the emotion labels of the long-format lexicon.
emotions_df = pd.DataFrame(nrc_word_lexicon,columns=NRC_lexicon.emotion.unique())

In [118]:
# Wide lexicon: the word column followed by one indicator column per emotion.
NRC_lexicon_df = pd.concat(objs=[words_df, emotions_df], axis=1)

In [119]:
# Preview of the wide-format lexicon.
NRC_lexicon_df.head()

Unnamed: 0,word,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
0,aback,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,abacus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,abandon,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,abandoned,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,abandonment,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [86]:
nrc_save_filepath = 'D:\\Machine Learning\\Sentiment Lexicons\\NRC-Sentiment-Emotion-Lexicons\\NRC_Em_Lex_0.92_DF.csv'
# Persist the wide-format lexicon (without the positional index) for later reuse.
NRC_lexicon_df.to_csv(path_or_buf=nrc_save_filepath, index=False)

In [123]:
# Sanity check: a single word selects exactly one row of emotion indicators.
NRC_lexicon_df[NRC_lexicon_df.word == 'abacus'].iloc[:,1:].values.shape

(1, 10)

In [136]:
# Shared NLTK tweet tokenizer instance.
tw_tokenizer = TweetTokenizer()

In [135]:
class LexiconFeatureBuilder():
    """Sums per-word lexicon scores over the tokens of each conversation turn.

    Parameters
    ----------
    lexicon : DataFrame
        Must have a `word` column; every column after the first is treated as
        a numeric score column.
    tokenizer : object with a `tokenize(text)` method, optional
        Defaults to a new `TweetTokenizer()`.
    """

    def __init__(self, lexicon, tokenizer=None):
        self.lexicon = lexicon
        # Created here rather than in the signature so that the default is not
        # built at class-definition time and shared between instances.
        self.tokenizer = TweetTokenizer() if tokenizer is None else tokenizer

    def _get_lookup(self):
        """Return a dict mapping lexicon word -> score vector, rebuilt if `lexicon` was replaced."""
        if getattr(self, '_lookup_source', None) is not self.lexicon:
            scores = self.lexicon.iloc[:, 1:].values.astype(float)
            # If a word is listed more than once, its last row wins.
            self._lookup = dict(zip(self.lexicon.word.values, scores))
            self._lookup_source = self.lexicon
        return self._lookup

    def _emotion_by_row(self, row):
        """Return the summed score vector (1-D float array) of the tokens in `row`.

        Tokens are lower-cased before the lookup; tokens missing from the
        lexicon contribute nothing.
        """
        lookup = self._get_lookup()
        row_emotion = np.zeros(self.lexicon.shape[1] - 1)
        for token in self.tokenizer.tokenize(row):
            # The same lower-cased form is used for both the membership test
            # and the lookup (they previously disagreed for capitalised tokens).
            scores = lookup.get(token.lower())
            if scores is not None:
                row_emotion += scores
        return row_emotion

    def _emotion_by_turn(self, turn):
        """Apply `_emotion_by_row` to every text of the Series `turn`."""
        return turn.apply(lambda row : self._emotion_by_row(row))

    def transform(self, data, returnSparse=True):
        """Return the score vectors of every column of `data`, stacked side by side.

        The result has shape (n_rows, n_columns * n_scores); it is a CSC
        matrix when `returnSparse` is true, otherwise a dense array.
        """
        emo_turns = []
        for col_idx in range(data.shape[1]):
            emo_turns.append(self._emotion_by_turn(data.iloc[:,col_idx]))
        result =np.hstack([np.stack(turn.values) for turn in emo_turns])
        if returnSparse:
            return  sparse.csc_matrix(result)
        else:
            return result

In [480]:
# Smoke test of the NRC lexicon features on the first five rows.
NRC_Lex_Fe_Buider = LexiconFeatureBuilder(NRC_lexicon_df, TweetTokenizer())
NRC_Lex_Fe_Buider.transform(X.head(5))

<5x30 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Column format>

In [137]:
depem_filepath = r'D:\Machine Learning\Sentiment Lexicons\DepecheMood-master\DepecheMood_english_lemma_full.tsv'
# Tab-separated lexicon file with a header row.
DM_lexicon = pd.read_csv(filepath_or_buffer=depem_filepath, sep='\t')

In [138]:
# Lower-case the column names; LexiconFeatureBuilder reads the `word` column by that name.
DM_lexicon.columns = DM_lexicon.columns.str.lower()

In [479]:
# Smoke test of the DepecheMood lexicon features on the first five rows.
DM_Lex_Fe_Buider = LexiconFeatureBuilder(DM_lexicon, TweetTokenizer())
DM_Lex_Fe_Buider.transform(X.head(5))

<5x27 sparse matrix of type '<class 'numpy.float64'>'
	with 99 stored elements in Compressed Sparse Column format>

In [139]:
class Lemmatization():
    """POS-aware WordNet lemmatisation of text columns.

    Parameters
    ----------
    tokenizer : object with `tokenize(text)`, optional
        Defaults to a new `TweetTokenizer()`.
    wn_lema : object with `lemmatize(word, pos)`, optional
        Defaults to a new `WordNetLemmatizer()`.
    """

    def __init__(self, tokenizer=None, wn_lema=None):
        # Defaults are created per instance instead of once in the signature,
        # where they were built at class-definition time and shared.
        self.tokenizer = TweetTokenizer() if tokenizer is None else tokenizer
        self.wn_lema = WordNetLemmatizer() if wn_lema is None else wn_lema

    def word_lemmatization(self, word, pos_tag):
        """Lemmatise `word` given its Treebank `pos_tag`; unmapped tags return the word unchanged."""
        wordnet_pos = self.get_wordnet_pos(pos_tag)
        if wordnet_pos == '':
            return word
        return self.wn_lema.lemmatize(word, wordnet_pos)

    # https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
    def get_wordnet_pos(self, treebank_tag):
        """Map a Treebank tag to the WordNet POS constant, or '' when there is none."""
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return ''

    def row_lemmatization(self, row):
        """Tokenise and POS-tag `row`, then return the lemmas joined by single spaces."""
        tokens = self.tokenizer.tokenize(row)
        pos_tags = nltk.pos_tag(tokens)
        return ' '.join([self.word_lemmatization(word,pos_tag) for (word, pos_tag) in pos_tags])

    def turn_lemmatization(self, turn):
        """Apply `row_lemmatization` to every text of the Series `turn`."""
        return turn.apply(lambda row : self.row_lemmatization(row))

    def lemmatize_data(self, data):
        """Return a copy of `data` with every column lemmatised; `data` is not modified."""
        data_cpy = data.copy()
        for column in data.columns:
            data_cpy[column] = self.turn_lemmatization(data[column])
        return data_cpy
        

In [350]:
# Smoke test of the lemmatiser on the first five rows.
lematizator = Lemmatization()
lematizator.lemmatize_data(X.head(5))

Unnamed: 0_level_0,turn1,turn2,turn3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Don't worry I'm girl,hmm how do I know if you be,What's ur name ?
1,When do I ?,saw many time i think - _ -,No . I never saw you
2,By,by Google Chrome,Where you live
3,U r ridiculous,I might be ridiculous but I be tell the truth .,U little disgusting whore
4,Just for time pas,wt do u do 4 a living then,Maybe


In [140]:
class TokenReplacer():
    """Replaces tokens according to a lookup table (e.g. contraction expansion).

    Parameters
    ----------
    replaces : dict
        Maps a lower-case token to its replacement text.
    tokenizer : object with `tokenize(text)`, optional
        Defaults to a new `TweetTokenizer()`.
    """

    def __init__(self, replaces, tokenizer=None):
        # Created per instance rather than once in the signature.
        self.tokenizer = TweetTokenizer() if tokenizer is None else tokenizer
        self.replaces = replaces

    def word_replacer(self, word):
        """Return the replacement of `word`, or `word` itself when it has none."""
        if word in self.replaces:
            return self.replaces[word]
        return word

    def row_rep(self, row):
        """Tokenise `row`, lower-case and replace each token, and join with single spaces."""
        tokens = self.tokenizer.tokenize(row)
        return ' '.join(self.word_replacer(token.lower()) for token in tokens)

    def turn_rep(self, turn):
        """Apply `row_rep` to every text of the Series `turn`."""
        return turn.apply(lambda row : self.row_rep(row))

    def replace_data(self, data):
        """Return a copy of `data` with every column processed by `turn_rep`."""
        data_cpy = data.copy()
        for column in data.columns:
            data_cpy[column] = self.turn_rep(data[column])
        return data_cpy

In [141]:
# Smoke test: expand contractions only (APPO table) on the first five rows.
tr = TokenReplacer(APPO)
tr.replace_data(X.head(5))

Unnamed: 0_level_0,turn1,turn2,turn3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,do not worry I am girl,hmm how do i know if you are,what is ur name ?
1,when did i ?,saw many times i think - _ -,no . i never saw you
2,by,by google chrome,where you live
3,u r ridiculous,i might be ridiculous but i am telling the tru...,u little disgusting whore
4,just for time pass,wt do u do 4 a living then,maybe


In [142]:
def text_data_to_lowercase(data):
    """Return a copy of `data` with the text of every column lower-cased."""
    lowered = data.copy()
    for name, series in data.items():
        lowered[name] = series.apply(lambda text: text.lower())
    return lowered

In [414]:
# Smoke test on the first five rows.
text_data_to_lowercase(X.head(5))

Unnamed: 0_level_0,turn1,turn2,turn3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,don't worry i'm girl,hmm how do i know if you are,what's ur name?
1,when did i?,saw many times i think -_-,no. i never saw you
2,by,by google chrome,where you live
3,u r ridiculous,i might be ridiculous but i am telling the truth.,u little disgusting whore
4,just for time pass,wt do u do 4 a living then,maybe


In [143]:
class PosTagsCounter(BaseEstimator, TransformerMixin):
    """Bag-of-POS-tags features, vectorised separately for every column.

    Each text is tokenised, tagged with `nltk.pos_tag`, and replaced by the
    space-joined tag sequence; one word-level CountVectorizer per column is
    then fitted on those tag strings.

    Parameters
    ----------
    tokenizer : object with `tokenize(text)`, optional
        Defaults to `RegexpTokenizer(r'[a-zA-Z][^\\s]*\\b')`.

    Attributes
    ----------
    cvs : list of CountVectorizer
        One fitted vectorizer per column, in column order (empty before `fit`).
    """

    def __init__(self, tokenizer=None):
        # Built per instance rather than once in the signature.
        self.tokenizer = RegexpTokenizer(r'[a-zA-Z][^\s]*\b') if tokenizer is None else tokenizer
        self.cvs = []

    def pos_tag_row(self,row):
        """Return the POS tags of the tokens of `row`, joined by single spaces."""
        tokens = self.tokenizer.tokenize(row)
        pos_tags = nltk.pos_tag(tokens)
        return ' '.join([pos_tag for (word, pos_tag) in pos_tags])

    def pos_tag_turn(self, turn):
        """Apply `pos_tag_row` to every text of the Series `turn`."""
        return turn.apply(lambda row: self.pos_tag_row(row))

    def pos_tag_data(self, data):
        """Return a copy of `data` in which every text is replaced by its tag sequence."""
        data_cpy = data.copy()
        for turn in data.columns:
            data_cpy[turn] = self.pos_tag_turn(data[turn])
        return data_cpy

    def fit_vectorizers(self, data):
        """Fit one word-level CountVectorizer per column of the tag frame `data`."""
        self.cvs = []
        for column in data.columns:
            cv = CountVectorizer(analyzer = 'word',ngram_range=(1,1))
            cv.fit(data[column])
            self.cvs.append(cv)
        return self.cvs

    def fit(self, data, y=None):
        """Learn the per-column tag vocabularies from `data`.

        `y` is accepted and ignored so that `fit_transform(X, y)` and
        scikit-learn pipelines, which pass the target to `fit`, work.
        """
        data_pos = self.pos_tag_data(data)
        self.fit_vectorizers(data_pos)
        return self

    def transform(self, data):
        """Return the horizontally stacked tag-count matrices of `data`."""
        if not self.cvs:
            raise RuntimeError("PosTagsCounter must be fitted before transform()")
        data_pos = self.pos_tag_data(data)
        vect_columns = []
        for cv, column in zip(self.cvs,data.columns):
            vect_columns.append(cv.transform(data_pos[column]))
        return hstack(vect_columns)

In [379]:
# Smoke test: fit and transform the POS-tag counter on the first five rows.
posTagsCounter = PosTagsCounter(RegexpTokenizer(r'[a-zA-Z][^\s]*\b'))
posTagsCounter.fit(X.head(5))
posTagsCounter.transform(X.head(5))

<5x49 sparse matrix of type '<class 'numpy.int64'>'
	with 51 stored elements in COOrdinate format>

In [446]:
# Inspect the per-row score vectors of turn1 for the first two rows.
NRC_Lex_Fe_Buider._emotion_by_turn(X.head(2).turn1)

array([array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]),
       array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])], dtype=object)

In [452]:
# Rebuild the NRC feature builder and show the dense features of the first two rows.
NRC_Lex_Fe_Buider = LexiconFeatureBuilder(NRC_lexicon_df, TweetTokenizer())
NRC_Lex_Fe_Buider.transform(X.head(2)).toarray()

id
0    [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: turn1, dtype: object

In [454]:
# Dense NRC features of the first two rows (all columns of X side by side).
NRC_Lex_Fe_Buider.transform(X.head(2)).toarray()

array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [449]:
# Check that the per-row vectors stack into a sparse (rows x scores) matrix.
sparse.csr_matrix(np.stack(NRC_Lex_Fe_Buider._emotion_by_turn(X.head(2).turn1).values))

<2x10 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [221]:
# Same stacking as a dense array, for visual inspection.
np.stack(NRC_Lex_Fe_Buider._emotion_by_turn(X.head(2).turn1).values)

array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

# Feature Extraction ==================

In [153]:
# One feature-name -> sparse-matrix dictionary per data split.
train_feats, dev_feats, test_feats = {}, {}, {}

Counter ---

In [154]:
# Emoji features: fitted on the training split, then applied to dev and test.
fb_emoji = FeaturesBuilder([count_emojis])
train_feats['emojis_features'] = fb_emoji.fit_transform(X, y)
for split_feats, split_frame in ((dev_feats, dev_data[X.columns]), (test_feats, test)):
    split_feats['emojis_features'] = fb_emoji.transform(split_frame)

In [156]:
# Punctuation count features: fitted on the training split, then applied to dev and test.
fb_punct = FeaturesBuilder(punctuationCount=True)
train_feats['punctuation_features'] = fb_punct.fit_transform(X, y)
for split_feats, split_frame in ((dev_feats, dev_data[X.columns]), (test_feats, test)):
    split_feats['punctuation_features'] = fb_punct.transform(split_frame)

counter_upperletters counter_upperletters_per_word counter_words  
counter_chars counter_avg_word_length counter_stopwords

In [157]:
# Upper-case letters per word, for every split.
for split_feats, split_frame in ((train_feats, X), (dev_feats, dev_data[X.columns]), (test_feats, test)):
    split_feats['upperletters_pword__cnt_features'] = features_counters(split_frame, [counter_upperletters_per_word])

In [158]:
# Upper-case letter counts, for every split.
for split_feats, split_frame in ((train_feats, X), (dev_feats, dev_data[X.columns]), (test_feats, test)):
    split_feats['upperletters_cnt_features'] = features_counters(split_frame, [counter_upperletters])

In [159]:
# Word counts, for every split.
for split_feats, split_frame in ((train_feats, X), (dev_feats, dev_data[X.columns]), (test_feats, test)):
    split_feats['words_cnt_features'] = features_counters(split_frame, [counter_words])

In [160]:
# Character counts, for every split.
for split_feats, split_frame in ((train_feats, X), (dev_feats, dev_data[X.columns]), (test_feats, test)):
    split_feats['chars_cnt_features'] = features_counters(split_frame, [counter_chars])

In [161]:
# Average word length, for every split.
for split_feats, split_frame in ((train_feats, X), (dev_feats, dev_data[X.columns]), (test_feats, test)):
    split_feats['avg_word_leng_cnt_features'] = features_counters(split_frame, [counter_avg_word_length])

In [168]:
# Emoticon counts per mood and turn, for every split.
emo_counter = EmoticonsCounter(mood_emoticons=[
    happy_emoticons,
    sad_emoticons,
    angry_emoticons,
    afraid_suprised_emoticons,
    sleepy_emoticons,
])

for split_feats, split_frame in ((train_feats, X), (dev_feats, dev_data[X.columns]), (test_feats, test)):
    split_feats['emoticons_count'] = emo_counter.get_emo_count_per_turn(split_frame)

In [162]:
# Stop-word counts on the raw text, for every split.
for split_feats, split_frame in ((train_feats, X), (dev_feats, dev_data[X.columns]), (test_feats, test)):
    split_feats['stopwords_cnt_features'] = features_counters(split_frame, [counter_stopwords])

In [551]:
# Replacer using the extended table (contractions plus chat abbreviations).
tr = TokenReplacer(replaces)

In [163]:
# Stop-word counts after token replacement, for every split.
for split_feats, split_frame in ((train_feats, X), (dev_feats, dev_data[X.columns]), (test_feats, test)):
    split_feats['stopwords_cnt_R_features'] = features_counters(tr.replace_data(split_frame), [counter_stopwords])

Apostrophe

In [164]:
# Token-replaced versions of the three splits.
X_repld, dev_repld, test_repld = (
    tr.replace_data(split_frame)
    for split_frame in (X, dev_data[X.columns], test)
)

POS Tags

In [165]:
# POS-tag counts: vocabulary fitted on the training split, then applied to all splits.
posTagsCounter = PosTagsCounter(RegexpTokenizer(r'[a-zA-Z][^\s]*\b'))
posTagsCounter.fit(X_repld)
for split_feats, split_frame in ((train_feats, X_repld), (dev_feats, dev_repld), (test_feats, test_repld)):
    split_feats['pos_tag_features'] = posTagsCounter.transform(split_frame)

In [169]:
# Inspect the POS-tag counts of the third training row.
train_feats['pos_tag_features'].toarray()[2]

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1], dtype=int64)

Lower, Lemmatization

In [170]:
# Lower-cased versions of the token-replaced splits.
X_repld_low, dev_repld_low, test_repld_low = (
    text_data_to_lowercase(split_frame)
    for split_frame in (X_repld, dev_repld, test_repld)
)

In [171]:
# Lemmatised versions of the lower-cased splits.
lemmatizor = Lemmatization()
X_repld_low_lemmatized, dev_repld_low_lemmatized, test_repld_low_lemmatized = (
    lemmatizor.lemmatize_data(split_frame)
    for split_frame in (X_repld_low, dev_repld_low, test_repld_low)
)

In [172]:
# Preview of the fully preprocessed training text.
X_repld_low_lemmatized.head()

Unnamed: 0_level_0,turn1,turn2,turn3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,do not worry i be girl,hmm how do i know if you be,what be ur name ?
1,when do i ?,saw many time i think - _ -,no . i never saw you
2,by,by google chrome,where you live
3,u r ridiculous,i might be ridiculous but i be tell the truth .,u little disgusting whore
4,just for time pas,wt do u do 4 a living then,maybe


In [None]:
# Lexicons

In [173]:
nrc_df_filepath = 'D:\\Machine Learning\\Sentiment Lexicons\\NRC-Sentiment-Emotion-Lexicons\\NRC_Em_Lex_0.92_DF.csv'
# Reload the wide-format lexicon saved earlier; this rebinds NRC_lexicon,
# which previously held the long-format table.
NRC_lexicon = pd.read_csv(filepath_or_buffer=nrc_df_filepath)

In [174]:
# Preview of the reloaded wide-format lexicon.
NRC_lexicon.head()

Unnamed: 0,word,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
0,aback,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,abacus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,abandon,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,abandoned,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,abandonment,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [175]:
# NRC lexicon features on the preprocessed text, for every split.
lex_nrc = LexiconFeatureBuilder(NRC_lexicon)
for split_feats, split_frame in (
    (train_feats, X_repld_low_lemmatized),
    (dev_feats, dev_repld_low_lemmatized),
    (test_feats, test_repld_low_lemmatized),
):
    split_feats['nrc_features'] = lex_nrc.transform(split_frame)

In [176]:
# DepecheMood lexicon features on the preprocessed text, for every split.
depem_filepath = r'D:\Machine Learning\Sentiment Lexicons\DepecheMood-master\DepecheMood_english_lemma_full.tsv'
DM_lexicon = pd.read_csv(filepath_or_buffer=depem_filepath, sep='\t')
DM_lexicon.columns = DM_lexicon.columns.str.lower()
DM_Lex_Fe_Buider = LexiconFeatureBuilder(DM_lexicon)
for split_feats, split_frame in (
    (train_feats, X_repld_low_lemmatized),
    (dev_feats, dev_repld_low_lemmatized),
    (test_feats, test_repld_low_lemmatized),
):
    split_feats['dm_lex_features'] = DM_Lex_Fe_Buider.transform(split_frame)

In [601]:
# Inspect one row of the DepecheMood features. The bare name `dm_lex_features`
# is never defined in this notebook; the matrix lives in the feature dict.
# NOTE(review): assumes the training split was the one being inspected.
train_feats['dm_lex_features'].toarray()[10]

array([7.96370131e-02, 2.97096977e-01, 2.05889902e-01, 2.73493851e-01,
       5.31608850e-01, 1.33226787e-01, 3.30327503e-01, 1.48719117e-01,
       1.18200000e+03, 1.02282570e-01, 2.86147497e-01, 1.91042270e-01,
       2.59537121e-01, 5.18422176e-01, 1.69610306e-01, 3.22170628e-01,
       1.50787432e-01, 3.42160000e+04, 6.19320432e-02, 1.94036473e-01,
       1.09107406e-01, 1.58131801e-01, 1.39135689e-01, 8.85433262e-02,
       1.85246256e-01, 6.38670058e-02, 2.17600000e+03])

In [186]:
def save_feats(feats_dict, base_dir, folder):
    """Write the feature matrices of one data split to `<base_dir>/<folder>/*.npz`.

    Parameters
    ----------
    feats_dict : dict
        Maps feature name to a SciPy sparse matrix; must contain every key
        listed in `files` below.
    base_dir : str
        Directory that holds the per-split folders.
    folder : str
        Name of the split folder; created (with parents) when missing.

    Raises
    ------
    KeyError
        If a listed feature is missing from `feats_dict`.
    """
    # (file name, key in feats_dict). File names are kept as before; several
    # differ from the dictionary keys.
    # NOTE(review): 'words_cnt_features' and 'stopwords_cnt_features' are
    # computed elsewhere in the notebook but are not written here - confirm intended.
    files = (
        ('emoticons_count.npz', 'emoticons_count'),
        ('nrc_lexicon_feats.npz', 'nrc_features'),
        ('dm_lexicon_feats.npz', 'dm_lex_features'),
        ('pos_tag_features.npz', 'pos_tag_features'),
        ('emoji_feats.npz', 'emojis_features'),
        ('punctuation_feats.npz', 'punctuation_features'),
        ('upperletters_pword__cnt_features.npz', 'upperletters_pword__cnt_features'),
        ('upperletters_cnt_features.npz', 'upperletters_cnt_features'),
        ('chars_cnt_features.npz', 'chars_cnt_features'),
        ('avg_word_leng_cnt_features.npz', 'avg_word_leng_cnt_features'),
        ('stopwords_cnt_R_features.npz', 'stopwords_cnt_R_features'),
    )

    features_path = os.path.join(base_dir, folder)
    # makedirs also creates missing parents and tolerates an existing folder.
    os.makedirs(features_path, exist_ok=True)
    for file_name, key in files:
        # os.path.join keeps the paths valid on every platform, unlike a
        # hard-coded backslash separator.
        sparse.save_npz(os.path.join(features_path, file_name), feats_dict[key])

In [187]:
features_path = 'D:\\Machine Learning\\Datasets\\EmoContext\\Features\\'
# One sub-folder per split.
for split_feats, split_folder in ((train_feats, 'train'), (dev_feats, 'dev'), (test_feats, 'test')):
    save_feats(split_feats, features_path, split_folder)

In [185]:
# Persist the preprocessed text of every split (written without the index).
for split_frame, file_name in (
    (X_repld_low_lemmatized, 'X_repld_low_lemmatized.csv'),
    (dev_repld_low_lemmatized, 'dev_repld_low_lemmatized.csv'),
    (test_repld_low_lemmatized, 'test_repld_low_lemmatized.csv'),
):
    split_frame.to_csv(features_path+file_name, index=False)

In [606]:
nrc_save_filepath = 'D:\\Machine Learning\\Sentiment Lexicons\\NRC-Sentiment-Emotion-Lexicons\\NRC_Em_Lex_0.92_DF.csv'
# Write the wide-format lexicon again (same target file as the earlier save cell).
NRC_lexicon_df.to_csv(path_or_buf=nrc_save_filepath, index=False)