In [1]:
# Imports
import os
import sys
import pandas as pd
import nltk
import string
import numpy as np
import re
from collections import Counter
import string

In [2]:
# Define path
# The data directory can be overridden through the REDDIT_DATA_DIR environment
# variable; it falls back to the original local path, so existing setups keep
# working unchanged while the notebook becomes runnable on other machines.
DATA_DIR = os.environ.get('REDDIT_DATA_DIR', 'D:/TU_Graz/Thesis/Datasets/Reddit_preprocessed')
os.chdir(DATA_DIR)

# Read datasets (file names are resolved relative to DATA_DIR because of the chdir above)
submissions = pd.read_csv("submissions_simplified.csv")
submissions_tokenized = pd.read_csv("submissions_tokenized_v2.csv") # Tokenized with removed punctuation
#submissions_tokenized = pd.read_csv("submissions_tokenized.csv") # Tokenized without removed punctuation

In [3]:
class SentenceGetter(object):
    """Group a token-level DataFrame into per-sentence lists of tuples.

    The input frame must provide the columns ``Sentence``, ``Words``,
    ``POS_tag`` and ``Chunk_tag``. Rows sharing the same ``Sentence`` value
    are collected into one list of ``(word, pos_tag, chunk_tag)`` tuples.

    Attributes:
        n_sent: 1-based counter used by ``get_next`` to build the label
            ``"Sentence <n_sent>"`` of the next sentence to return.
        data: the DataFrame passed to the constructor.
        empty: always ``False``; kept for backward compatibility.
        grouped: Series indexed by sentence label, one tuple list per label.
        sentences: the tuple lists of ``grouped`` as a plain Python list.

    Note: ``groupby`` sorts the group keys, so ``sentences`` follows the
    sorted label order (for string labels that is lexicographic, i.e.
    "Sentence 10" comes before "Sentence 2").
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # One (word, POS tag, chunk tag) tuple per row of the group.
            return list(zip(s["Words"].values.tolist(),
                            s["POS_tag"].values.tolist(),
                            s["Chunk_tag"].values.tolist()))

        self.grouped = self.data.groupby("Sentence").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next sentence by label, or ``None`` when there is none.

        The label looked up is ``"Sentence <n_sent>"``; the counter only
        advances after a successful lookup.
        """
        try:
            s = self.grouped["Sentence {}".format(self.n_sent)]
        except KeyError:
            # Only a missing label means "no more sentences"; any other error
            # (e.g. a corrupted ``grouped`` attribute) must not be hidden.
            return None
        self.n_sent += 1
        return s

In [4]:
# Split every grouped sentence into its token list and its POS-tag list.
getter = SentenceGetter(submissions_tokenized)
sentences = [[token for token, _pos, _chunk in sent] for sent in getter.sentences]
labels = [[pos for _token, pos, _chunk in sent] for sent in getter.sentences]

In [5]:
from nltk import pos_tag
from nltk import RegexpParser
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.sentiment import SentimentIntensityAnalyzer

import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the spaCy `en_core_web_sm` pipeline once; `nlp` is the module-level
# object that spacy_feats_sentence() calls on each joined sentence.
nlp = en_core_web_sm.load()

All the token-level features we can extract with spaCy (see the API reference):
https://spacy.io/api/doc

In [6]:
def spacy_feats_sentence(index, sentence):
    """Build a token-level spaCy feature table for one sentence.

    The tokens are joined with single spaces and run through the
    module-level ``nlp`` pipeline, so the resulting spaCy tokens may differ
    from the input tokens.

    Args:
        index: sentence number; stored in every row as ``"Sentence <index>"``.
        sentence: iterable of token strings.

    Returns:
        DataFrame with one row per spaCy token and the 27 columns listed in
        ``columns`` below. An empty sentence yields an empty frame that still
        has all columns. The index is a fresh 0..n-1 range.
    """
    columns = ['Sentence', 'Token_index', 'Token', 'POS_tag', 'NER_tag', 'NER_iob', 'lemma', 'norm', 'shape',
               'is_digit', 'is_ascii', 'is_alpha', 'is_punct', 'is_left_punct', 'is_right_punct',
               'is_bracket', 'is_quote', 'stopword', 'dependency', 'language', 'log_prob', 'sent',
               'is_lower', 'is_upper', 'like_num', 'out_of_vocab', 'tensor']
    # Named `sentence_label` rather than `string` so the imported `string`
    # module is not shadowed.
    sentence_label = "Sentence " + str(index)
    doc = nlp(" ".join(sentence))

    # Collect plain rows and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic.
    rows = [
        [sentence_label, X.i, X.text, X.tag_, X.ent_type_, X.ent_iob_, X.lemma_, X.norm_, X.shape_,
         X.is_digit, X.is_ascii, X.is_alpha, X.is_punct, X.is_left_punct, X.is_right_punct,
         X.is_bracket, X.is_quote, X.is_stop, X.dep_, X.lang_, X.prob, X.sentiment, X.is_lower,
         X.is_upper, X.like_num, X.is_oov, X.tensor]
        for X in doc
    ]
    return pd.DataFrame(rows, columns=columns)

def spacy_feats_all(sentences):
    """Build the token-level spaCy feature table for a list of sentences.

    Sentence ``i`` (0-based position in ``sentences``) is labelled
    ``"Sentence i"`` by ``spacy_feats_sentence``.

    Args:
        sentences: sequence of token lists.

    Returns:
        DataFrame with the per-sentence tables stacked in input order, the
        27 columns listed in ``columns`` and a fresh 0..n-1 index. With no
        sentences (or only empty ones) an empty frame with those columns is
        returned.
    """
    columns = ['Sentence', 'Token_index', 'Token', 'POS_tag', 'NER_tag', 'NER_iob', 'lemma', 'norm', 'shape',
               'is_digit', 'is_ascii', 'is_alpha', 'is_punct', 'is_left_punct', 'is_right_punct',
               'is_bracket', 'is_quote', 'stopword', 'dependency', 'language', 'log_prob', 'sent',
               'is_lower', 'is_upper', 'like_num', 'out_of_vocab', 'tensor']

    # Gather all per-sentence frames first and concatenate once:
    # DataFrame.append was removed in pandas 2.0 and re-copies the whole
    # accumulated frame on every call.
    frames = [spacy_feats_sentence(i, sentence) for i, sentence in enumerate(sentences)]
    # Frames without rows carry no information and only disturb dtype inference.
    frames = [frame for frame in frames if len(frame) > 0]
    if not frames:
        return pd.DataFrame(columns=columns)

    data = pd.concat(frames, ignore_index=True)
    data.columns = columns
    return data

In [7]:
# Token-level spaCy feature table for all sentences; later cells read its
# "Sentence" and "Token" columns.
tmp = spacy_feats_all(sentences)

Various sentiment features computed with the pretrained VADER sentiment analyzer (`SentimentIntensityAnalyzer`) from NLTK.

In [8]:
def sentiment_feats(sentence, words, analyzer=None):
    """Compute word-level sentiment features for one sentence.

    Args:
        sentence: iterable of token strings; joined with spaces and scored as
            a whole to obtain the sentence compound score.
        words: the tokens to score individually (a pandas Series or any
            iterable of strings); one output row is produced per word.
        analyzer: optional object with a ``polarity_scores(text)`` method
            returning a dict with the keys ``neg``, ``neu``, ``pos`` and
            ``compound``. When omitted a new ``SentimentIntensityAnalyzer``
            is created, as before; passing one in avoids rebuilding it on
            every call.

    Returns:
        DataFrame with one row per word and the columns ``Sentence_sent``,
        ``Neg_sent_score``, ``Neu_sent_score``, ``Pos_sent_score``,
        ``Sent_score``, ``word_to_sentence_sent_ratio``, ``prev_word_sent``
        and ``next_word_sent``. The first and last word get 0 for the missing
        neighbour. An empty ``words`` yields an empty frame with these columns.
    """
    columns = ["Sentence_sent", "Neg_sent_score", "Neu_sent_score", "Pos_sent_score", "Sent_score",
               "word_to_sentence_sent_ratio", "prev_word_sent", "next_word_sent"]
    words = list(words)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()

    # Read scores by key rather than by unpacking .values(), which silently
    # depends on the dict's insertion order.
    s_compound_score = analyzer.polarity_scores(" ".join(sentence))["compound"]
    if s_compound_score == 0:
        # Guard against division by zero in the ratio below. As in the
        # original implementation, the substituted value is also what ends
        # up in the "Sentence_sent" column.
        s_compound_score = 0.0001

    # Score every word exactly once; neighbours are looked up from this list
    # instead of being re-scored for each position.
    word_scores = [analyzer.polarity_scores(word) for word in words]
    compounds = [scores["compound"] for scores in word_scores]

    rows = []
    last = len(words) - 1
    for i, scores in enumerate(word_scores):
        predecessor_sent = compounds[i - 1] if i > 0 else 0
        successor_sent = compounds[i + 1] if i < last else 0
        rows.append([s_compound_score, scores["neg"], scores["neu"], scores["pos"], scores["compound"],
                     scores["compound"] / s_compound_score, predecessor_sent, successor_sent])
    return pd.DataFrame(rows, columns=columns)

def sentiment_feats_all(sentences, df_subset):
    """Compute word-level sentiment features for every sentence.

    Args:
        sentences: sequence of token lists; sentence ``i`` (0-based) is
            matched against the rows of ``df_subset`` whose ``Sentence``
            column equals ``"Sentence i"``.
        df_subset: DataFrame with at least the columns ``Sentence`` and
            ``Token``.

    Returns:
        DataFrame with the per-sentence results of ``sentiment_feats``
        stacked in input order, a fresh 0..n-1 index and the eight columns
        listed in ``columns``. Returns an empty frame with those columns when
        there is nothing to score.
    """
    # NOTE(review): the first column is named "Sentence sent" here while
    # sentiment_feats() names it "Sentence_sent". The original name is kept
    # so downstream consumers are not broken; confirm which one is intended.
    columns = ["Sentence sent", "Neg_sent_score", "Neu_sent_score", "Pos_sent_score", "Sent_score",
               "word_to_sentence_sent_ratio", "prev_word_sent", "next_word_sent"]

    # Split the token column by sentence label once instead of scanning the
    # whole table with a boolean mask for every sentence.
    tokens_by_sentence = {label: group["Token"]
                          for label, group in df_subset.groupby("Sentence", sort=False)}
    no_tokens = df_subset["Token"].iloc[0:0]

    frames = []
    for i, sentence in enumerate(sentences):
        words_curr_sent = tokens_by_sentence.get("Sentence " + str(i), no_tokens)
        frames.append(sentiment_feats(sentence, words_curr_sent))

    # DataFrame.append was removed in pandas 2.0: concatenate once instead.
    frames = [frame for frame in frames if len(frame) > 0]
    if not frames:
        return pd.DataFrame(columns=columns)
    feats = pd.concat(frames, ignore_index=True)
    feats.columns = columns
    return feats

In [9]:
# Word-level sentiment features, computed from the spaCy tokens stored in `tmp`.
feats = sentiment_feats_all(sentences, tmp[["Sentence", "Token"]])

Extract the most common bigrams and trigrams that occur throughout the text corpus.

In [10]:
def extract_bigrams_trigrams(sentences, top_n=30):
    """Return the most frequent lower-cased bigrams and trigrams of a corpus.

    N-grams are built per sentence, so they never span a sentence boundary.

    Args:
        sentences: iterable of token lists (tokens must be strings).
        top_n: how many of the most frequent n-grams to return for each
            order. Defaults to 30, the previously hard-coded value.

    Returns:
        Tuple ``(bigrams, trigrams)``; each element is a list of
        ``(ngram_tuple, count)`` pairs sorted by descending count, with ties
        kept in order of first occurrence.
    """
    bg_cnt = Counter()
    tg_cnt = Counter()
    for sentence in sentences:
        lower_sent = [s.lower() for s in sentence]
        # A Counter counts directly; the intermediate FreqDist objects and the
        # item-by-item copy into a second Counter were redundant.
        bg_cnt.update(nltk.bigrams(lower_sent))
        tg_cnt.update(nltk.trigrams(lower_sent))
    return bg_cnt.most_common(top_n), tg_cnt.most_common(top_n)

In [11]:
bigrams, trigrams = extract_bigrams_trigrams(sentences)

In [169]:
def tf_idf_ngrams(ngrams, words):
    """Work-in-progress n-gram feature builder for one sentence.

    Args:
        ngrams: list of ``(ngram_tuple, count)`` pairs; the n-gram order is
            taken from the length of the first tuple, so an empty list raises
            IndexError.
        words: object with a ``tolist()`` method holding the sentence tokens
            (the caller passes a pandas Series).

    Returns:
        None. The function only prints ``len(words)``; the structures it
        builds (``feats``, ``is_in_sent``, ``is_predecessor``) are discarded.

    NOTE(review): despite the name, no TF-IDF weighting is computed here;
    ``ngram_counts`` is assigned but never read.
    """
    feats = pd.DataFrame()
    n = len(ngrams[0][0])
    ngram_tuples = [ngram[0] for ngram in ngrams]
    ngram_counts = [ngram[1] for ngram in ngrams]
    words = words.tolist()
    is_in_sent = []
    is_predecessor = []
    
    # Build the n-grams of the current sentence.
    # NOTE(review): for n other than 2 or 3 `curr_sent_ngrams` is never bound,
    # so the membership loop below fails with an unbound-local error.
    if n == 2:
        curr_sent_ngrams = nltk.bigrams(words)
        curr_sent_ngrams = [ngram for ngram in curr_sent_ngrams]
    elif n == 3:
        curr_sent_ngrams = nltk.trigrams(words)
        curr_sent_ngrams = [ngram for ngram in curr_sent_ngrams]
    # One boolean per frequent n-gram: does it occur anywhere in this sentence?
    for ngram in ngram_tuples:
        if ngram in curr_sent_ngrams:
            is_in_sent.append(True)
        else:
            is_in_sent.append(False)
    
    # For every word position i, repeat the sentence-level flags as one row of
    # `feats` and build a per-tuple "predecessor" flag list.
    for i in range(0, len(words)):
        tmp = []
        flag = True
        # NOTE(review): DataFrame.append was deprecated in pandas 1.4 and
        # removed in 2.0; on current pandas this line raises AttributeError.
        feats = feats.append([is_in_sent])
        if n == 2:
            for tup in ngram_tuples:
                # NOTE(review): `range(0, i)` is empty for i == 0, so the
                # `i == 0` branch below is unreachable. `flag` is initialised
                # once per word (not per tuple) and each j with j+2 < i
                # overwrites it, so only the last such j decides the appended
                # value; for i == 0 and i == 2 nothing assigns it and the
                # initial True is appended. Confirm the intended semantics.
                for j in range(0, i):
                    if i == 0:
                        flag = False
                    elif i == 1:
                        flag = False
                    elif i >= 2:
                        if j+2 < i:
                            if (words[j], words[j+1]) == tup:
                                flag = True
                            else:
                                flag = False
                tmp.append(flag)
        
        elif n == 3:
            for tup in ngram_tuples:
                # NOTE(review): same structure as the bigram branch; here no
                # assignment happens for i == 0 and i == 3, so the initial
                # True is appended for those positions.
                for j in range(0, i):
                    if i == 0:
                        flag = False
                    elif i == 1:
                        flag = False
                    elif i == 2:
                        flag = False
                    elif i >= 3:
                        if j+3 < i:
                            if(words[j], words[j+1], words[j+2]) == tup:
                                flag = True
                            else:
                                flag = False
                tmp.append(flag)
        is_predecessor.append(tmp)
        
    # Debug output only; nothing is returned.
    print(len(words))
    
def tf_idf_ngrams_all(sentences, df_subset, ngrams):
    """Run ``tf_idf_ngrams`` on the first sentence only (work in progress).

    The tokens are the ``Token`` values of the ``df_subset`` rows whose
    ``Sentence`` column equals ``"Sentence 0"``. The loop stops after its
    first pass, the result of the call is not collected, and the function
    returns None.
    """
    feats = pd.DataFrame()
    for sentence_idx in range(len(sentences)):
        sentence_key = "Sentence " + str(sentence_idx)
        in_sentence = df_subset["Sentence"] == sentence_key
        ngram_feats = tf_idf_ngrams(ngrams, df_subset[in_sentence]["Token"])
        # Debugging short-circuit: only the first sentence is processed.
        # TODO: collect `ngram_feats` into `feats` once tf_idf_ngrams returns a value.
        break
    feats.columns = []
    # TODO: return feats

In [170]:
# Smoke run of the unfinished n-gram features; it only prints the number of
# tokens of the first sentence.
tf_idf_ngrams_all(sentences, tmp[["Sentence", "Token"]], bigrams)

44


In [92]:
# Display the (bigram, count) pairs returned by extract_bigrams_trigrams().
bigrams

[(('looking', 'for'), 581),
 (('of', 'the'), 365),
 (('i', 've'), 362),
 (('movies', 'that'), 349),
 (('movies', 'like'), 286),
 (('im', 'looking'), 262),
 (('in', 'the'), 249),
 (('it', 's'), 193),
 (('to', 'watch'), 190),
 (('a', 'movie'), 186),
 (('for', 'a'), 182),
 (('movies', 'with'), 178),
 (('and', 'i'), 176),
 (('and', 'the'), 165),
 (('but', 'i'), 162),
 (('similar', 'to'), 159),
 (('would', 'be'), 157),
 (('like', 'the'), 149),
 (('movies', 'i'), 145),
 (('that', 'are'), 144),
 (('i', 'have'), 142),
 (('that', 'i'), 141),
 (('to', 'be'), 140),
 (('i', 'am'), 139),
 (('for', 'movies'), 139),
 (('i', 'love'), 126),
 (('ve', 'seen'), 123),
 (('any', 'suggestions'), 122),
 (('i', 'really'), 119),
 (('movies', 'where'), 118)]

In [None]:
def lexical_feats(sentence, words):
    """Placeholder for sentence-level lexical features.

    Not implemented yet: the empty lists below only name the planned feature
    groups. Both arguments are currently unused.

    Returns:
        A new empty list.
    """
    negations = []
    # Was `specific patterns`, which is not a valid identifier and made the
    # whole cell a SyntaxError.
    specific_patterns = []
    most_freq_bigrams = []
    most_freq_trigrams = []
    opening_movie_indics = []
    closing_movie_indics = []
    stopwords = []

    return list()

def lexical_feats_all():
    """Stub for corpus-wide lexical features; returns a new empty list."""
    return []

In [124]:
# Noun-phrase chunking of one tokenised sentence with NLTK.
# NOTE(review): `words` is not defined in any visible cell (this cell relied
# on leftover kernel state); define it explicitly, e.g. as one token list
# from `sentences`, before running on a fresh kernel.
pos_tags = pos_tag(words)
# NP chunk = optional determiner, any number of adjectives, one singular noun.
pattern = 'NP: {<DT>?<JJ>*<NN>}'
chunker = RegexpParser(pattern)
# Parse the tags computed above; the original passed the undefined name `tags`.
chunks = chunker.parse(pos_tags)
chunk_list = []
# Flatten the chunk tree into (word, POS tag, IOB chunk tag) triples.
tagged_chunks = tree2conlltags(chunks)
for c in tagged_chunks:
    print(c)

('Im', 'NNP', 'O')
('looking', 'VBG', 'O')
('for', 'IN', 'O')
('movies', 'NNS', 'O')
('feature', 'VBP', 'O')
('disfunctional', 'JJ', 'O')
('unlikable', 'JJ', 'O')
('protagonists', 'NNS', 'O')
('who', 'WP', 'O')
('are', 'VBP', 'O')
('toxic', 'JJ', 'O')
('Movies', 'NNS', 'O')
('like', 'IN', 'O')
('Nightcrawler', 'NNP', 'O')
('Little', 'NNP', 'O')
('Miss', 'NNP', 'O')
('Sunshine', 'NNP', 'O')
('Taxi', 'NNP', 'O')
('Driver', 'NNP', 'O')
('There', 'EX', 'O')
('Will', 'NNP', 'O')
('Be', 'NNP', 'O')
('Blood', 'NNP', 'O')
('Hesher', 'NNP', 'O')
('Wolf', 'NNP', 'O')
('of', 'IN', 'O')
('Wall', 'NNP', 'O')
('Street', 'NNP', 'O')
('or', 'CC', 'O')
('Closer', 'NNP', 'O')
('where', 'WRB', 'O')
('the', 'DT', 'B-NP')
('movie', 'NN', 'I-NP')
('s', 'JJ', 'B-NP')
('protagonist', 'NN', 'I-NP')
('s', 'NN', 'B-NP')
('is', 'VBZ', 'O')
('completely', 'RB', 'O')
('destructive', 'JJ', 'O')
('to', 'TO', 'O')
('them', 'PRP', 'O')
('self', 'PRP', 'O')
('and', 'CC', 'O')
('those', 'DT', 'O')
('around', 'IN', 'O')
(