In [1]:
# Imports
import os
import sys
import pandas as pd
import nltk
import string
import numpy as np
import re
from collections import Counter
import string
from nltk.corpus import stopwords

from nltk import pos_tag
from nltk import RegexpParser
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.sentiment import SentimentIntensityAnalyzer

import spacy
from spacy import displacy
from sklearn.feature_extraction.text import TfidfVectorizer
import en_core_web_sm
from nltk import FreqDist
nlp = en_core_web_sm.load()

In [2]:
def read_data(data_dir='D:/TU_Graz/Thesis/Datasets/Reddit_preprocessed'):
    """Load the raw and the tokenized Reddit training submissions.

    Parameters
    ----------
    data_dir : str, optional
        Folder that contains the two CSV files. Defaults to the path that was
        previously hard-coded in this function, so existing calls are unchanged.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(submissions, submissions_tokenized)``. The tokenized frame gets an
        extra integer column ``Sentence_index``, parsed from the second
        space-separated field of its ``Sentence`` column.

    Notes
    -----
    The process working directory is changed to ``data_dir``. This side effect
    is kept on purpose: the export cell at the end of the notebook writes to a
    path relative to the working directory.
    """
    os.chdir(data_dir)

    submissions = pd.read_csv("train_submissions_simplified_new_line.csv")
    # Tokenized variant that keeps punctuation (submissions_tokenized_v2.csv is
    # the variant with punctuation removed).
    submissions_tokenized = pd.read_csv("train_submissions_tokenized_final_new_line.csv")

    # Integer sentence number per token row; needed to group tokens by sentence.
    # A plain list is assigned positionally, so no index alignment is involved.
    submissions_tokenized["Sentence_index"] = [
        int(label.split(" ")[1]) for label in submissions_tokenized["Sentence"]
    ]

    return submissions, submissions_tokenized

In [3]:
class SentenceGetter(object):
    """Group a token-level frame into per-sentence lists of token tuples.

    ``data`` must provide the columns ``Sentence_index``, ``Words``,
    ``POS_tag``, ``Chunk_tag`` and ``sent_id``. Rows are grouped by
    ``Sentence_index``; every group becomes a list of
    ``(word, pos_tag, chunk_tag, sent_id)`` tuples.

    Attributes set by ``__init__``:
        data      -- the frame passed in
        grouped   -- Series of tuple lists, indexed by ``Sentence_index``
        sentences -- the same tuple lists as a plain Python list
        n_sent    -- 1-based position of the sentence ``get_next`` returns next
        empty     -- always ``False`` here; not read anywhere in this class
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def collect_tokens(group):
            # One tuple per token row of the sentence group.
            return list(zip(group["Words"].values.tolist(),
                            group["POS_tag"].values.tolist(),
                            group["Chunk_tag"].values.tolist(),
                            group["sent_id"].values.tolist()))

        self.grouped = self.data.groupby("Sentence_index").apply(collect_tokens)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next sentence's token tuples, or ``None`` when exhausted.

        Previously this returned the complete ``grouped`` Series on every call
        and hid any error behind a bare ``except``. It now walks through
        ``sentences`` in order, using ``n_sent`` as a 1-based cursor.
        """
        if self.n_sent > len(self.sentences):
            return None
        s = self.sentences[self.n_sent - 1]
        self.n_sent += 1
        return s

In [4]:
def group_sents(submissions_tokenized):
    """Split the tokenized frame into parallel per-sentence lists.

    Returns a tuple ``(sentences, labels, sent_ids)``. Each element is a list
    with one inner list per sentence, holding field 0, field 1 and field 3 of
    the token tuples produced by ``SentenceGetter`` (word, POS tag and sent_id
    in the tuples that class builds).
    """
    grouped = SentenceGetter(submissions_tokenized).sentences

    def pick(field):
        # Same nested structure as `grouped`, keeping one tuple field per token.
        return [[token[field] for token in sentence] for sentence in grouped]

    return pick(0), pick(1), pick(3)

All token-level features we can extract with spaCy
https://spacy.io/api/doc

In [5]:
# (output column, spaCy token attribute) pairs, in output order. The attributes
# are read without a trailing underscore (e.g. `pos`, not `pos_`).
_SPACY_TOKEN_FEATURES = [
    ('Token_index', 'i'), ('Token', 'text'), ('POS_tag', 'tag'), ('POS_universal', 'pos'),
    ('NER_tag', 'ent_type'), ('NER_iob', 'ent_iob'), ('lemma', 'lemma'), ('norm', 'norm'),
    ('shape', 'shape'), ('lex_id', 'lex_id'), ('is_digit', 'is_digit'), ('is_ascii', 'is_ascii'),
    ('is_alpha', 'is_alpha'), ('is_punct', 'is_punct'), ('is_left_punct', 'is_left_punct'),
    ('is_right_punct', 'is_right_punct'), ('rank', 'rank'), ('is_bracket', 'is_bracket'),
    ('is_space', 'is_space'), ('is_quote', 'is_quote'), ('is_currency', 'is_currency'),
    ('stopword', 'is_stop'), ('dependency', 'dep'), ('language', 'lang'), ('log_prob', 'prob'),
    ('sent', 'sentiment'), ('is_lower', 'is_lower'), ('is_upper', 'is_upper'),
    ('like_num', 'like_num'), ('out_of_vocab', 'is_oov'), ('num_lefts', 'n_lefts'),
    ('num_rights', 'n_rights'), ('sent_start', 'is_sent_start'), ('has_vector', 'has_vector'),
    ('knowledge_base', 'ent_kb_id'), ('id_entity', 'ent_id'), ('lower', 'lower'),
    ('prefix', 'prefix'), ('suffix', 'suffix'), ('chr_offset', 'idx'), ('brown_cluster', 'cluster'),
]

# Full column layout of the spaCy feature frames (44 columns).
_SPACY_FEATURE_COLUMNS = (['Sent_id', 'Sentence']
                          + [column for column, _ in _SPACY_TOKEN_FEATURES]
                          + ['num_chars'])


def spacy_feats_sentence(index, sentence, sent_id):
    """Extract per-token spaCy attributes for one sentence.

    Parameters
    ----------
    index : int
        Sentence number; stored as the label ``"Sentence <index>"``.
    sentence : list of str
        Tokens of the sentence. They are joined with single spaces and run
        through the module-level ``nlp`` pipeline, so the rows follow spaCy's
        own tokenization of that string.
    sent_id : sequence
        Only ``sent_id[0]`` is used; it is copied into every row.

    Returns
    -------
    pandas.DataFrame
        One row per spaCy token with the 44 columns of
        ``_SPACY_FEATURE_COLUMNS``. Every row has index label 0, exactly as the
        former row-by-row ``append`` produced. An empty document now yields an
        empty frame with the same columns instead of raising.
    """
    sentence_label = "Sentence " + str(index)
    doc = nlp(" ".join(sentence))

    # Collect plain rows and build the frame once: DataFrame.append was removed
    # in pandas 2.0 and copied the whole frame on every call.
    rows = [
        [sent_id[0], sentence_label]
        + [getattr(token, attribute) for _, attribute in _SPACY_TOKEN_FEATURES]
        + [len(token)]
        for token in doc
    ]
    return pd.DataFrame(rows, columns=_SPACY_FEATURE_COLUMNS, index=[0] * len(rows))


def spacy_feats_all(sentences, sent_ids):
    """Extract the spaCy token attributes for every sentence.

    ``sentences[i]`` and ``sent_ids[i]`` are passed to ``spacy_feats_sentence``
    together with ``i`` as the sentence number. The per-sentence frames are
    stacked in order; row index labels are left as produced (all 0). With no
    sentences an empty frame with the 44 feature columns is returned.
    """
    frames = [spacy_feats_sentence(i, sentence, sent_ids[i])
              for i, sentence in enumerate(sentences)]
    if not frames:
        return pd.DataFrame(columns=_SPACY_FEATURE_COLUMNS)

    data = pd.concat(frames)
    data.columns = _SPACY_FEATURE_COLUMNS
    return data

In [6]:
def spacy_feats_tensors(index, sentence):
    """Return the spaCy tensor values of every token in one sentence.

    Parameters
    ----------
    index : int
        Sentence number. Kept for interface compatibility; it does not affect
        the result.
    sentence : list of str
        Tokens of the sentence; joined with single spaces and run through the
        module-level ``nlp`` pipeline.

    Returns
    -------
    pandas.DataFrame
        One row per spaCy token holding the elements of ``token.tensor``, with
        default integer column labels. Every row has index label 0, exactly as
        the former row-by-row ``append`` produced. An empty document yields an
        empty frame.
    """
    doc = nlp(" ".join(sentence))
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame for every token.
    rows = [list(token.tensor) for token in doc]
    return pd.DataFrame(rows, index=[0] * len(rows))

def spacy_feats_tensors_all(sentences):
    """Stack the per-token spaCy tensor rows of all sentences.

    Parameters
    ----------
    sentences : list of list of str
        Token lists; ``sentences[i]`` is passed to ``spacy_feats_tensors``.

    Returns
    -------
    pandas.DataFrame
        One row per token, columns named ``"Vector_ 0"``, ``"Vector_ 1"``, ...
        The number of columns now follows the data instead of being fixed at
        96. Row index labels are left as produced by ``spacy_feats_tensors``.
        With no sentences (or no tokens at all) an empty frame is returned
        instead of raising.
    """
    frames = [spacy_feats_tensors(i, sentence) for i, sentence in enumerate(sentences)]
    if not frames:
        return pd.DataFrame()

    data = pd.concat(frames)
    if data.shape[1] == 0:
        return data

    names = ["Vector_ " + str(i) for i in range(data.shape[1])]
    # NOTE(review): assigning a list wrapped in another list makes pandas build
    # a one-level MultiIndex, so each label is the tuple ('Vector_ i',). This is
    # kept unchanged because the exported CSV header depends on it; confirm
    # whether a flat `data.columns = names` was intended.
    data.columns = [names]
    return data

Various sentiment features from NLTK's pretrained VADER sentiment analyzer

In [7]:
def sentiment_feats(sentence, words, analyzer=None):
    """Compute VADER sentiment features for every word of one sentence.

    Parameters
    ----------
    sentence : list of str
        Tokens of the sentence; joined with single spaces to score the sentence
        as a whole.
    words : iterable of str
        The words to score individually (the caller passes a pandas Series).
    analyzer : object with ``polarity_scores(text)``, optional
        Analyzer to reuse. When omitted a new ``SentimentIntensityAnalyzer`` is
        created, as before.

    Returns
    -------
    pandas.DataFrame
        One row per word with the columns ``Sentence_sent`` (sentence compound
        score, replaced by 0.0001 when it is 0 so the ratio is defined), the
        word's ``neg``/``neu``/``pos``/``compound`` scores, the word-to-sentence
        compound ratio, and the compound scores of the previous and next word
        (0 at the sentence edges). Every row has index label 0, exactly as the
        former row-by-row ``append`` produced. No words yields an empty frame
        with these columns instead of raising.
    """
    columns = ["Sentence_sent", "Neg_sent_score", "Neu_sent_score", "Pos_sent_score", "Sent_score",
               "word_to_sentence_sent_ratio", "prev_word_sent", "next_word_sent"]
    tokens = list(words)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()

    # Scores are read by key rather than by unpacking the dict in order.
    sentence_compound = analyzer.polarity_scores(" ".join(sentence))["compound"]
    if sentence_compound == 0:
        sentence_compound = 0.0001

    # Score each word exactly once; the neighbour features reuse these results.
    scores = [analyzer.polarity_scores(token) for token in tokens]
    compounds = [score["compound"] for score in scores]

    rows = []
    for i, score in enumerate(scores):
        previous_compound = compounds[i - 1] if i > 0 else 0
        next_compound = compounds[i + 1] if i < len(compounds) - 1 else 0
        rows.append([sentence_compound, score["neg"], score["neu"], score["pos"], score["compound"],
                     score["compound"] / sentence_compound, previous_compound, next_compound])

    return pd.DataFrame(rows, columns=columns, index=[0] * len(rows))


def sentiment_feats_all(sentences, df_subset):
    """Compute the word-level sentiment features for all sentences.

    Parameters
    ----------
    sentences : list of list of str
        Token lists; sentence ``i`` is matched to the rows of ``df_subset``
        whose ``Sentence`` value is ``"Sentence <i>"``.
    df_subset : pandas.DataFrame
        Must contain the columns ``Sentence`` and ``Token``.

    Returns
    -------
    pandas.DataFrame
        The stacked per-sentence results of ``sentiment_feats``. The first
        column is named ``"Sentence sent"`` here (with a space), as before.
        With no sentences an empty frame with these columns is returned.
    """
    columns = ["Sentence sent", "Neg_sent_score", "Neu_sent_score", "Pos_sent_score", "Sent_score",
               "word_to_sentence_sent_ratio", "prev_word_sent", "next_word_sent"]

    # Split the token column by sentence label once instead of filtering the
    # whole frame again for every sentence.
    tokens_by_label = {label: tokens
                       for label, tokens in df_subset.groupby("Sentence", sort=False)["Token"]}
    no_tokens = pd.Series([], dtype=object)

    # One analyzer for the whole run instead of one per sentence.
    analyzer = SentimentIntensityAnalyzer()
    frames = [
        sentiment_feats(sentence, tokens_by_label.get("Sentence " + str(i), no_tokens),
                        analyzer=analyzer)
        for i, sentence in enumerate(sentences)
    ]
    if not frames:
        return pd.DataFrame(columns=columns)

    feats = pd.concat(frames)
    feats.columns = columns
    return feats

Term frequency features

In [8]:
def tf_feats(sentences, df_subset):
    """Compute absolute and normalized term frequency for every token row.

    Parameters
    ----------
    sentences : any
        Not used; kept so existing calls keep working.
    df_subset : pandas.DataFrame
        Must contain a ``Token`` column. Frequencies are counted over all rows
        of that column.

    Returns
    -------
    pandas.DataFrame
        One row per token row with ``norm_freq`` (count divided by the total
        number of token rows) and ``abs_freq`` (count). Every row has index
        label 0, exactly as the former row-by-row ``append`` produced. No rows
        yields an empty frame with these columns instead of raising.
    """
    tokens = list(df_subset["Token"])
    counts = Counter(tokens)
    total_word_count = len(tokens)

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame for every token.
    rows = [[counts[token] / total_word_count, counts[token]] for token in tokens]
    return pd.DataFrame(rows, columns=["norm_freq", "abs_freq"], index=[0] * len(rows))

Bigram and trigram frequency features

In [9]:
def _vectorizer_feature_names(vectorizer):
    """Return the fitted vocabulary of ``vectorizer`` as a list of ``str``."""
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on old versions that lack it.
    getter = getattr(vectorizer, "get_feature_names_out", None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return [str(name) for name in getter()]


def extract_bigrams_trigrams(sentences):
    """Select up to 25 bigrams and 25 trigrams with a TF-IDF vectorizer.

    Parameters
    ----------
    sentences : list of list of str
        Token lists; each one is joined with single spaces into one document.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(bigrams, trigrams)``: the feature names of two ``TfidfVectorizer``
        instances fitted with ``ngram_range`` (2, 2) and (3, 3). Both use the
        NLTK English stop words plus ``'new_line'``, ``max_features=25`` and
        ``max_df=0.9``.
    """
    stops = list(stopwords.words('english')) + ['new_line']
    all_sentences = [" ".join(sentence) for sentence in sentences]

    def fit_vocabulary(n):
        vectorizer = TfidfVectorizer(analyzer = "word", ngram_range=(n, n), tokenizer = None,
                                     preprocessor = None, stop_words = stops, max_features = 25,
                                     max_df = 0.9)
        # Only the vocabulary is needed, so the TF-IDF matrix is not kept.
        vectorizer.fit(all_sentences)
        return _vectorizer_feature_names(vectorizer)

    bigrams = fit_vocabulary(2)
    trigrams = fit_vocabulary(3)
    return bigrams, trigrams

In [10]:
def tf_ngrams(ngrams, words):
    """Count, for every word position, earlier occurrences of each n-gram.

    Parameters
    ----------
    ngrams : sequence of str
        Space-separated n-grams. The n-gram length ``n`` is taken from the
        first entry and used for all of them; an entry of a different length
        never matches.
    words : iterable of str
        The words of one sentence, in order (the caller passes a pandas Series).

    Returns
    -------
    pandas.DataFrame
        One row per word and one column per n-gram, named ``"1"``, ``"2"``, ...
        in the order of ``ngrams``. The value at row ``i`` is the number of
        occurrences of the n-gram starting at a position ``j <= i - n - 1``.
        An empty ``ngrams`` returns an empty frame instead of raising.

    Notes
    -----
    The previous implementation compared a tuple of words with the list from
    ``str.split``, which is never equal, so every count was 0. Both sides are
    tuples now. Lengths other than 2 and 3, which used to produce empty
    columns, are counted the same way.
    """
    tokens = list(words)
    if len(ngrams) == 0:
        return pd.DataFrame()
    n = len(ngrams[0].split(" "))

    columns = {}
    for position, ngram in enumerate(ngrams, start=1):
        target = tuple(ngram.split(" "))
        seen = 0
        counts = []
        for i in range(len(tokens)):
            # Moving from word i-1 to word i makes exactly one more start
            # position eligible (i - n - 1), so a running total replaces the
            # former rescan of all earlier positions.
            # NOTE(review): this bound is the original one (j + n < i); it
            # leaves out the n-gram that ends directly before word i. Confirm
            # that the gap is intended.
            start = i - n - 1
            if start >= 0 and tuple(tokens[start:start + n]) == target:
                seen += 1
            counts.append(seen)
        columns[str(position)] = pd.Series(counts, dtype="int64")

    return pd.DataFrame(columns)
    
def tf_ngrams_all(sentences, df_subset, ngrams):
    """Compute the n-gram predecessor counts for all sentences.

    Parameters
    ----------
    sentences : list
        Only its length is used: sentence ``i`` is matched to the rows of
        ``df_subset`` whose ``Sentence`` value is ``"Sentence <i>"``.
    df_subset : pandas.DataFrame
        Must contain the columns ``Sentence`` and ``Token``.
    ngrams : sequence of str
        N-grams passed on to ``tf_ngrams``.

    Returns
    -------
    pandas.DataFrame
        The stacked per-sentence results of ``tf_ngrams``, with one column per
        n-gram labelled ``"<ngram>_is_predecessor"``. With no sentences an
        empty frame is returned instead of raising.
    """
    # Split the token column by sentence label once instead of filtering the
    # whole frame again for every sentence.
    tokens_by_label = {label: tokens
                       for label, tokens in df_subset.groupby("Sentence", sort=False)["Token"]}
    no_tokens = pd.Series([], dtype=object)

    frames = [tf_ngrams(ngrams, tokens_by_label.get("Sentence " + str(i), no_tokens))
              for i in range(len(sentences))]
    if not frames:
        return pd.DataFrame()

    # One concat instead of DataFrame.append per sentence (removed in pandas 2.0).
    feats = pd.concat(frames)
    names = [str(ngram) + "_is_predecessor" for ngram in ngrams]
    # NOTE(review): assigning a list wrapped in another list makes pandas build
    # a one-level MultiIndex, so each label is a 1-tuple. Kept unchanged for
    # compatibility; confirm whether a flat `feats.columns = names` was intended.
    feats.columns = [names]
    return feats

Extract all features

In [11]:
# Read data: `submissions` holds the Reddit text in its preserved (original) form, `submissions_tokenized`
# holds the same data as one token per row. We need both to preserve the sentence order when extracting
# the features.
submissions, submissions_tokenized = read_data()

# `submissions_tokenized` covers the entire dataset in one flat table; the feature extractors also need the
# tokens of each sentence separately, so group them into per-sentence lists here.
sentences, labels, sent_ids = group_sents(submissions_tokenized)

In [15]:
# Sanity check: display the raw text of the submission with index label 1
submissions.text[1]

"I'm looking for a thought provoking, dark, suspenseful movie. I'm looking for a movie that really makes you think and takes you away from reality to where you're just thinking about how the movie is going to play out. I like the kind of movies that don't make too much sense at first, which adds to the thinking aspect of them. I loved Shutter Island because of the ending, so if you have any movies that have sort of an eerie feeling with suspense and a good plot to them would be awesome. Thanks in advance."

In [20]:
# Sanity check: print a few spaCy attributes for a sample sentence. This reuses
# the `nlp` pipeline loaded in the imports cell instead of importing spaCy and
# loading the same model a second time (which also rebound the global `nlp`).
doc = nlp("Looking for a thought provoking, dark, suspenseful movie.")

# Columns: text, fine-grained tag, universal POS (name), universal POS (integer
# code), word shape, is_alpha, is_stop
for token in doc:
    print(token.text, token.tag_, token.pos_, token.pos, token.shape_, 
          token.is_alpha, token.is_stop)

Looking VBG VERB 100 Xxxxx True False
for IN ADP 85 xxx True True
a DT DET 90 x True True
thought NN NOUN 92 xxxx True False
provoking JJ ADJ 84 xxxx True False
, , PUNCT 97 , False False
dark JJ ADJ 84 xxxx True False
, , PUNCT 97 , False False
suspenseful JJ ADJ 84 xxxx True False
movie NN NOUN 92 xxxx True False
. . PUNCT 97 . False False


In [126]:


# Extract different sorts of features
# Group 1: per-token spaCy attributes. Its "Sentence" and "Token" columns are
# reused as input for groups 3 and 4 below.
feats1 = spacy_feats_all(sentences, sent_ids)
print("Extracted 1st group of features")

# Group 2: per-token spaCy tensor values
feats2 = spacy_feats_tensors_all(sentences)
print("Extracted 2nd group of features")

# Group 3: VADER sentiment scores per word and per sentence
feats3 = sentiment_feats_all(sentences, feats1[["Sentence", "Token"]])
print("Extracted 3rd group of features")

# Group 4: absolute and normalized term frequencies
feats4 = tf_feats(sentences, feats1[["Sentence", "Token"]])
print("Extracted 4th group of features")

# Groups 5 and 6 (bigram / trigram predecessor counts) are currently disabled
# and therefore not part of the exported feature table.
#bigrams, trigrams = extract_bigrams_trigrams(sentences)
#feats5 = tf_ngrams_all(sentences, feats1[["Sentence", "Token"]], bigrams)
#print("Extracted 5th group of features")

#feats6 = tf_ngrams_all(sentences, feats1[["Sentence", "Token"]], trigrams)
#print("Extracted 6th group of features")

Extracted 1st group of features
Extracted 2nd group of features
Extracted 3rd group of features
Extracted 4th group of features


In [127]:
# Merge the four feature groups column-wise, one row per token.
# pd.concat(axis=1) aligns on index labels, and the groups carry duplicate
# labels, so the merge used to work only because all indexes were identical.
# Align explicitly by row position and fail loudly if the row counts differ.
feature_groups = [feats1, feats2, feats3, feats4]
row_counts = [len(group) for group in feature_groups]
assert len(set(row_counts)) == 1, "feature groups differ in row count: {}".format(row_counts)
feats = pd.concat([group.reset_index(drop=True) for group in feature_groups], axis = 1)

In [128]:
# Export the combined feature table without the row index. The path is relative:
# read_data() changed the working directory to the dataset folder, so the file
# is written to the sibling "Reddit_features" folder.
feats.to_csv("../Reddit_features/train_submissions_features_v2_num_FINALL_new_sent.csv", index = False)