In [6]:
# Imports
import os
import sys
import pandas as pd
import nltk
import string
import re

from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import regexp_tokenize
from nltk import pos_tag
from nltk import RegexpParser
from nltk.chunk import conlltags2tree, tree2conlltags

In [7]:
def read_data(data_dir='D:/TU_Graz/Thesis/Datasets/Reddit_original'):
    """Load the raw submissions, the movie-title lookup and the test-set ids.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing ``submissions.csv``, ``movie_titles.csv`` and
        ``submission_ids_test_set.txt``. Defaults to the location that was
        previously hard-coded, so existing ``read_data()`` calls still work.

    Returns
    -------
    tuple
        ``(submissions, movie_titles, test_set_ids_list)``: the submissions
        frame with missing values replaced by ``""``, the movie-titles frame,
        and a plain list of submission ids read from the id file.

    Notes
    -----
    The working directory is changed to ``data_dir``; the rest of the notebook
    writes its outputs to paths relative to it.
    """
    # Kept as a side effect on purpose: later cells use "../Reddit_preprocessed".
    os.chdir(data_dir)

    # Read datasets
    submissions = pd.read_csv("submissions.csv", sep=';').fillna("")
    movie_titles = pd.read_csv("movie_titles.csv", sep=';')

    # The id file has no header row (one id per line). Reading it with the
    # default header would turn the first id into the column name and silently
    # drop it from the test set, so read it header-less and keep ids as text.
    test_set_ids = pd.read_csv("submission_ids_test_set.txt", header=None, dtype=str)
    test_set_ids_list = test_set_ids[0].str.strip().tolist()

    return submissions, movie_titles, test_set_ids_list

The idea is to build a dataset that contains only the submission text (submission title and body concatenated) and the movies, actors, genres and keywords identified in the submission. (Whether each of them is positive or negative becomes relevant later.)

Make new lists of positive and negative movies

In [8]:
def make_lists(dataframe, column, titles=None):
    """Translate '|'-separated movie ids into '|'-separated movie titles.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose ``column`` holds strings of movie ids joined by ``'|'``.
    column : str
        Name of the id column to translate.
    titles : pandas.DataFrame, optional
        Lookup table with ``movie_id`` and ``movie_title`` columns. When
        omitted, the notebook-level ``movie_titles`` frame is used.

    Returns
    -------
    list of str
        One entry per row of ``dataframe``: the titles of the referenced
        movies joined by ``'|'`` (in lookup-table order), or ``""`` when no
        id matches.
    """
    if titles is None:
        titles = movie_titles  # notebook-level lookup loaded by read_data()

    # Materialise the lookup once instead of calling iterrows() per submission.
    catalogue = list(zip(titles['movie_id'], titles['movie_title']))

    movies = []
    # Use the frame that was passed in, not the global `submissions`.
    for line in dataframe[column]:
        wanted = set(line.split('|'))
        movies.append("|".join(title for movie_id, title in catalogue
                               if movie_id in wanted))
    return movies

Fill the new dataset with data  

In [9]:
def make_df(submissions, pos_movies, neg_movies):
    """Assemble the reduced frame: text plus combined entity columns.

    The ``text`` column is the submission title and body joined by a space.
    ``genres``, ``actors``, ``movies`` and ``keywords`` each hold the positive
    values, a ``'|'`` separator and the negative values.
    """
    def _pipe_join(positive, negative):
        # positive and negative entries are always separated by exactly one '|'
        return positive + '|' + negative

    combined = pd.DataFrame()
    combined['text'] = (
        submissions['reddit_submission_title']
        + " "
        + submissions['reddit_submission_text']
    )
    for entity in ('genres', 'actors'):
        combined[entity] = _pipe_join(submissions['positive_' + entity],
                                      submissions['negative_' + entity])
    combined['movies'] = _pipe_join(pd.Series(pos_movies), pd.Series(neg_movies))
    combined['keywords'] = _pipe_join(submissions['positive_keywords'],
                                      submissions['negative_keywords'])

    return combined

Because of the concatenation, some entries start or end with '|', and entries for which nothing was detected consist of just '|'. Both cases need to be cleaned up.

In [10]:
def fix_vertical_line(df, column):
    """Remove the leading/trailing '|' left over from concatenating entities.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the '|'-joined entity strings.
    column : str
        Name of the column to clean.

    Returns
    -------
    list of str
        One cleaned string per row; an entry that consisted only of ``'|'``
        becomes ``""``. Separators inside an entry are left untouched.
    """
    # Read from the frame that was passed in (not the global `sub_modified`).
    # strip('|') only removes separators, so a genuine one-character value is
    # kept rather than being blanked by a length check.
    return [line.strip('|') for line in df[column]]

To reduce redundant and uninformative words in the text, we remove URLs and opening tags such as [Request], which merely indicate that the post is a request.

In [11]:
def preprocess_sentences(sub_modified):
    """Clean the ``text`` column of every submission.

    For each row the function

    * replaces ``<br/>`` and ``<br>`` with the marker ``" NEW_LINE "``,
    * removes URLs,
    * inserts a space after every hyphen,
    * removes opening tags such as ``[Request]`` or ``(Suggest/...)``,
    * drops a single leading space.

    Parameters
    ----------
    sub_modified : pandas.DataFrame
        Frame with a string column ``text``.

    Returns
    -------
    list of str
        The cleaned text, one entry per row (``[]`` for an empty frame).
    """
    # Raw strings: identical patterns, but without invalid-escape warnings.
    url_regex = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    request_tag = re.compile(r"[\[\{\(][Rr][Ee][Qq][\w]*[\]\}\)]")
    suggest_tag = re.compile(r"[\[\{\(][Ss][Uu][Gg][Gg][Ee][Ss][Tt][/\w]*[\]\}\)]")

    sentences = []
    for text in sub_modified['text']:
        text = text.replace("<br/>", " NEW_LINE ").replace("<br>", " NEW_LINE ")
        # Strip URLs *before* spacing out hyphens; otherwise a hyphenated URL
        # is broken in two and only its first half is removed.
        text = url_regex.sub("", text)
        text = text.replace("-", "- ")
        text = request_tag.sub("", text)
        text = suggest_tag.sub("", text)
        text = re.sub("^ ", "", text)
        sentences.append(text)

    # Return after the loop: returning inside it processed only the first row.
    return sentences

In [12]:
def split_based_on_id(data_simplfied, test_ids=None):
    """Split the dataset into train and test parts by submission id.

    Rows whose ``id`` is among the predefined test ids form the test set, so
    the same test set is used on every run; all other rows form the train set.

    Parameters
    ----------
    data_simplfied : pandas.DataFrame
        Frame with an ``id`` column. (The parameter spelling is kept as-is so
        existing keyword callers continue to work.)
    test_ids : list, optional
        Ids that belong to the test set. When omitted, the notebook-level
        ``test_set_ids_list`` is used.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``, both with the columns and index labels of the input.
    """
    if test_ids is None:
        test_ids = test_set_ids_list  # notebook-level list from read_data()

    # A boolean mask replaces the row-by-row DataFrame.append (removed in
    # pandas 2.0 and quadratic) and also copes with an empty train or test set.
    is_test = data_simplfied['id'].isin(test_ids)
    train = data_simplfied[~is_test].copy()
    test = data_simplfied[is_test].copy()

    return train, test

Tokenize the sentences, perform part-of-speech tagging with nltk pos tagger and create a list of words and tags.

In [14]:
def tokenizer(data):
    """Tokenize every text and attach POS and chunk tags, one row per token.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the columns ``id`` and ``text``.

    Returns
    -------
    pandas.DataFrame
        One row per token with the columns ``Sentence`` (``"Sentence <n>"``,
        where ``n`` counts the input rows from 0), ``sent_id`` (the row's
        ``id``), ``Words`` (the token), ``POS_tag`` (tag from ``pos_tag``) and
        ``Chunk_tag`` (third field of ``tree2conlltags`` for the NP chunker).
    """
    # Tokens are word-character runs, money expressions, or any other
    # non-whitespace run. Raw string: same pattern, no invalid-escape warning.
    token_pattern = r'\w+|\$[\d\.]+|\S+'
    # The chunker does not depend on the row, so build it once.
    chunker = RegexpParser('NP: {<DT>?<JJ>*<NN>}')

    sentence_indicators = []
    curr_id = []
    splitted_words_all = []
    pos_tags = []
    chunk_list = []

    for sentence_no, (_, row) in enumerate(data.iterrows()):
        tokens = regexp_tokenize(row['text'], pattern=token_pattern)
        tags = pos_tag(tokens)
        tagged_chunks = tree2conlltags(chunker.parse(tags))

        sentence_indicators.extend(["Sentence " + str(sentence_no)] * len(tokens))
        curr_id.extend([row['id']] * len(tokens))
        splitted_words_all.extend(tokens)
        pos_tags.extend(tag[1] for tag in tags)
        chunk_list.extend(chunk[2] for chunk in tagged_chunks)

    t = {'Sentence' : pd.Series(sentence_indicators),
         'sent_id' : pd.Series(curr_id),
         'Words' : pd.Series(splitted_words_all),
         'POS_tag' : pd.Series(pos_tags),
         'Chunk_tag' : pd.Series(chunk_list)
        }

    final_data = pd.DataFrame(t)
    return final_data

In [3]:
# Read the data (this also changes the working directory to the raw-data folder)
submissions, movie_titles, test_set_ids_list = read_data()

# Make positive and negative lists of movies
pos_movies = make_lists(submissions, 'positive_movie_ids')
neg_movies = make_lists(submissions, 'negative_movie_ids')

# Make a new updated dataset
sub_modified = make_df(submissions, pos_movies, neg_movies)

# Fix the vertical line for all entities
genres = fix_vertical_line(sub_modified, 'genres')
actors = fix_vertical_line(sub_modified, 'actors')
movies = fix_vertical_line(sub_modified, 'movies')
keywords = fix_vertical_line(sub_modified, 'keywords')

# Preprocess sentences
sentences = preprocess_sentences(sub_modified)

# Create new dataset
d = {'id' : pd.Series(submissions["reddit_submission_id"]),
     'text' : pd.Series(sentences),
     'movies' : pd.Series(movies),
     'pos_movies' : pd.Series(pos_movies),
     'neg_movies' : pd.Series(neg_movies),
     'genres' : pd.Series(genres),
     'pos_genres' : submissions['positive_genres'],
     'neg_genres' : submissions['negative_genres'],
     'actor' : pd.Series(actors),
     'pos_actor' : submissions['positive_actors'],
     'neg_actor' : submissions['negative_actors'],
     'keywords' : pd.Series(keywords),
     'pos_keywords' : submissions['positive_keywords'],
     'neg_keywords' : submissions['negative_keywords']}
data_simplified = pd.DataFrame(d)

# Make sure the output folder exists before anything is written to it
os.makedirs("../Reddit_preprocessed", exist_ok=True)
data_simplified.to_csv("../Reddit_preprocessed/submissions_simplified.csv",index = False)

# Split to train and test sets
train, test = split_based_on_id(data_simplified)

# Save the train and test datasets
train.to_csv("../Reddit_preprocessed/train_submissions_simplified_new_line.csv", index = False)
test.to_csv("../Reddit_preprocessed/test_submissions_simplified_new_line.csv", index = False)

# Tokenize and pos tag etc. all of the data, and separately the train and test data
# (the full dataset is `data_simplified`; the name `data` is not defined in this notebook)
data_tokenized = tokenizer(data_simplified)
train_tokenized = tokenizer(train)
test_tokenized = tokenizer(test)

# ...and save them all
data_tokenized.to_csv("../Reddit_preprocessed/submissions_tokenized_final_new_line.csv", index = False)
train_tokenized.to_csv("../Reddit_preprocessed/train_submissions_tokenized_final_new_line.csv", index = False)
test_tokenized.to_csv("../Reddit_preprocessed/test_submissions_tokenized_final_new_line.csv", index = False)