In [1]:
# Imports
import os
import sys
import pandas as pd
import nltk
import string
import re

In [2]:
# Work from the folder holding the original Reddit export; the relative
# file names used for reading (and later writing) resolve against it.
os.chdir('D:/TU_Graz/Thesis/Datasets/Reddit_original')

# Load the three semicolon-separated tables in a single pass
submissions, movie_titles, comments = (
    pd.read_csv(csv_name, sep=';')
    for csv_name in ("submissions.csv", "movie_titles.csv", "comments.csv")
)

# Swap missing values for "" so the string operations applied to these
# columns further down (split, concatenation) do not hit NaN.
submissions, comments = submissions.fillna(""), comments.fillna("")

In [3]:
# Preview the first rows of the submissions table
submissions.head()

Unnamed: 0,reddit_submission_id,reddit_submission_title,reddit_submission_text,positive_movie_ids,negative_movie_ids,positive_keywords,negative_keywords,positive_genres,negative_genres,positive_actors,negative_actors
0,10gvsh,Modern noir movies?,Hi. I'm looking for 'noir' movies made in 2000...,tt0393109,,modern|noir,,,,,
1,10tqw6,"I'm looking for a thought provoking, dark, sus...",I'm looking for a movie that really makes you ...,tt1130884,,thought|provoking|dark|suspenseful|movie|and|to,,,,,
2,10u1ce,space-mystery-thriller-type movies?,"I just watched all the Alien movies, Predators...",tt0078748|tt0090605|tt0103644|tt0118583|tt0119...,,space,,Mystery|Thriller,,,
3,1106mj,can anyone suggest me smart intelligent movie,"i'm looking for a movie like ""inception"" or ""s...",tt1375666|tt1515091,,smart|intelligent,,,,,
4,11fhdf,Looking for movies that will make you cry.,Lately I've been looking for a lot of tearjerk...,tt0095327,,tearjerkers|tragedy|cry,,,Romance,,


The idea is to build a dataset that contains only the submission text (title and body concatenated) together with the movies, actors, genres and keywords identified in the submission. (Whether each mention is positive or negative will become relevant later.)

Build new lists of positive and negative movie titles by looking up each submission's movie IDs in `movie_titles`.

In [4]:
def _titles_for_ids(id_column, title_table):
    """Translate '|'-separated movie IDs into '|'-separated movie titles.

    Parameters
    ----------
    id_column : iterable of str
        One string per submission, holding movie IDs joined by '|'
        (an empty string when the submission has no IDs).
    title_table : pandas.DataFrame
        Lookup table with the columns 'movie_id' and 'movie_title'.

    Returns
    -------
    list of str
        One entry per submission: the titles of all rows of ``title_table``
        whose 'movie_id' occurs in that submission, joined by '|' and listed
        in the row order of ``title_table``. Empty string when nothing matches.
    """
    # Index the lookup table once instead of walking it with iterrows() for
    # every submission. The row position is stored so that the output keeps
    # the table's row order, and IDs that occur on several rows keep all
    # of their titles.
    rows_by_id = {}
    id_title_pairs = zip(title_table['movie_id'], title_table['movie_title'])
    for position, (movie_id, title) in enumerate(id_title_pairs):
        rows_by_id.setdefault(movie_id, []).append((position, title))

    joined_titles = []
    for line in id_column:
        hits = []
        # set(): an ID repeated within one submission must not repeat its titles
        for movie_id in set(line.split('|')):
            hits.extend(rows_by_id.get(movie_id, ()))
        # Positions are unique, so sorting the (position, title) pairs
        # restores table order without ever comparing titles.
        joined_titles.append('|'.join(title for _, title in sorted(hits)))
    return joined_titles


pos_movies = _titles_for_ids(submissions['positive_movie_ids'], movie_titles)
neg_movies = _titles_for_ids(submissions['negative_movie_ids'], movie_titles)

Fill the new dataset with data  

In [5]:
# Positive and negative title lists, glued together per submission
movies_joined = pd.Series(pos_movies) + '|' + pd.Series(neg_movies)

# One row per submission: the full request text plus, for every entity type,
# the positive and negative mentions merged into a single '|'-separated string.
sub_modified = pd.DataFrame({
    'text': submissions['reddit_submission_title'] + " " + submissions['reddit_submission_text'],
    'genres': submissions['positive_genres'] + '|' + submissions['negative_genres'],
    'actors': submissions['positive_actors'] + '|' + submissions['negative_actors'],
    'movies': movies_joined,
    'keywords': submissions['positive_keywords'] + '|' + submissions['negative_keywords'],
})

Because of the concatenation, some entries start or end with a stray '|', and entries for which nothing was detected consist of just '|'. Both cases are cleaned up below.

In [6]:
# Trim the separator left over from joining the positive and negative genres.
genres = []
for merged in sub_modified['genres']:
    if len(merged) == 1:
        # A one-character entry is discarded entirely
        genres.append("")
        continue
    # Drop at most one '|' from the front and one from the back
    first = 1 if merged.startswith('|') else 0
    last = len(merged) - 1 if merged.endswith('|') else len(merged)
    genres.append(merged[first:last])

In [7]:
# Trim the separator left over from joining the positive and negative actors.
actors = []
for merged in sub_modified['actors']:
    if len(merged) == 1:
        # A one-character entry is discarded entirely
        actors.append("")
        continue
    # Drop at most one '|' from the front and one from the back
    first = 1 if merged.startswith('|') else 0
    last = len(merged) - 1 if merged.endswith('|') else len(merged)
    actors.append(merged[first:last])

In [8]:
# Trim the separator left over from joining the positive and negative movies.
movies = []
for merged in sub_modified['movies']:
    if len(merged) == 1:
        # A one-character entry is discarded entirely
        movies.append("")
        continue
    # Drop at most one '|' from the front and one from the back
    first = 1 if merged.startswith('|') else 0
    last = len(merged) - 1 if merged.endswith('|') else len(merged)
    movies.append(merged[first:last])

In [9]:
# Trim the separator left over from joining the positive and negative keywords.
keywords = []
for merged in sub_modified['keywords']:
    if len(merged) == 1:
        # A one-character entry is discarded entirely
        keywords.append("")
        continue
    # Drop at most one '|' from the front and one from the back
    first = 1 if merged.startswith('|') else 0
    last = len(merged) - 1 if merged.endswith('|') else len(merged)
    keywords.append(merged[first:last])

In [33]:
#Preprocess
# All patterns are raw strings (no invalid-escape warnings) and are compiled
# once here rather than on every loop iteration.

# HTML line breaks, with or without the self-closing slash
BR_TAG = re.compile(r"<br/?>")

# Bracketed request/suggestion markers in any letter case, e.g.
#   [req]  [REQ]  [Request]  (Suggestion)  [suggestions]
#   [SUGGESTION/REQUEST]  [REQUEST/SUGGEST]  (Request/Suggestions)
#   [SUGGEST, PLEASE]
# Unbracketed prefixes such as "REQUEST:" are intentionally left untouched.
_TAG_WORD = r"(?:req(?:uest)?s?|suggest(?:ion)?s?)"
REQUEST_TAG = re.compile(
    r"[\[\{\(]\s*" + _TAG_WORD
    + r"(?:\s*[/,&]\s*(?:" + _TAG_WORD + r"|please))*\s*[\]\}\)]",
    re.IGNORECASE,
)

# URL pattern kept exactly as before.
# NOTE(review): inside the character class, `$-_` is a range running from '$'
# to '_' and therefore accepts far more characters than the three it spells out.
url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
             r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
URL_PATTERN = re.compile(url_regex)


def clean_submission_text(text):
    """Return `text` with line breaks, request markers and URLs normalised.

    Steps, in order:
      1. every <br/> or <br> becomes a single space,
      2. bracketed request/suggestion markers are removed,
      3. every URL is replaced by the placeholder "URL_HERE".
    """
    text = BR_TAG.sub(" ", text)
    text = REQUEST_TAG.sub("", text)
    return URL_PATTERN.sub("URL_HERE", text)


# Only the 'text' column is needed, so iterate over it directly
sentences = [clean_submission_text(text) for text in sub_modified['text']]

In [34]:
# Show only the first few cleaned texts; displaying the whole list floods the notebook output
sentences[:5]

['Modern noir movies? Hi. I\'m looking for \'noir\' movies made in 2000 or later.  I don\'t know if it is really noir. But you know, detective kind of stories, maybe dark, the character narrates his thoughts and so on....  Something like "Brick".',
 "I'm looking for a thought provoking, dark, suspenseful movie. I'm looking for a movie that really makes you think and takes you away from reality to where you're just thinking about how the movie is going to play out. I like the kind of movies that don't make too much sense at first, which adds to the thinking aspect of them. I loved Shutter Island because of the ending, so if you have any movies that have sort of an eerie feeling with suspense and a good plot to them would be awesome. Thanks in advance.",
 "space-mystery-thriller-type movies? I just watched all the Alien movies, Predators (underrated), Event Horizon and Prometheus this week. I'm on a space thriller kick and I'm running out of ideas...  Any suggestions?  Edit: Forgot to ad

In [30]:
# Print every raw (not yet preprocessed) submission text, one per line
for raw_text in sub_modified["text"].tolist():
    print(raw_text)

Modern noir movies? Hi. I'm looking for 'noir' movies made in 2000 or later.<br/><br/>I don't know if it is really noir. But you know, detective kind of stories, maybe dark, the character narrates his thoughts and so on....<br/><br/>Something like "Brick".
I'm looking for a thought provoking, dark, suspenseful movie. I'm looking for a movie that really makes you think and takes you away from reality to where you're just thinking about how the movie is going to play out. I like the kind of movies that don't make too much sense at first, which adds to the thinking aspect of them. I loved Shutter Island because of the ending, so if you have any movies that have sort of an eerie feeling with suspense and a good plot to them would be awesome. Thanks in advance.
space-mystery-thriller-type movies? I just watched all the Alien movies, Predators (underrated), Event Horizon and Prometheus this week. I'm on a space thriller kick and I'm running out of ideas...<br/><br/>Any suggestions?<br/><br/>

In [30]:
# Final table: the submission text followed, for each entity type, by the
# cleaned combined column and the separate positive / negative columns.
data_final = pd.DataFrame({
    'text': sub_modified['text'],

    'movies': pd.Series(movies),
    'pos_movies': pd.Series(pos_movies),
    'neg_movies': pd.Series(neg_movies),

    'genres': pd.Series(genres),
    'pos_genres': submissions['positive_genres'],
    'neg_genres': submissions['negative_genres'],

    # Singular 'actor' is the column name the later export cell reads back
    'actor': pd.Series(actors),
    'pos_actors': submissions['positive_actors'],
    'neg_actors': submissions['negative_actors'],

    'keywords': pd.Series(keywords),
    'pos_keywords': submissions['positive_keywords'],
    'neg_keywords': submissions['negative_keywords'],
})

In [None]:
# Write the full table (combined plus positive/negative columns) to CSV in the
# current working directory, without the index column
data_final.to_csv("submissions_simplified.csv",index = False)

In [None]:
#os.chdir('D:/TU_Graz/Thesis/Datasets/GENERATED NEW DATA')

#data = pd.read_csv("submissions_simplified.csv")
#data_new = pd.DataFrame()

Replace the raw text with the preprocessed version, in which redundant markers indicating that the submission is a request have been removed.

In [None]:
# Assemble the simplified table: preprocessed text plus the combined entity
# columns. It is built from the in-memory `data_final`, which holds the same
# 'movies' / 'genres' / 'actor' columns that were exported to
# submissions_simplified.csv, so this cell runs on a fresh kernel without
# depending on `data` / `data_new` being defined elsewhere.
data_new = pd.DataFrame({
    "text": pd.Series(sentences),
    "movies": data_final["movies"],
    "genres": data_final["genres"],
    "actor": data_final["actor"],
})

In [None]:
# Preview the first rows of the simplified table
data_new.head()

In [None]:
# Write the simplified table to CSV without the index column.
# NOTE: this is the same file name as the earlier data_final export, so that
# file is replaced if the working directory has not changed in between.
data_new.to_csv("submissions_simplified.csv", index = False)