In [58]:
# Imports
import pandas as pd
import os
import string

In [71]:
# Define path: every relative path used in this notebook is resolved against
# this working directory.
os.chdir('D:/TU_Graz/Thesis/Datasets/MIT_movie_corpus')

# Read data
# NOTE(review): read_csv treats the first line of each file as a header row,
# and that header is then replaced by the names assigned below. Confirm the
# files really start with a header line; otherwise the first token is lost.
train = pd.read_csv("mit_movie_train.tsv", sep='\t')
test = pd.read_csv("mit_movie_test.tsv", sep='\t')

# Both splits share the same two-column layout: tag first, word second.
train.columns = ['bio_tag', 'Words']
test.columns = ['bio_tag', 'Words']

In [72]:
def sentence_indicators(dataframe):
    """Label every token row with its sentence and drop the separator rows.

    Rows whose ``bio_tag`` equals ``'NEW_SENT'`` are sentence separators.
    Every other row receives the label ``"Sentence <k>"``, where ``k`` is the
    number of separator rows seen up to that point (so numbering starts at 0
    and a frame that begins with a separator starts at ``"Sentence 1"``).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``bio_tag`` column (and normally a ``Words`` column).
        The frame is left untouched; the result is built on a copy.

    Returns
    -------
    pandas.DataFrame
        New frame with exactly the columns ``['Sentence', 'Words', 'bio_tag']``,
        without the separator rows, and with a fresh 0..n-1 index so that
        later positional data (lists, default-indexed Series) line up with it.
    """
    # Work with a plain boolean array so nothing depends on the index labels
    # of the incoming frame.
    is_separator = (dataframe['bio_tag'] == 'NEW_SENT').values

    # Running count of separators == sentence number of each row.
    sentence_numbers = is_separator.cumsum()

    tokens = dataframe.loc[~is_separator].copy()
    tokens['Sentence'] = [
        "Sentence " + str(number) for number in sentence_numbers[~is_separator]
    ]
    tokens = tokens.reindex(columns=['Sentence', 'Words', 'bio_tag'])

    # Dropping the separator rows leaves gaps in the index; close them.
    return tokens.reset_index(drop=True)

In [73]:
# Replace each raw frame with the version returned by sentence_indicators:
# a 'Sentence' label per row and the 'NEW_SENT' separator rows removed.
train = sentence_indicators(train)
test = sentence_indicators(test)

We check every word tagged as a genre against the IMDB genres list. Since we are only interested in IMDB genres, we discard the genre tag of any word that is not in the list.

In [74]:
# Load the genre table and reduce it to the distinct, lower-cased genre names.
genres_table = pd.read_csv("../IMDB_database/genres.csv", sep=';')
distinct_genre_names = set(genres_table['genrename'])

# Hand-added spelling variants that are accepted in addition to the table.
extra_genre_spellings = ['scifi', 'science-fiction', 'science fiction', 'sci fi', 'film noir', 'noir', 'noir film']

imdb_genres = [name.lower() for name in distinct_genre_names] + extra_genre_spellings

In [75]:
def filter_genres(dataset, genres):
    """Reset the genre tag of every word that is not a known genre.

    A row is changed when its ``bio_tag`` is ``"B-GENRE"`` or ``"I-GENRE"``
    and the lower-cased text of its ``Words`` value is not contained in
    ``genres``. For those rows ``bio_tag`` becomes ``'O'``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with ``bio_tag`` and ``Words`` columns. Modified in place.
    genres : iterable of str
        Accepted genre names; compared against the lower-cased word, so the
        entries are expected to be lower case.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, after the update.

    Notes
    -----
    Matching is done one row (one word) at a time, so an entry of ``genres``
    that contains a space can never match a single-word row.
    NOTE(review): confirm whether multi-word genres were meant to be matched
    as whole spans.
    """
    known_genres = set(genres)

    has_genre_tag = dataset['bio_tag'].isin(["B-GENRE", "I-GENRE"])
    # str() keeps the original handling of non-string cells (e.g. NaN -> 'nan').
    is_known_genre = dataset['Words'].map(
        lambda word: str(word).lower() in known_genres
    ).astype(bool)

    # One label-based assignment on the frame itself. Writing through
    # dataset.loc[index]['bio_tag'] targets a temporary row object, so the
    # change may never reach the frame.
    # Plain arrays keep this positional, so duplicate index labels are safe.
    to_reset = has_genre_tag.values & ~is_known_genre.values
    dataset.loc[to_reset, 'bio_tag'] = 'O'
    return dataset

In [76]:
def filter_tags(dataset):
    """Collapse the corpus tag set onto the reduced tag set used downstream.

    Mapping applied to the ``bio_tag`` column:

    * ``B-/I-TITLE``                     -> ``B-/I-MOVIE``
    * ``B-/I-DIRECTOR``                  -> ``B-/I-ACTOR``
    * ``B-/I-PLOT``, ``CHARACTER``, ``YEAR`` -> ``B-/I-KEYWORD``
    * GENRE, ACTOR, MOVIE, KEYWORD tags and ``O`` are kept as they are
    * every other value                  -> ``O``

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``bio_tag`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, after the update.
    """
    tag_list = ["B-GENRE", "I-GENRE", "B-ACTOR", "I-ACTOR", "B-MOVIE", "I-MOVIE", "B-KEYWORD", "I-KEYWORD", "O"]
    kept_tags = set(tag_list)
    renamed_tags = {
        "B-TITLE": "B-MOVIE",
        "I-TITLE": "I-MOVIE",
        "B-DIRECTOR": "B-ACTOR",
        "I-DIRECTOR": "I-ACTOR",
        "B-PLOT": "B-KEYWORD",
        "B-CHARACTER": "B-KEYWORD",
        "B-YEAR": "B-KEYWORD",
        "I-PLOT": "I-KEYWORD",
        "I-CHARACTER": "I-KEYWORD",
        "I-YEAR": "I-KEYWORD",
    }

    def _normalise(tag):
        # Renames take precedence; anything outside the kept set becomes 'O'.
        if tag in renamed_tags:
            return renamed_tags[tag]
        return tag if tag in kept_tags else "O"

    # Assign the whole column on the frame itself. Writing through
    # dataset.loc[index]['bio_tag'] targets a temporary row object, so the
    # change may never reach the frame.
    dataset['bio_tag'] = dataset['bio_tag'].map(_normalise)
    return dataset

In [77]:
# Run the genre filter first and the tag mapping second on each split.
# Both helpers return the frame they were handed, so data_train / data_test
# refer to the same objects as train / test.
data_train = filter_tags(filter_genres(train, imdb_genres))
data_test = filter_tags(filter_genres(test, imdb_genres))

In [66]:
from nltk import pos_tag
from nltk import RegexpParser
from nltk.chunk import conlltags2tree, tree2conlltags

In [67]:
def add_tags(data):
    """Compute a POS tag and an NP-chunk tag for every row of ``data``.

    Words are tagged and chunked one sentence at a time: consecutive rows that
    share the same value in the ``Sentence`` column form one sentence. Tagging
    the whole corpus as a single token sequence would let the tagger's context
    and the chunk grammar run across sentence boundaries. If ``data`` has no
    ``Sentence`` column, all rows are treated as one sentence.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Words`` column and, optionally, a ``Sentence`` column.
        The frame is only read.

    Returns
    -------
    (list of str, list of str)
        ``tags_final`` holds the POS tag of each row and ``chunks_final`` the
        IOB chunk tag of each row, both in row order and with one entry per
        row.
    """
    # Function-scope import: the notebook's import cell only brings in pos_tag.
    from nltk import pos_tag_sents

    # The tagger expects strings; str() turns non-string cells (e.g. NaN from
    # read_csv) into text instead of letting the tagger fail on them.
    words = [str(word) for word in data.Words]
    if 'Sentence' in data.columns:
        sentence_ids = list(data['Sentence'])
    else:
        sentence_ids = [0] * len(words)

    # Split into runs of consecutive rows with the same sentence id, which
    # preserves the original row order.
    sentences = []
    previous_id = None
    for sentence_id, word in zip(sentence_ids, words):
        if not sentences or sentence_id != previous_id:
            sentences.append([])
            previous_id = sentence_id
        sentences[-1].append(word)

    pattern = 'NP: {<DT>?<JJ>*<NN>}'
    chunker = RegexpParser(pattern)

    tags_final = []
    chunks_final = []
    # pos_tag_sents tags a list of sentences in a single call.
    for tagged_sentence in pos_tag_sents(sentences):
        tags_final.extend(tag for _, tag in tagged_sentence)
        tagged_chunks = tree2conlltags(chunker.parse(tagged_sentence))
        chunks_final.extend(entry[2] for entry in tagged_chunks)

    return tags_final, chunks_final

In [68]:
# add_tags returns two lists per split: the POS tags and the chunk tags
# computed from the split's 'Words' column.
pos_train, chunk_train = add_tags(data_train)
pos_test, chunk_test = add_tags(data_test)

In [69]:
# Attach the annotations by position. Wrapping the lists in pd.Series would
# give them a 0..n-1 index and make the assignment align on index labels; the
# frames keep their original labels after the separator rows were removed, so
# label alignment would shift tags onto the wrong rows and leave NaN at the
# end. A plain list is assigned row by row and must match the frame length.
data_train['POS_tag'] = list(pos_train)
data_train['Chunk_tag'] = list(chunk_train)
data_test['POS_tag'] = list(pos_test)
data_test['Chunk_tag'] = list(chunk_test)

In [79]:
# Write both processed splits as CSV without the index column. The paths are
# relative to the working directory set by os.chdir at the top of the notebook.
data_train.to_csv("../MIT_movie_corpus_preprocessed/mit_train.csv", index = False)
data_test.to_csv("../MIT_movie_corpus_preprocessed/mit_test.csv", index = False)