In [1]:
#Imports
import os
import pandas as pd
import string
import numpy as np
from textblob import Word

In [2]:
def read_data(base_dir='D:/TU_Graz/Thesis/Datasets/Reddit_features'):
    """Load the three CSV inputs used by the tagging pipeline.

    Parameters
    ----------
    base_dir : str, optional
        Directory that holds the token-feature CSV. The other two files are
        resolved relative to it (``../Reddit_preprocessed`` and
        ``../IMDB_database``). Defaults to the path originally hard-coded here.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(data, submissions, movies_matched)``. Missing cells in
        ``submissions`` are replaced by empty strings.

    Notes
    -----
    The process working directory is changed to ``base_dir`` as a side effect;
    later relative paths (e.g. the final ``to_csv`` call of the notebook) are
    resolved against it.
    """
    os.chdir(base_dir)
    data = pd.read_csv("test_submissions_features_FINALL_NS.csv")
    # Empty annotation cells must be strings so that callers can .split("|") them.
    submissions = pd.read_csv(
        "../Reddit_preprocessed/test_submissions_simplified_NL.csv"
    ).fillna("")
    movies_matched = pd.read_csv("../IMDB_database/movies_matched.csv")

    return data, submissions, movies_matched

Some movies are written in their original language when the user is writing a request, so we need a dictionary of all the alternative names that a movie can have. That way we can check all of them when tagging.

In [3]:
def create_alt_movie_dict(movies_matched):
    """Group the alternative titles of every movie under its original title.

    Parameters
    ----------
    movies_matched : pandas.DataFrame
        Must provide the columns ``original_title`` and ``alternative``;
        one row per (original title, alternative title) pair.

    Returns
    -------
    dict
        Maps each original title to the list of its alternative titles, in
        row order. A title whose alternatives are all missing still gets an
        (empty) entry.
    """
    alt_names = dict()
    for title, alternative in zip(movies_matched["original_title"],
                                  movies_matched["alternative"]):
        names = alt_names.setdefault(title, list())
        # Missing alternatives are read by pandas as float NaN; storing them
        # would break consumers that call .split() on every alternative name.
        if isinstance(alternative, str):
            names.append(alternative)
    return alt_names

TAG GENRES, KEYWORDS, MOVIES AND ACTORS FOR ENTITY TAG AND SENTIMENT

In [4]:
# Tokens that are always tagged as a genre, whatever the annotators listed.
_GENRE_ABBREVIATIONS = ('scifi', 'fi', 'sci')
_GENRE_PLURALS = ('thrillers', 'documentaries', 'comedies', 'dramas', 'horrors', 'musicals')
# Tokens that are always tagged as a movie.
_MOVIE_ABBREVIATIONS = ('lotr', 'hp')


def _pipe_entries(cell):
    """Return the lowercased entries of a '|'-separated cell, each kept whole."""
    return {entry.lower() for entry in cell.split("|")}


def _pipe_words(cell):
    """Return the lowercased single words of a '|'-separated cell of multi-word entities."""
    return {word.lower() for entry in cell.split("|") for word in entry.split(" ")}


def _alt_title_words(cell, alt_names):
    """Return the lowercased words of every alternative title of the movies listed in `cell`."""
    words = set()
    for title in cell.split("|"):
        # Titles are looked up exactly as written (no lowercasing/stripping).
        for alt_title in alt_names.get(title, ()):
            words.update(word.lower() for word in alt_title.split(" "))
    return words


def _submission_vocab(submission, alt_names):
    """Build, once per submission, every vocabulary the token matching needs."""
    pos_genres = _pipe_entries(submission["pos_genres"])
    neg_genres = _pipe_entries(submission["neg_genres"])
    plural_pos = any(plural in pos_genres for plural in _GENRE_PLURALS)
    return {
        "genre": (_pipe_entries(submission["genres"]), pos_genres, neg_genres),
        # NOTE(review): these two flags depend only on the submission, not on
        # the token, so one plural genre among the positive (negative) genres
        # marks the genre sentiment of EVERY token of that submission.
        # Kept as in the original code; confirm whether that is intended.
        "plural_pos": plural_pos,
        "plural_neg": (not plural_pos
                       and any(plural in neg_genres for plural in _GENRE_PLURALS)),
        "actor": (_pipe_words(submission["actor"]),
                  _pipe_words(submission["pos_actor"]),
                  _pipe_words(submission["neg_actor"])),
        "keyword": (_pipe_entries(submission["keywords"]),
                    _pipe_entries(submission["pos_keywords"]),
                    _pipe_entries(submission["neg_keywords"])),
        # Alternative titles are attached to the sentiment of the movie they
        # belong to, not to both polarities at once.
        "movie": (_pipe_words(submission["movies"])
                  | _alt_title_words(submission["movies"], alt_names),
                  _pipe_words(submission["pos_movies"])
                  | _alt_title_words(submission["pos_movies"], alt_names),
                  _pipe_words(submission["neg_movies"])
                  | _alt_title_words(submission["neg_movies"], alt_names)),
    }


def _sentiment(candidates, positive, negative, force_pos=False, force_neg=False):
    """Return 'pos', 'neg' or 'O' for a token; a positive match wins over a negative one."""
    if force_pos or not positive.isdisjoint(candidates):
        return "pos"
    if force_neg or not negative.isdisjoint(candidates):
        return "neg"
    return "O"


def tag_entity(data, submissions, alt_names):
    """Tag every token as genre / actor / keyword / movie and attach a sentiment.

    Iterate through every token in the dataset and keep track of which
    submission it belongs to. Compare it against every genre entity, then
    actor entity and so on, that the crowdworkers identified for the
    corresponding submission. Note: these are in two different dataframes.

    The word can be misspelled, so a spell correction is run on it. If the
    token itself or one of its spell-check suggestions matches a word in one
    of the entity columns, the corresponding entity tag is added to it.

    The identified entities in the "submissions" df are written in the form
    ``Actor1Name Actor1LastName | Actor2Name Actor2LastName`` etc. Since the
    training data is in token form, multi-token actor and movie entities are
    split and compared token to token. Genres and keywords are compared as
    whole entries.

    The tags assigned are per entity, regardless of whether the token is the
    first word of an entity or a continuation (second, third and so on).
    Checks are also made per sentiment: whether the word is part of a
    positive or of a negative entity.

    Special case for the movies: in the 'submissions' df the names of the
    movies identified are as written on IMDB. Sometimes movies have
    alternative names in several languages, and the user could have written
    the title in a different language, so the alternative names of every
    movie are matched against the current token as well.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per token. Reads ``Sentence`` (a string whose second
        space-separated field is the integer row position of the submission
        in ``submissions``) and ``Token``.
    submissions : pandas.DataFrame
        One row per submission with the string columns ``genres``, ``actor``,
        ``keywords``, ``movies`` and their ``pos_*`` / ``neg_*`` variants,
        each a '|'-separated list.
    alt_names : dict
        Maps a movie title to the list of its alternative titles.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, modified in place, with the added columns
        ``gen_tag``, ``gen_sentiment``, ``actor_tag``, ``actor_sentiment``,
        ``keyword_tag``, ``keyword_sentiment``, ``movie_tag`` and
        ``movie_sentiment``.
    """
    tags = {name: [] for name in ("genre", "actor", "keyword", "movie")}
    sentiments = {name: [] for name in ("genre", "actor", "keyword", "movie")}

    # Spell-checking is costly and depends only on the token; the vocabularies
    # depend only on the submission. Both are computed once and reused.
    candidates_by_token = {}
    vocab_by_submission = {}

    for sentence, raw_token in zip(data["Sentence"], data["Token"]):
        current_sentence = int(sentence.split(" ")[1])
        token = raw_token.lower()

        if token not in candidates_by_token:
            # The token itself plus the closest words suggested by the spell-checker.
            suggestions = {w[0] for w in Word(token).spellcheck()}
            candidates_by_token[token] = suggestions | {token}
        candidates = candidates_by_token[token]

        if current_sentence not in vocab_by_submission:
            vocab_by_submission[current_sentence] = _submission_vocab(
                submissions.iloc[current_sentence], alt_names)
        vocab = vocab_by_submission[current_sentence]

        ### TAG GENRES ###
        genres, pos_genres, neg_genres = vocab["genre"]
        is_genre = (not genres.isdisjoint(candidates)
                    or token in _GENRE_ABBREVIATIONS
                    or token in _GENRE_PLURALS)
        tags["genre"].append("genre" if is_genre else "O")
        sentiments["genre"].append(_sentiment(candidates, pos_genres, neg_genres,
                                              vocab["plural_pos"], vocab["plural_neg"]))

        ### TAG ACTORS ###
        actors, pos_actors, neg_actors = vocab["actor"]
        tags["actor"].append("O" if actors.isdisjoint(candidates) else "actor")
        sentiments["actor"].append(_sentiment(candidates, pos_actors, neg_actors))

        ### TAG KEYWORDS ###
        keywords, pos_keywords, neg_keywords = vocab["keyword"]
        tags["keyword"].append("O" if keywords.isdisjoint(candidates) else "keyword")
        sentiments["keyword"].append(_sentiment(candidates, pos_keywords, neg_keywords))

        ### TAG MOVIES ###
        movies, pos_movies, neg_movies = vocab["movie"]
        is_movie = not movies.isdisjoint(candidates) or token in _MOVIE_ABBREVIATIONS
        tags["movie"].append("movie" if is_movie else "O")
        sentiments["movie"].append(_sentiment(candidates, pos_movies, neg_movies))

    # Plain lists are assigned positionally, so the result does not depend on
    # the index labels of `data`.
    data['gen_tag'] = tags["genre"]
    data['gen_sentiment'] = sentiments["genre"]
    data['actor_tag'] = tags["actor"]
    data['actor_sentiment'] = sentiments["actor"]
    data['keyword_tag'] = tags["keyword"]
    data['keyword_sentiment'] = sentiments["keyword"]
    data['movie_tag'] = tags["movie"]
    data['movie_sentiment'] = sentiments["movie"]

    return data

In [5]:
def tag_bio(data):
    """Turn the per-entity tags into BIO tags.

    Assuming each token in the data has a corresponding entity tag, check
    whether it is an opening tag 'B' or an inner tag 'I' of the entity. For
    instance, a movie tag that follows a movie tag is an inner tag; if it
    follows an 'O' tag, or the token is the first one of a sentence, it is a
    'B' tag.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per token, in reading order. Reads ``Token_index`` (0 marks
        the first token of a sentence) and the columns ``gen_tag``,
        ``actor_tag``, ``keyword_tag`` and ``movie_tag``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, modified in place, with the added columns
        ``bio_genre``, ``bio_actor``, ``bio_keyword`` and ``bio_movie``.
    """
    # (source tag column, entity name used in that column, output column)
    layout = (
        ("gen_tag", "genre", "bio_genre"),
        ("actor_tag", "actor", "bio_actor"),
        ("keyword_tag", "keyword", "bio_keyword"),
        ("movie_tag", "movie", "bio_movie"),
    )
    starts_sentence = (data["Token_index"] == 0).tolist()

    for tag_column, entity, bio_column in layout:
        tags = data[tag_column].tolist()
        bio = []
        # Rows are walked by position, so the index labels of `data` are irrelevant.
        for position, tag in enumerate(tags):
            if tag != entity:
                bio.append("O")
            elif position == 0 or starts_sentence[position] or tags[position - 1] == 'O':
                # An entity never continues across a sentence start.
                bio.append("B-" + entity)
            elif tags[position - 1] == entity:
                bio.append("I-" + entity)
            else:
                # Previous tag is neither 'O' nor this entity: left untagged.
                bio.append("O")
        data[bio_column] = bio

    return data

In [6]:
def tag_bio_sentiment(data):
    """Combine the BIO tag and the sentiment of every entity into one tag column.

    For each entity type the result is ``<B|I>-<label>-<pos|neg>`` when the
    token carries a B/I tag of that entity and a 'pos' or 'neg' sentiment,
    and 'O' otherwise. The genre label is shortened to ``gen``.

    Parameters
    ----------
    data : pandas.DataFrame
        Reads the ``bio_*`` columns (``bio_genre``, ``bio_actor``,
        ``bio_keyword``, ``bio_movie``) and the matching ``*_sentiment``
        columns.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, modified in place, with the added columns
        ``bio-genre-sent``, ``bio-actor-sent``, ``bio-keywords-sent`` and
        ``bio-movies-sent``.
    """
    # (BIO column, sentiment column, entity name in the BIO tag,
    #  label used in the combined tag, output column)
    layout = (
        ("bio_genre", "gen_sentiment", "genre", "gen", "bio-genre-sent"),
        ("bio_actor", "actor_sentiment", "actor", "actor", "bio-actor-sent"),
        ("bio_keyword", "keyword_sentiment", "keyword", "keyword", "bio-keywords-sent"),
        ("bio_movie", "movie_sentiment", "movie", "movie", "bio-movies-sent"),
    )

    for bio_column, sentiment_column, entity, label, out_column in layout:
        valid_bio = ("B-" + entity, "I-" + entity)
        combined = []
        for bio, sentiment in zip(data[bio_column], data[sentiment_column]):
            if bio in valid_bio and sentiment in ("pos", "neg"):
                # bio[0] is the 'B' or 'I' prefix.
                combined.append(bio[0] + "-" + label + "-" + sentiment)
            else:
                combined.append("O")
        # A list is assigned by position; no alignment on index labels.
        data[out_column] = combined

    return data

Corrections

In [7]:
def corrections(data):
    """Undo the most common wrong movie/keyword tags and repair "'s" gaps.

    Very often stopwords and various punctuation marks are part of the name
    of a movie. By matching each token in a sentence against the identified
    entities, a regular stopword/punctuation mark or other regular word will
    often get tagged. This function checks for these mistakes and fixes them:

    1. A stopword that carries a movie tag while the next token has none
       loses its movie tags. A stopword that is an isolated ``B-keyword``
       (neither neighbour is a keyword) loses its keyword tags.
    2. An untagged ``'s`` between two movie-tagged tokens becomes part of the
       movie (inheriting the sentiment of the previous token) and the
       following token is turned into an inner tag.
    3. A movie-tagged ``-`` whose previous token is not a movie loses its
       movie tags.

    The first and the last row are never modified, because every rule looks
    at both neighbours.

    Parameters
    ----------
    data : pandas.DataFrame
        Reads ``Token`` and reads/writes ``movie_tag``, ``movie_sentiment``,
        ``bio_movie``, ``bio-movies-sent``, ``keyword_tag``,
        ``keyword_sentiment``, ``bio_keyword`` and ``bio-keywords-sent``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, modified in place.
    """
    stopwords = ('you', 'or', 'is', 'it', 'plot', 'movie', 'movies', 'the', 'of', 'I', 'a',
                 '-', 'and', 'in', 'The', 'A', 'to', 'no', 'for', 'i', 'my')
    movie_columns = ('movie_tag', 'movie_sentiment', 'bio_movie', 'bio-movies-sent')
    keyword_columns = ('keyword_tag', 'keyword_sentiment', 'bio_keyword', 'bio-keywords-sent')

    # Work on positional lists so that reads and writes address the same row
    # whatever the index labels are. Later rules see the edits of earlier ones.
    tokens = data['Token'].tolist()
    cells = {name: data[name].tolist() for name in movie_columns + keyword_columns}
    movie_tag = cells['movie_tag']
    movie_sentiment = cells['movie_sentiment']
    bio_movie = cells['bio_movie']
    bio_movie_sent = cells['bio-movies-sent']
    keyword_tag = cells['keyword_tag']
    bio_keyword = cells['bio_keyword']

    inner_rows = range(1, len(tokens) - 1)

    for i in inner_rows:
        if tokens[i] in stopwords:
            if bio_movie[i] != 'O' and movie_tag[i + 1] == 'O':
                for name in movie_columns:
                    cells[name][i] = 'O'
            if bio_keyword[i] == 'B-keyword' and keyword_tag[i - 1] == 'O' and keyword_tag[i + 1] == 'O':
                for name in keyword_columns:
                    cells[name][i] = 'O'

    for i in inner_rows:
        if tokens[i] == "'s":
            if bio_movie[i] == 'O' and movie_tag[i - 1] != 'O' and movie_tag[i + 1] != 'O':
                movie_tag[i] = 'movie'
                movie_sentiment[i] = movie_sentiment[i - 1]
                bio_movie[i] = 'I-movie'
                # The combined tag of this token AND of the next one follows
                # the sentiment of the previous token.
                if movie_sentiment[i - 1] == 'pos':
                    bio_movie_sent[i] = 'I-movie-pos'
                    bio_movie_sent[i + 1] = 'I-movie-pos'
                elif movie_sentiment[i - 1] == 'neg':
                    bio_movie_sent[i] = 'I-movie-neg'
                    bio_movie_sent[i + 1] = 'I-movie-neg'
                bio_movie[i + 1] = 'I-movie'
        elif tokens[i] == '-':
            # NOTE(review): if the next token is an 'I-movie', it keeps that
            # tag although its opening 'B-movie' is removed here - confirm
            # whether it should be promoted to 'B-movie'.
            if bio_movie[i] != 'O' and movie_tag[i - 1] == 'O':
                for name in movie_columns:
                    cells[name][i] = 'O'

    for name, values in cells.items():
        data[name] = values
    return data

In [8]:
def _first_valid(values, accepted_per_column):
    """Return the first value that is valid for its own column, or 'O' if none is."""
    for value, accepted in zip(values, accepted_per_column):
        if value in accepted:
            return value
    return 'O'


def merge(data):
    """Merge the per-entity columns into single columns covering all entities.

    After merging, one token can be a:
    'B-movie-pos'
    'I-movie-pos'
    'B-movie-neg'
    'I-movie-neg'
    ...
    'B-actor-pos'
    ...
    'O'

    When a token is tagged for several entity types, the first one in the
    order genre, actor, keyword, movie wins.

    Parameters
    ----------
    data : pandas.DataFrame
        Reads the ``*_tag``, ``*_sentiment``, ``bio_*`` and ``bio-*-sent``
        columns of the four entity types.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, modified in place, with the added columns
        ``entity_tag`` ('entity' or 'O'), ``sentiment`` ('pos', 'neg' or
        'O'), ``bio_indic`` (BIO tag) and ``BIO_sent`` (BIO tag with
        sentiment).
    """
    # Column order below is the priority order: genre, actor, keyword, movie.
    tag_columns = ("gen_tag", "actor_tag", "keyword_tag", "movie_tag")
    sentiment_columns = ("gen_sentiment", "actor_sentiment",
                         "keyword_sentiment", "movie_sentiment")
    bio_columns = ("bio_genre", "bio_actor", "bio_keyword", "bio_movie")
    bio_sent_columns = ("bio-genre-sent", "bio-actor-sent",
                        "bio-keywords-sent", "bio-movies-sent")

    polarity = {'pos', 'neg'}
    valid_sentiment = [polarity] * 4
    valid_bio = [{"B-" + name, "I-" + name}
                 for name in ("genre", "actor", "keyword", "movie")]
    valid_bio_sent = [{prefix + "-" + label + "-" + sentiment
                       for prefix in ("B", "I") for sentiment in ("pos", "neg")}
                      for label in ("gen", "actor", "keyword", "movie")]

    def rows(columns):
        # Row tuples restricted to `columns`, in positional order.
        return zip(*[data[column].tolist() for column in columns])

    entity_tag = ['entity' if any(tag != 'O' for tag in row) else 'O'
                  for row in rows(tag_columns)]
    sentiment_tag = [_first_valid(row, valid_sentiment) for row in rows(sentiment_columns)]
    bio_tag = [_first_valid(row, valid_bio) for row in rows(bio_columns)]
    bio_tag_sentiment = [_first_valid(row, valid_bio_sent) for row in rows(bio_sent_columns)]

    # Lists are assigned by position; no alignment on index labels.
    data["entity_tag"] = entity_tag
    data["sentiment"] = sentiment_tag
    data["bio_indic"] = bio_tag
    data["BIO_sent"] = bio_tag_sentiment
    return data

In [11]:
# Read data
# NOTE(review): input_dir / output_dir are not referenced by any cell visible
# in this notebook; read_data() hard-codes its own directory - confirm whether
# they are still needed.
input_dir = 'D:/TU_Graz/Thesis/Datasets/Reddit_features/'
output_dir = input_dir

# 'data' contains the tokenized submissions along with their features, so that we can add the 
# tags to them to complete the table. To tag every token, we match it against the columns of
# the 'submissions' file in which the annotators identified the entities.
# 'movies_matched' pairs each original movie title with one alternative title per row.
data, submissions, movies_matched = read_data()

# Create the dict of alternative movie names (original title -> list of alternatives)
alt_names = create_alt_movie_dict(movies_matched)

In [12]:
# Create the first round of tags for each entity: every entity type gets one tag column
# (entity name or 'O') and one sentiment column ('pos', 'neg' or 'O').
# 'data' is rebound to the returned frame, so re-running this cell re-tags the same frame.
data = tag_entity(data, submissions, alt_names)

In [14]:
# Sanity check: show the last rows, including the tag and sentiment columns added above.
data.tail()

Unnamed: 0,Sent_id,Sentence,Token_index,Token,POS_tag,POS_universal,NER_tag,NER_iob,lemma,norm,...,norm_freq,abs_freq,gen_tag,gen_sentiment,actor_tag,actor_sentiment,keyword_tag,keyword_sentiment,movie_tag,movie_sentiment
25965,5v38hx,Sentence 294,25,Prisoners,1.579455e+19,96,0,2,1.345215e+19,4.935277e+18,...,0.000154,4,O,O,O,O,O,O,O,O
25966,5v38hx,Sentence 294,26,",",2.593209e+18,97,0,2,2.593209e+18,2.593209e+18,...,0.041163,1069,O,O,O,O,O,O,O,O
25967,5v38hx,Sentence 294,27,Sicario,1.579455e+19,96,380,3,1.741044e+19,5.471049e+18,...,7.7e-05,2,O,O,O,O,O,O,O,O
25968,5v38hx,Sentence 294,28,etc,1.540191e+19,101,0,2,5.77526e+18,5.77526e+18,...,0.00181,47,O,O,O,O,O,O,O,O
25969,5v38hx,Sentence 294,29,.,1.264607e+19,97,0,2,1.264607e+19,1.264607e+19,...,0.031613,821,O,O,O,O,O,O,O,O


In [12]:


# Create the second round of tags for each entity (BIO tags derived from the entity tags)
data = tag_bio(data)

# Extend the BIO tags with sentiment tags
data = tag_bio_sentiment(data)

# Fix the most common tags that were likely tagged wrong
# (must run after the BIO and BIO-sentiment columns exist, and before merging)
data = corrections(data)

# Merge the separate entity columns into columns that contain all tags
data = merge(data)

# And save them. The path is relative, so the file is written to the current
# working directory, which read_data() changed with os.chdir().
data.to_csv("tmp_test.csv", index = False)