# Mutant Generation

There are 2 proposed approaches:
* Mutant Generation using POS TAG and NER
* Mutant Generation using Coreference Resolution

### Mutant Generation using POS TAG and NER

In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

import spacy
# import en_core_web_sm
# nlp = en_core_web_sm.load()
# import xx_ent_wiki_sm
# nlp = xx_ent_wiki_sm.load()
import en_core_web_lg
nlp = en_core_web_lg.load()

In [2]:
# !python -m spacy download en_core_web_sm
# !python -m spacy download xx_ent_wiki_sm
# !python -m spacy download en_core_web_lg

In [3]:
# Load the masculine/feminine word pairs and index them in both directions.
replacement = pd.read_csv("../data/asset/masculine-feminine-cleaned.txt")

m = {}      # feminine word  -> masculine word
f = {}      # masculine word -> feminine word
mlist = []  # masculine words, in file order
flist = []  # feminine words, in file order
for index, row in replacement.iterrows():
    masculine_word, feminine_word = row['masculine'], row['feminine']
    m[feminine_word] = masculine_word
    f[masculine_word] = feminine_word
    mlist.append(masculine_word)
    flist.append(feminine_word)

In [4]:
# Small pools of given names used when a detected person name is swapped.
mnames = ["James", "John", "Robert", "Michael", "William", "Richard"]
fnames = ["Jessica", "Patricia", "Jennifer", "Linda", "Elizabeth"]

# masculine pronouns
mp = ["He", "He's", "he", "he's", "him", "his", "himself"]

# feminine pronouns, position-aligned with `mp` (entry i is the counterpart of mp[i])
fp = ["She", "She's", "she", "she's", "her", "her", "herself"]

# fc: masculine pronoun -> feminine counterpart
# mc: feminine pronoun  -> masculine counterpart
# "her" is the counterpart of both "him" and "his"; because the pairs are
# inserted in order, mc["her"] ends up as "his".
mc = {}
fc = {}

for _m, _f in zip(mp, fp) :
    fc[_m] = _f
    mc[_f] = _m

In [5]:
text = "gingerAZ. Bronson's character plans to live anonymously but when he finds his fiance was a victim of a protection racket one by one the criminals are hunted down in the style of Bronson's character"
# text = "Good songs....but good album? debateable. She has some rockin songs on here. All her singles kickass and the last song on the album with M.I.A. (i friggin love her) is fire. but then, i start listening to the other songs, ready for a song on the album that is just as good as her singles and i can't find another. All of missy's singles are crazy, but then ui listen to the album, and its like a let down. any song that she sings in makes my stomach churn and some have just annoying beats and repititious phrases. If you really liked Missy's album 'This Is Not A Test', then get this album. but if you are lovin her singles and want some more, just download the singles, don't waste your money on the rest of this album"
# text = "A landlord book that's not based on today's reality.. Although Shemin brings up many good ideas, I found that most of his ideas are way to cheezy for me (or Southern california.) He discusses buying places for $8,000-$20,000 (I wish) and believes that charging more for rent will ultimately get you higher quality tenents. I completely disagree, especially in down markets.A decent book, overall, but I'd suggest a Nolo book which has more real legal information as well as solid do's and don'ts for a landlord."
# text = "He discusses buying places for $8,000-$20,000 (I wish) and believes that charging more for rent will ultimately get you higher quality tenents."
# text = ""

In [6]:
# nltk.download('averaged_perceptron_tagger')

In [7]:
def pos_tagging(sent):
    """Tokenize `sent` with NLTK and return `nltk.pos_tag` applied to the tokens."""
    return nltk.pos_tag(nltk.word_tokenize(sent))

def get_pronoun(text) :
    """Collect the pronouns of `text`, grouped by their POS tag.

    Returns:
        A pair of sets ``(spr, pr)``: the words tagged ``PRP`` and the words
        tagged ``PRP$`` by `pos_tagging`. On the sample text of this notebook
        these are ``{'he'}`` and ``{'his'}`` respectively.
    """
    tagged_words = pos_tagging(text)

    # words tagged PRP
    spr = {word for word, tag in tagged_words if tag == "PRP"}

    # words tagged PRP$
    pr = {word for word, tag in tagged_words if tag == "PRP$"}

    return spr, pr

spr, npr = get_pronoun(text)
print(spr)
print(npr)

{'he'}
{'his'}


In [8]:
def get_person(text) :
    """Return the set of entity texts that the `nlp` pipeline labels as PERSON in `text`."""
    return {ent.text for ent in nlp(text).ents if ent.label_ == "PERSON"}
    
person = get_person(text)
person

{'Bronson', 'gingerAZ'}

In [9]:
# Gender codes returned by get_gender.
MALE = "m"
FEMALE = "f"
UNIDENTIFIED = "u"

def get_gender(text) :
    """Guess the gender referred to in `text` from its PRP-tagged pronouns.

    Returns:
        MALE when "he"/"He" occurs and "she"/"She" does not, FEMALE in the
        opposite case, and UNIDENTIFIED when both or neither occur.
    """
    spr, _ = get_pronoun(text)
    male = bool({"he", "He"} & spr)
    female = bool({"she", "She"} & spr)

    if male and not female :
        return MALE
    if female and not male :
        return FEMALE
    return UNIDENTIFIED

get_gender(text)

'm'

In [10]:
import tokenizer

def replace_pronoun(text) :
    """Swap every gendered pronoun token of `text` for its opposite-gender counterpart.

    Tokens whose text is in `mp` are mapped through `fc`; otherwise tokens
    whose text is in `fp` are mapped through `mc`; all other tokens are kept.

    Returns:
        ``(is_replaced, new_text)`` where `is_replaced` tells whether at least
        one token was swapped and `new_text` is the detokenized result.
    """
    swapped_tokens = []
    is_replaced = False

    for token in tokenizer.tokenize(text) :
        word = token.txt
        if word in mp :
            substitute = fc[word]
        elif word in fp :
            substitute = mc[word]
        else :
            # not a gendered pronoun: keep the token untouched
            swapped_tokens.append(token)
            continue

        swapped_tokens.append(token._replace(txt = substitute))
        is_replaced = True

    return is_replaced, tokenizer.detokenize(swapped_tokens, normalize=True)

print(text)
_, _text = replace_pronoun(text)
print(_text)

gingerAZ. Bronson's character plans to live anonymously but when he finds his fiance was a victim of a protection racket one by one the criminals are hunted down in the style of Bronson's character
gingerAZ. Bronson's character plans to live anonymously but when she finds her fiance was a victim of a protection racket one by one the criminals are hunted down in the style of Bronson's character


In [11]:
def is_set_empty(s) :
    """Return True when `s` is falsy (e.g. an empty set), False otherwise."""
    return not s

def generate_mutant_from_text(text) :
    """Generate gender-swapped mutants of `text` using pronouns and person names.

    The pronouns are swapped first with `replace_pronoun`. Depending on how
    many PERSON entities `get_person` finds in the original text:

    * none      -> the pronoun-swapped text is the only mutant;
    * exactly 1 -> the name is replaced by every name of the opposite gender
                   (`fnames` for a male text, `mnames` for a female text);
                   when the gender is unidentified only the pronoun-swapped
                   text is returned;
    * several   -> no mutant is produced.

    Returns:
        A list of mutant strings; empty when no pronoun was replaced.
    """
    is_replaced, text_with_replaced_pronoun = replace_pronoun(text)
    if not is_replaced :
        return []

    person = get_person(text)

    if is_set_empty(person) :
        return [text_with_replaced_pronoun]

    if len(person) != 1 :
        # more than one name: it is unknown which one the pronouns refer to
        return []

    # get_gender expects the raw text; it extracts the pronouns itself.
    gender = get_gender(text)

    if gender == MALE :
        replacement_names = fnames
    elif gender == FEMALE :
        replacement_names = mnames
    else :
        return [text_with_replaced_pronoun]

    # the single detected name
    prevname = next(iter(person))

    return [text_with_replaced_pronoun.replace(prevname, name) for name in replacement_names]

print(text)
generate_mutant_from_text(text)

gingerAZ. Bronson's character plans to live anonymously but when he finds his fiance was a victim of a protection racket one by one the criminals are hunted down in the style of Bronson's character


[]

### Mutant Generation using Coreference Resolution

In [12]:
# !pip install neuralcoref
import neuralcoref
coref = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')

In [13]:
# text = "gingerAZ. Bronson's character plans to live anonymously but when he finds his fiance was a victim of a protection racket one by one the criminals are hunted down in the style of Bronson's character"
# text = 'My Sister has a dog. She loves him. My Brother has a car. He loves it'
# text = 'My Brother has a dog. He loves the dog'
# text = "Anne is my wife. She is clever"
# text = "Man Kills Wife, Flies To Kolkata To Kill Her Mother Before Shooting Self"
# text = "Berg, closely buttoned up in his new uniform, sat beside his wife explaining to her that one always could and should be acquainted with people above one, because only then does one get satisfaction from acquaintances."
text = "My wife is angry on me. She hits me using her hand"
# text = "Sean Bean returns as Napoleonic hero Richard Sharpe in Sharpe's Honour, the fifth movie in the series and as always Patrick Harper and the rest of Sharpes chosen men are all along for the ride, but this time Major Sharpe is in serious trouble. Under the influence of Sharpe's sworn enemy Major Ducos, a mysterious lady by the name of La Marquesa has accused Sharpe of rape. Her husband arrives at Sharpe's camp to challenge his wife's attacker to a dual. The dual is discovered and stopped by the authorities, and as a result Sharpe becomes the prime suspect when his opponent is murdered in the middle of the night. As no-one in the British Army other than Wellington and Major Nairn consider Sharpe anything but a rough commoner with little or no honour, he his given a shambolic trial and is sentenced to death by hanging, and Harper and the chosen men have no choice but to look on as their beloved commander walks slowly to the gallows. However, convinced of his innocence Wellington and Nairn hang another convicted prisoner in Sharpe's stead and release him and his chosen men to find the real killer and La Marquesa herself, to not only prove his innocence but to find out her reasons for framing him in the first place. Daragh O'Malley, Micheal Byrne and Hugh Fraser co-star with brilliant performances by Alice Krige as La Marquesa and Féodor Atkine as the villainous Major Ducos, in what is another exciting, swashbuckling instalment through Sharpe's eventful journey through the Napoleonic Wars."
text

'My wife is angry on me. She hits me using her hand'

In [14]:
# You're done. You can now use NeuralCoref the same way you usually manipulate a spaCy document and its annotations.
doc = nlp(text)

doc._.has_coref
doc._.coref_clusters

[My wife: [My wife, me, She, her]]

#### Learning the Result from Coreference Library

In [15]:
print(text)
print(doc._.coref_resolved)
print(doc._.coref_clusters)
print(doc._.coref_clusters[-1].main)
print(doc._.coref_clusters[-1].mentions)
print(doc._.coref_clusters[-1].mentions[-1])
print(doc._.coref_clusters[-1].mentions[-1].start)
print(doc._.coref_clusters[-1].mentions[-1].start_char)
print(doc._.coref_clusters[-1].mentions[-1].end)
print(doc._.coref_clusters[-1].mentions[-1].end_char)
print(doc._.coref_clusters[-1].mentions[-1]._.coref_cluster.main)

My wife is angry on me. She hits me using her hand
My wife is angry on My wife. My wife hits me using My wife hand
[My wife: [My wife, me, She, her]]
My wife
[My wife, me, She, her]
her
11
42
12
45
My wife


#### Create a Class for Better Implementation and Maintenance

In [16]:
# contain a word and its location inside the sentence
# The location is indicated by start char and end char
class Token:
    """A word together with the character span it occupies in the text."""

    # class-level defaults; every instance overwrites them in __init__
    word = ""
    start = -1
    end = -1

    def __init__(self, word, start, end) :
        self.word, self.start, self.end = word, start, end

    def __str__(self) :
        return self.word

    def __repr__(self) :
        # same rendering as str() so lists of tokens print as plain words
        return self.word

    def get_word(self):
        """Return the word."""
        return self.word

    def get_start(self):
        """Return the start character offset."""
        return self.start

    def get_end(self):
        """Return the end character offset."""
        return self.end


# Reference is a class to save Reference data
# e.g. La Marquesa herself : [La Marquesa herself, her]
class Ref:
    """One coreference cluster: a main name plus all mentions that refer to it.

    `reference` holds the mentions as Token objects (word and character
    offsets); `reference_list` holds the same mentions as plain strings.
    """

    # class-level defaults; __init__ replaces them with per-instance values
    name = ""
    reference = []
    reference_list = []

    def __init__(self, name, reference):
        self.name = str(name)
        mentions = list(reference)
        self.reference_list = [mention.text for mention in mentions]
        self.reference = [
            Token(mention.text, mention.start_char, mention.end_char)
            for mention in mentions
        ]

    def __str__(self) :
        return ": ".join([self.name, str(self.reference_list)])

    def __repr__(self) :
        return self.__str__()

    def get_name(self):
        """Return the main name of the cluster."""
        return self.name

    def get_reference(self):
        """Return the mentions as a list of Token objects."""
        return self.reference

    def is_having_male_subject(self):
        """Return True when "He" or "he" is one of the mentions."""
        return any(pronoun in self.reference_list for pronoun in ("He", "he"))

    def is_having_female_subject(self):
        """Return True when "She" or "she" is one of the mentions."""
        return any(pronoun in self.reference_list for pronoun in ("She", "she"))

In [17]:
doc._.coref_clusters[-1].mentions[0]

My wife

In [18]:
i = 0
r = Ref(doc._.coref_clusters[i].main, doc._.coref_clusters[i].mentions)
r

My wife: ['My wife', 'me', 'She', 'her']

In [19]:
r.is_having_male_subject()

False

In [20]:
r.is_having_female_subject()

True

In [21]:
# names from EEC paper
mnames = ["Alonzo", "Adam", "Alphonse", "Alan", "Darnell", "Andrew", "Jamel", "Frank", "Jerome", "Harry", "Lamar", "Jack", "Leroy", "Josh", "Malik", "Justin", "Terrence", "Roger", "Torrance", "Ryan"]
fnames = ["Ebony", "Amanda", "Jasmine", "Betsy", "Lakisha", "Courtney", "Latisha", "Ellen", "Latoya", "Heather", "Nichelle", "Katie", "Shaniqua", "Kristin", "Shereen", "Melanie", "Tanisha", "Nancy", "Tia", "Stephanie"]

# small name lists for debugging
# mnames = ["Alonzo", "Adam"]
# fnames = ["Ebony", "Amanda"]

# masculine pronouns
mp = ["He", "he", "him", "his", "himself"]

# feminine pronouns, position-aligned with `mp`
fp = ["She", "she","her", "her", "herself"]

# fc flips a masculine pronoun to its feminine counterpart, mc does the reverse.
# "her" pairs with both "him" and "his"; the later pair wins, so mc["her"] == "his".
fc = dict(zip(mp, fp))
mc = dict(zip(fp, mp))

In [22]:
class Coref:
    """Coreference analysis of one text, used to build gender-swapped mutants.

    The text is run through the `nlp` pipeline and every coreference cluster
    is wrapped in a Ref. A text is usable as a template when exactly one
    gendered subject is detected (see `check_one_subject`). For such a text
    the original string is cut into the pieces that surround the subject's
    mentions (`chunk`), so that mutants can be produced by interleaving those
    pieces with replacement names and pronouns.
    """

    # Class-level defaults; __init__ assigns every one of them per instance.
    original = ""
    resolved = ""
    refs = []
    one_subject = False
    is_male = False
    subject_reference = None
    chunk = []

    def __init__(self, text):
        self.original = str(text)
        doc = nlp(text)
        self.resolved = str(doc._.coref_resolved)
        # `or []` keeps construction working should the pipeline report
        # "no clusters" as None instead of an empty list.
        self.refs = [Ref(cluster.main, cluster.mentions)
                     for cluster in (doc._.coref_clusters or [])]

        self.one_subject, self.subject_reference, self.is_male = self.check_one_subject()

        # Always a per-instance list, never the shared class-level default.
        self.chunk = self.generate_chunk_from_coref() if self.one_subject else []

    def get_original(self):
        """Return the analysed text."""
        return self.original

    def get_resolved(self):
        """Return the text with the coreferences resolved by the pipeline."""
        return self.resolved

    def get_refs(self):
        """Return the coreference clusters as a list of Ref objects."""
        return self.refs

    def is_one_subject(self) :
        """Return True when exactly one gendered subject was detected."""
        return self.one_subject

    def check_one_subject(self) :
        """Count the gendered subjects among the clusters.

        A cluster counts once when it mentions "He"/"he" and once when it
        mentions "She"/"she".

        Returns:
            ``(True, ref, is_male)`` when the total count is exactly one,
            ``(False, None, None)`` otherwise.
        """
        count = 0
        subject_reference = None
        is_male = None
        for ref in self.refs :
            if ref.is_having_male_subject() :
                count += 1
                subject_reference = ref
                is_male = True

            if ref.is_having_female_subject() :
                count += 1
                subject_reference = ref
                is_male = False

        if count == 1 :
            return True, subject_reference, is_male
        return False, None, None

    def get_subject_reference(self):
        """Return the Ref of the single subject, or None when there is none."""
        return self.subject_reference

    def generate_chunk_from_coref(self) :
        """Cut the original text into the pieces around the subject's mentions.

        For n mentions the result has n + 1 entries: the text before the first
        mention, the text between consecutive mentions, and the text after the
        last one. An empty piece before or between mentions is stored as a
        single space; the trailing piece is stored as is.
        """
        mentions = self.subject_reference.get_reference()
        chunk = []
        lb = 0 # lower bound of the next piece
        for mention in mentions :
            # Slice this instance's own text, not a module-level variable.
            piece = self.original[lb:mention.start]
            chunk.append(piece if piece != "" else " ")
            lb = mention.end

        if mentions :
            chunk.append(self.original[lb:])

        return chunk

    @staticmethod
    def _swap_pronoun(word, table):
        """Return the replacement of `word` in `table`, or None if it has none.

        An exact match is tried first. Otherwise the lower-cased word is
        looked up, and the replacement gets a capital first letter when `word`
        starts with one, so "Her" is handled through the entry for "her".
        """
        if word in table :
            return table[word]
        lowered = word.lower()
        if lowered in table :
            replacement = table[lowered]
            if word[:1].isupper() :
                return replacement[:1].upper() + replacement[1:]
            return replacement
        return None

    def _build_mutants(self, target_gender):
        """Build one mutant per name of `target_gender` ("male" or "female")."""
        if not self.one_subject :
            # no single subject, hence no template to fill in
            return []

        target_is_male = target_gender == "male"
        names = mnames if target_is_male else fnames

        # Pronoun table: identity when the gender is kept, flip otherwise.
        if self.is_male and target_is_male :
            table = {pronoun: pronoun for pronoun in mp}
        elif self.is_male :
            table = fc
        elif target_is_male :
            table = mc
        else :
            table = {pronoun: pronoun for pronoun in fp}

        mentions = self.subject_reference.get_reference()
        mutant = []
        for name in names :
            parts = [self.chunk[0]]
            for i, mention in enumerate(mentions, start=1) :
                pronoun = self._swap_pronoun(mention.word, table)
                # a mention that is not a known pronoun is replaced by the name
                parts.append(name if pronoun is None else pronoun)
                parts.append(self.chunk[i])
            mutant.append({"gender": target_gender, "text": "".join(parts)})
        return mutant

    def generate_male_mutant_text(self):
        """Return one ``{"gender": "male", "text": ...}`` dict per name in `mnames`."""
        return self._build_mutants("male")

    def generate_female_mutant_text(self):
        """Return one ``{"gender": "female", "text": ...}`` dict per name in `fnames`."""
        return self._build_mutants("female")

    def generate_mutant_text(self):
        """Return the male mutants followed by the female mutants."""
        return self.generate_male_mutant_text() + self.generate_female_mutant_text()

In [23]:
c = Coref(text)

In [24]:
c.get_resolved()

'My wife is angry on My wife. My wife hits me using My wife hand'

In [25]:
for r in c.get_refs() :
    print(r.get_name())
    print(r.get_reference())

My wife
[My wife, me, She, her]


In [26]:
c.is_one_subject()

True

In [27]:
if c.is_one_subject() :
    print(c.get_subject_reference())
    print(c.generate_mutant_text())

My wife: ['My wife', 'me', 'She', 'her']
[{'gender': 'male', 'text': ' Alonzo is angry on Alonzo. He hits me using his hand'}, {'gender': 'male', 'text': ' Adam is angry on Adam. He hits me using his hand'}, {'gender': 'male', 'text': ' Alphonse is angry on Alphonse. He hits me using his hand'}, {'gender': 'male', 'text': ' Alan is angry on Alan. He hits me using his hand'}, {'gender': 'male', 'text': ' Darnell is angry on Darnell. He hits me using his hand'}, {'gender': 'male', 'text': ' Andrew is angry on Andrew. He hits me using his hand'}, {'gender': 'male', 'text': ' Jamel is angry on Jamel. He hits me using his hand'}, {'gender': 'male', 'text': ' Frank is angry on Frank. He hits me using his hand'}, {'gender': 'male', 'text': ' Jerome is angry on Jerome. He hits me using his hand'}, {'gender': 'male', 'text': ' Harry is angry on Harry. He hits me using his hand'}, {'gender': 'male', 'text': ' Lamar is angry on Lamar. He hits me using his hand'}, {'gender': 'male', 'text': ' Jack

In [28]:
print(text)

My wife is angry on me. She hits me using her hand


## Find Template from IMDB Data using Coreference Resolution Approach

In [29]:
df = pd.read_csv("../data/imdb/test.csv", sep="\t", header=None, names=["label", "text"])
df = df[:30]
df.head()

Unnamed: 0,label,text
0,1,The Mascot is Ladislaw Starewicz's masterpiece...
1,1,this is one of the finest movies i have ever s...
2,1,"I have only see three episodes of Hack, starri..."
3,1,"Deepa Mehta's ""Fire"" is groundbreaking, bold, ..."
4,0,Anyone who sees this film will notice that the...


### Filter the IMDB texts that can be used as templates

In [30]:
import time
start = time.time()

# Parallel lists with one entry per review that has exactly one gendered subject.
original_arr, coref_arr, label_arr = [], [], []

for index, row in df.iterrows():
    label = row['label']
    text = row['text']
    c = Coref(text)
    if not c.is_one_subject() :
        continue
    original_arr.append(text)
    coref_arr.append(c)
    label_arr.append(label)


end = time.time()
print("Execution Time: ", end-start)

Execution Time:  3.872675895690918


In [31]:
# create dataframe
# male template and female template
mt = pd.DataFrame({"original" : original_arr, "coref" : coref_arr, "label" : label_arr})
ft = pd.DataFrame({"original" : original_arr, "coref" : coref_arr, "label" : label_arr})

In [32]:
def get_first_male_mutant_text(c) :
    """Return the "text" field of the first mutant from `c.generate_male_mutant_text()`."""
    first_mutant = c.generate_male_mutant_text()[0]
    return first_mutant["text"]
def get_first_female_mutant_text(c) :
    """Return the "text" field of the first mutant from `c.generate_female_mutant_text()`."""
    first_mutant = c.generate_female_mutant_text()[0]
    return first_mutant["text"]

In [33]:
mt["mutant"] = mt["coref"].apply(get_first_male_mutant_text)
ft["mutant"] = ft["coref"].apply(get_first_female_mutant_text)

In [34]:
mt

Unnamed: 0,original,coref,label,mutant
0,"I have only see three episodes of Hack, starri...",<__main__.Coref object at 0x7f78f8d69358>,1,"I have only see three episodes of Hack, starri..."
1,In the groovy mid 70's a scruffy bunch of bras...,<__main__.Coref object at 0x7f79b4284860>,1,In the groovy mid 70's a scruffy bunch of bras...
2,This must have been one of Chaplin's most ambi...,<__main__.Coref object at 0x7f78f8d62b00>,1,This must have been one of Alonzo's most ambit...


### Grammar Check from Generated Mutant Text

In [35]:
# importing the requests library 
import requests
import urllib
import json

def get_number_of_grammar_error_from_language_tool(text, api_endpoint="http://10.4.4.55:8010//api/v2/check"):
    """Return how many issues a LanguageTool server reports for `text`.

    The API is built from https://github.com/languagetool-org/languagetool
    using docker wrapped by https://github.com/silvio/docker-languagetool

    Args:
        text: the text to check.
        api_endpoint: URL of the LanguageTool check endpoint. The default is
            the address this notebook was developed against.

    Returns:
        The length of the "matches" list of the JSON response.

    Raises:
        requests.HTTPError: when the server answers with an error status.
    """
    # `requests` form-encodes the values of `data` itself, so the text must be
    # passed raw; quoting it beforehand would make the server check the
    # percent-encoded string instead of the sentence.
    data = {'text': text,
            'language': 'en-US',
            'enabledOnly':'false'}

    # sending post request and saving response as response object
    response = requests.post(url = api_endpoint, data = data)

    # fail loudly on an HTTP error instead of trying to parse an error page
    response.raise_for_status()

    return len(response.json()["matches"])


text = "I eat chicken"
get_number_of_grammar_error_from_language_tool(text)

0

In [36]:
mt

Unnamed: 0,original,coref,label,mutant
0,"I have only see three episodes of Hack, starri...",<__main__.Coref object at 0x7f78f8d69358>,1,"I have only see three episodes of Hack, starri..."
1,In the groovy mid 70's a scruffy bunch of bras...,<__main__.Coref object at 0x7f79b4284860>,1,In the groovy mid 70's a scruffy bunch of bras...
2,This must have been one of Chaplin's most ambi...,<__main__.Coref object at 0x7f78f8d62b00>,1,This must have been one of Alonzo's most ambit...


In [37]:
ft

Unnamed: 0,original,coref,label,mutant
0,"I have only see three episodes of Hack, starri...",<__main__.Coref object at 0x7f78f8d69358>,1,"I have only see three episodes of Hack, starri..."
1,In the groovy mid 70's a scruffy bunch of bras...,<__main__.Coref object at 0x7f79b4284860>,1,In the groovy mid 70's a scruffy bunch of bras...
2,This must have been one of Chaplin's most ambi...,<__main__.Coref object at 0x7f78f8d62b00>,1,This must have been one of Ebony's most ambiti...


In [38]:
start = time.time()

# Per template: grammar errors of the mutant minus grammar errors of the original.
merr = []
ferr = []

i = 0
mutant_triples = zip(mt["mutant"].values, ft["mutant"].values, mt["original"].values)
for i, (male_mutant, female_mutant, original) in enumerate(mutant_triples, start=1) :
    err_m = get_number_of_grammar_error_from_language_tool(male_mutant)
    err_f = get_number_of_grammar_error_from_language_tool(female_mutant)
    err_o = get_number_of_grammar_error_from_language_tool(original)
    merr.append(err_m - err_o)
    ferr.append(err_f - err_o)

    # progress log only
    if i % 500 == 0 :
        print(i)

end = time.time()
print("Execution Time: ", end-start)

Execution Time:  3.339677333831787


In [39]:
len(merr)

3

In [40]:
mt["err"] = merr
ft["err"] = ferr

In [41]:
mt

Unnamed: 0,original,coref,label,mutant,err
0,"I have only see three episodes of Hack, starri...",<__main__.Coref object at 0x7f78f8d69358>,1,"I have only see three episodes of Hack, starri...",0
1,In the groovy mid 70's a scruffy bunch of bras...,<__main__.Coref object at 0x7f79b4284860>,1,In the groovy mid 70's a scruffy bunch of bras...,0
2,This must have been one of Chaplin's most ambi...,<__main__.Coref object at 0x7f78f8d62b00>,1,This must have been one of Alonzo's most ambit...,0


### The Error calculation

In [42]:
print("Number of mutant grammar error from male: ", len(mt[mt["err"] > 0]))
print("Number of mutant grammar error from female", len(ft[ft["err"] > 0]))

Number of mutant grammar error from male:  0
Number of mutant grammar error from female 0


In [43]:
mt[mt["err"] > 0]

Unnamed: 0,original,coref,label,mutant,err


In [44]:
ft[ft["err"] > 0]

Unnamed: 0,original,coref,label,mutant,err


#### Analysing the Error

In [45]:
# Templates whose male mutant has more grammar errors than the original.
male_error = mt[mt["err"] > 0]
idx = 15  # 1-based position of the failing template to inspect
# Only inspect when that position actually exists in `male_error`.
if len(male_error) >= idx :
    # .iloc returns the row as a Series, which can be indexed by column name
    error_row = male_error.iloc[idx - 1]
    text = error_row["original"]
    print(text)
    text = error_row["mutant"]
    print(text)

In [46]:
# Grammar-error count and coreference clusters of the ORIGINAL text of the inspected template.
if len(male_error) >= idx :
    # .iloc returns the row as a Series, which can be indexed by column name
    text = male_error.iloc[idx - 1]["original"]
    # inside an `if` block a bare expression is not displayed, so print explicitly
    print(get_number_of_grammar_error_from_language_tool(text))
    c = Coref(text)
    print(c.get_resolved())
    for r in c.get_refs() :
        print(r.get_name())
        print(r.get_reference())

In [47]:
# Grammar-error count and coreference clusters of the MUTANT text of the inspected template.
if len(male_error) >= idx :
    # .iloc returns the row as a Series, which can be indexed by column name
    text = male_error.iloc[idx - 1]["mutant"]
    # inside an `if` block a bare expression is not displayed, so print explicitly
    print(get_number_of_grammar_error_from_language_tool(text))
    c = Coref(text)
    print(c.get_resolved())
    for r in c.get_refs() :
        print(r.get_name())
        print(r.get_reference())

### Generate Mutant After Grammar Check

In [48]:
non_bug_mt = mt[mt["err"] <= 0]
non_bug_ft = ft[ft["err"] <= 0]

In [49]:
# Expand every grammar-clean male template into one row per male mutant.
mutant_arr = []
original_arr = []
gender_arr = []
label_arr = []

for index, row in non_bug_mt.iterrows():
    label = row['label']
    original = row['original']
    c = row['coref']
    if c.is_one_subject() :
        # `mutant`, not `m`: `m` is the feminine -> masculine word table built
        # earlier in this notebook and must not be overwritten here.
        for mutant in c.generate_male_mutant_text() :
            original_arr.append(original)
            mutant_arr.append(mutant["text"])
            gender_arr.append("male")
            label_arr.append(label)

dfm = pd.DataFrame(data={"original": original_arr, "mutant": mutant_arr, "gender": gender_arr, "label": label_arr})

In [50]:
# Expand every grammar-clean female template into one row per female mutant.
mutant_arr = []
original_arr = []
gender_arr = []
label_arr = []

for index, row in non_bug_ft.iterrows():
    label = row['label']
    original = row['original']
    c = row['coref']
    if c.is_one_subject() :
        # `mutant`, not `m`: `m` is the feminine -> masculine word table built
        # earlier in this notebook and must not be overwritten here.
        for mutant in c.generate_female_mutant_text() :
            original_arr.append(original)
            mutant_arr.append(mutant["text"])
            gender_arr.append("female")
            label_arr.append(label)

dff = pd.DataFrame(data={"original": original_arr, "mutant": mutant_arr, "gender": gender_arr, "label": label_arr})

In [51]:
dfm.head()["mutant"].values

array(["I have only see three episodes of Hack, starring David Morse, and it looks as though I've missed 37 episodes! well thats if ITV 3 are showing them in chronological order. I've just watched 'Misty Blue' (episode 38). I have really enjoyed the 3 episodes, but then I'm a David Morse fan, (esp St. Elsewhere). For any one reading this, Hack is excellent. Pity its being aired on ITV 3. The cast is strong, though I cant get used to the idea of Alonzo playing a bent cop, still we all know he's the good guy wrongly accused. I see Gary Cole has guest starred, what ever happened to 'Midnight Caller'? Just wonder if Hack is available on DVD (yet). Lovin it. Cheers.",
       "I have only see three episodes of Hack, starring David Morse, and it looks as though I've missed 37 episodes! well thats if ITV 3 are showing them in chronological order. I've just watched 'Misty Blue' (episode 38). I have really enjoyed the 3 episodes, but then I'm a David Morse fan, (esp St. Elsewhere). For any one r

In [52]:
dff.head()["mutant"].values

array(["I have only see three episodes of Hack, starring David Morse, and it looks as though I've missed 37 episodes! well thats if ITV 3 are showing them in chronological order. I've just watched 'Misty Blue' (episode 38). I have really enjoyed the 3 episodes, but then I'm a David Morse fan, (esp St. Elsewhere). For any one reading this, Hack is excellent. Pity its being aired on ITV 3. The cast is strong, though I cant get used to the idea of Ebony playing a bent cop, still we all know she's the good guy wrongly accused. I see Gary Cole has guest starred, what ever happened to 'Midnight Caller'? Just wonder if Hack is available on DVD (yet). Lovin it. Cheers.",
       "I have only see three episodes of Hack, starring David Morse, and it looks as though I've missed 37 episodes! well thats if ITV 3 are showing them in chronological order. I've just watched 'Misty Blue' (episode 38). I have really enjoyed the 3 episodes, but then I'm a David Morse fan, (esp St. Elsewhere). For any one r

In [53]:
# Keep label, mutant and original (in that order) under the column names
# 0, 1 and "template"; the "gender" column is dropped.
_export_columns = {"label": 0, "mutant": 1, "original": "template"}

dfm = dfm[list(_export_columns)].rename(columns=_export_columns)
dff = dff[list(_export_columns)].rename(columns=_export_columns)

In [54]:
dfm

Unnamed: 0,0,1,template
0,1,"I have only see three episodes of Hack, starri...","I have only see three episodes of Hack, starri..."
1,1,"I have only see three episodes of Hack, starri...","I have only see three episodes of Hack, starri..."
2,1,"I have only see three episodes of Hack, starri...","I have only see three episodes of Hack, starri..."
3,1,"I have only see three episodes of Hack, starri...","I have only see three episodes of Hack, starri..."
4,1,"I have only see three episodes of Hack, starri...","I have only see three episodes of Hack, starri..."
5,1,"I have only see three episodes of Hack, starri...","I have only see three episodes of Hack, starri..."
6,1,"I have only see three episodes of Hack, starri...","I have only see three episodes of Hack, starri..."
7,1,"I have only see three episodes of Hack, starri...","I have only see three episodes of Hack, starri..."
8,1,"I have only see three episodes of Hack, starri...","I have only see three episodes of Hack, starri..."
9,1,"I have only see three episodes of Hack, starri...","I have only see three episodes of Hack, starri..."


In [55]:
dfm.to_csv("../data/imdb_mutant/male/test.csv", index=None, header=None, sep="\t")
dff.to_csv("../data/imdb_mutant/female/test.csv", index=None, header=None, sep="\t")