# Mutant Generation

### Prepare Masculine and Feminine Words

In [7]:
import pandas as pd
import numpy as np

In [8]:
df = pd.read_csv("asset/masculine-feminine.txt")
df.head()

Unnamed: 0,masculine,feminine
0,actor,actress
1,author,authoress
2,boy,girl
3,bridegroom,bride
4,brother,sister


In [9]:
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
from string import punctuation
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

def list_to_string(l):
    """Join the items of `l` into a single space-separated string."""
    separator = " "
    return separator.join(l)

wordnet_lemmatizer = WordNetLemmatizer()
snowball_stemmer = SnowballStemmer("english")

text = "This is a Demo Text for NLP using NLTK. Full form of NLTK is Natural Language Toolkit"

def strip_punctuation(s):
    """Return `s` with every character listed in `string.punctuation` removed."""
    kept = [ch for ch in s if ch not in punctuation]
    return ''.join(kept)

def lemmatize_stemming(text) :
    """Normalize `text` to a space-joined string of stemmed lemmas.

    The text is lowercased, stripped of punctuation and tokenized with NLTK;
    each token is then lemmatized with the WordNet lemmatizer and stemmed
    with the Snowball stemmer before the tokens are joined back together.
    """
    cleaned = strip_punctuation(text.lower())
    normalized = []
    for token in nltk.word_tokenize(cleaned):
        lemma = wordnet_lemmatizer.lemmatize(token)
        normalized.append(snowball_stemmer.stem(lemma))
    return list_to_string(normalized)
    
def lowercasing(text):
    """Return the lowercase form of `text` (used with `Series.apply`)."""
    lowered = text.lower()
    return lowered

lemmatize_stemming(text)

'this is a demo text for nlp use nltk full form of nltk is natur languag toolkit'

In [11]:
df["masculine"] = df["masculine"].apply(lowercasing)
df["feminine"] = df["feminine"].apply(lowercasing)

In [12]:
df = df.drop_duplicates().reset_index(drop=True)

In [13]:
df.to_csv("asset/masculine-feminine-cleaned.txt", index=False)

### Mutant Generation using POS TAG and NER

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

import spacy
# import en_core_web_sm
# nlp = en_core_web_sm.load()
# import xx_ent_wiki_sm
# nlp = xx_ent_wiki_sm.load()
import en_core_web_lg
nlp = en_core_web_lg.load()

In [32]:
replacement = pd.read_csv("asset/masculine-feminine-cleaned.txt")

# Build the gendered-word lookup tables from the cleaned word list.
#   mlist / flist : the masculine / feminine columns, in file order
#   m             : feminine word  -> masculine counterpart
#   f             : masculine word -> feminine counterpart
# Reading the two columns directly avoids a Python-level iterrows() loop;
# as with the loop, a word that appears on several rows keeps the mapping
# from its last row.
mlist = replacement["masculine"].tolist()
flist = replacement["feminine"].tolist()
m = dict(zip(flist, mlist))
f = dict(zip(mlist, flist))

In [33]:
# Replacement first names used when swapping the gender of a named person.
mnames = ["James", "John", "Robert", "Michael", "William", "Richard"]
fnames = ["Jessica", "Patricia", "Jennifer", "Linda", "Elizabeth"]

# masculine pronouns
mp = ["He", "He's", "he", "he's", "him", "his", "himself"]

# feminine pronouns, index-aligned with `mp` (entry i is the counterpart of mp[i]).
# "her" appears twice because it is the counterpart of both "him" and "his".
fp = ["She", "She's", "she", "she's", "her", "her", "herself"]

# fc: masculine pronoun -> feminine counterpart
# mc: feminine pronoun -> masculine counterpart
#     ("her" ends up mapped to "his", the last pair that mentions it)
mc = {}
fc = {}

for _m, _f in zip(mp, fp) :
    fc[_m] = _f
    mc[_f] = _m

In [34]:
text = "gingerAZ. Bronson's character plans to live anonymously but when he finds his fiance was a victim of a protection racket one by one the criminals are hunted down in the style of Bronson's character"
# text = "Good songs....but good album? debateable. She has some rockin songs on here. All her singles kickass and the last song on the album with M.I.A. (i friggin love her) is fire. but then, i start listening to the other songs, ready for a song on the album that is just as good as her singles and i can't find another. All of missy's singles are crazy, but then ui listen to the album, and its like a let down. any song that she sings in makes my stomach churn and some have just annoying beats and repititious phrases. If you really liked Missy's album 'This Is Not A Test', then get this album. but if you are lovin her singles and want some more, just download the singles, don't waste your money on the rest of this album"
# text = "A landlord book that's not based on today's reality.. Although Shemin brings up many good ideas, I found that most of his ideas are way to cheezy for me (or Southern california.) He discusses buying places for $8,000-$20,000 (I wish) and believes that charging more for rent will ultimately get you higher quality tenents. I completely disagree, especially in down markets.A decent book, overall, but I'd suggest a Nolo book which has more real legal information as well as solid do's and don'ts for a landlord."
# text = "He discusses buying places for $8,000-$20,000 (I wish) and believes that charging more for rent will ultimately get you higher quality tenents."
# text = ""

In [35]:
# nltk.download('averaged_perceptron_tagger')

In [36]:
def pos_tagging(sent):
    """Tokenize `sent` with NLTK and return its list of (token, POS tag) pairs."""
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

def get_pronoun(text) :
    """Collect the pronouns that occur in `text`, grouped by POS tag.

    Returns a pair of sets `(personal, possessive)`:
      * personal   - tokens tagged "PRP" (personal pronouns such as
                     he / she / him / it in the Penn Treebank tagset)
      * possessive - tokens tagged "PRP$" (possessive pronouns such as
                     his / her / their)
    Tokens are stored exactly as they appear in the text (case preserved).
    """
    personal = set()
    possessive = set()

    for token, tag in pos_tagging(text) :
        if tag == "PRP$" :
            possessive.add(token)
        elif tag == "PRP" :
            personal.add(token)

    return personal, possessive

spr, npr = get_pronoun(text)
print(spr)
print(npr)

{'he'}
{'his'}


In [37]:
def get_person(text) :
    """Return the set of entity texts that the spaCy pipeline labels PERSON.

    `text` is run through the module-level `nlp` pipeline; every entity in
    `doc.ents` whose label is "PERSON" contributes its surface text.
    """
    return {ent.text for ent in nlp(text).ents if ent.label_ == "PERSON"}
    
person = get_person(text)
person

{'Bronson'}

In [53]:
MALE = "m"
FEMALE = "f"
UNIDENTIFIED = "u"

def get_gender(text) :
    """Guess the gender referred to in `text` from its personal pronouns.

    Only the pronouns "he"/"He" and "she"/"She" are considered. Returns
    MALE when only masculine ones occur, FEMALE when only feminine ones
    occur, and UNIDENTIFIED when both or neither are present.
    """
    personal, _ = get_pronoun(text)

    has_male = not personal.isdisjoint(("he", "He"))
    has_female = not personal.isdisjoint(("she", "She"))

    if has_male and not has_female :
        return MALE
    if has_female and not has_male :
        return FEMALE
    return UNIDENTIFIED

get_gender(text)

'u'

In [39]:
import tokenizer

def replace_pronoun(text) :
    """Swap every gendered pronoun in `text` for its opposite-gender form.

    The text is split with `tokenizer.tokenize`; a token whose text is in
    `mp` is rewritten through `fc`, one whose text is in `fp` is rewritten
    through `mc`, and all other tokens are kept unchanged.

    Returns `(is_replaced, new_text)` where `is_replaced` tells whether at
    least one token was rewritten and `new_text` is the detokenized result.
    """
    swapped_tokens = []
    is_replaced = False

    for tok in tokenizer.tokenize(text) :
        word = tok.txt
        if word in mp :
            tok = tok._replace(txt = fc[word])
            is_replaced = True
        elif word in fp :
            tok = tok._replace(txt = mc[word])
            is_replaced = True
        swapped_tokens.append(tok)

    return is_replaced, tokenizer.detokenize(swapped_tokens, normalize=True)

print(text)
_, _text = replace_pronoun(text)
print(_text)

gingerAZ. Bronson's character plans to live anonymously but when he finds his fiance was a victim of a protection racket one by one the criminals are hunted down in the style of Bronson's character
gingerAZ. Bronson's character plans to live anonymously but when she finds her fiance was a victim of a protection racket one by one the criminals are hunted down in the style of Bronson's character


In [40]:
def is_set_empty(s) :
    """Return True when `s` is falsy (e.g. an empty set), False otherwise."""
    if s :
        return False
    return True

def generate_mutant_from_text(text) :
    """Generate gender-swapped variants ("mutants") of `text`.

    The pronouns are swapped first. What happens next depends on how many
    PERSON entities `get_person` finds in the original text:

      * none     - the pronoun-swapped text is the only mutant;
      * exactly 1 - the name is replaced by every name of the opposite
                    gender (`fnames` for a male subject, `mnames` for a
                    female one); if the gender cannot be identified the
                    pronoun-swapped text is returned as the only mutant;
      * several  - no mutant is produced.

    Returns a list of strings; the list is empty when no pronoun was
    replaced or when more than one person was detected.
    """
    is_replaced, text_with_replaced_pronoun = replace_pronoun(text)
    if not is_replaced :
        return []

    person = get_person(text)
    if len(person) > 1 :
        # Several names: it is unclear which one the pronouns refer to.
        return []
    if not person :
        return [text_with_replaced_pronoun]

    # get_gender() tokenizes and tags the raw text itself, so it must be
    # given the text, not an already extracted pronoun set.
    gender = get_gender(text)
    if gender == MALE :
        new_names = fnames
    elif gender == FEMALE :
        new_names = mnames
    else :
        return [text_with_replaced_pronoun]

    prevname = next(iter(person))
    return [text_with_replaced_pronoun.replace(prevname, name) for name in new_names]

print(text)
generate_mutant_from_text(text)

gingerAZ. Bronson's character plans to live anonymously but when he finds his fiance was a victim of a protection racket one by one the criminals are hunted down in the style of Bronson's character


["gingerAZ. Jessica's character plans to live anonymously but when she finds her fiance was a victim of a protection racket one by one the criminals are hunted down in the style of Jessica's character",
 "gingerAZ. Patricia's character plans to live anonymously but when she finds her fiance was a victim of a protection racket one by one the criminals are hunted down in the style of Patricia's character",
 "gingerAZ. Jennifer's character plans to live anonymously but when she finds her fiance was a victim of a protection racket one by one the criminals are hunted down in the style of Jennifer's character",
 "gingerAZ. Linda's character plans to live anonymously but when she finds her fiance was a victim of a protection racket one by one the criminals are hunted down in the style of Linda's character",
 "gingerAZ. Elizabeth's character plans to live anonymously but when she finds her fiance was a victim of a protection racket one by one the criminals are hunted down in the style of Eliza

### Coreference Task

In [247]:
# !python -m spacy download en_core_web_sm
# !python -m spacy download xx_ent_wiki_sm
# !python -m spacy download en_core_web_lg

In [248]:
import spacy
# import en_core_web_sm
# nlp = en_core_web_sm.load()
# import xx_ent_wiki_sm
# nlp = xx_ent_wiki_sm.load()
import en_core_web_lg
nlp = en_core_web_lg.load()

# !pip install neuralcoref
import neuralcoref
coref = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')

In [621]:
text = "gingerAZ. Bronson's character plans to live anonymously but when he finds his fiance was a victim of a protection racket one by one the criminals are hunted down in the style of Bronson's character"
# text = 'My Sister has a dog. She loves him'
# text = 'My Brother has a dog. He loves the dog'
text

"gingerAZ. Bronson's character plans to live anonymously but when he finds his fiance was a victim of a protection racket one by one the criminals are hunted down in the style of Bronson's character"

In [622]:
# You're done. You can now use NeuralCoref the same way you usually manipulate a spaCy document and its annotations.
doc = nlp(text)

doc._.has_coref
doc._.coref_clusters

[Bronson's character: [Bronson's character, he, his]]

In [626]:
print(text)
print(doc._.coref_resolved)
print(doc._.coref_clusters)
print(doc._.coref_clusters[-1].main)
print(doc._.coref_clusters[-1].mentions)
print(doc._.coref_clusters[-1].mentions[-1])
print(doc._.coref_clusters[-1].mentions[-1].start)
print(doc._.coref_clusters[-1].mentions[-1].start_char)
print(doc._.coref_clusters[-1].mentions[-1].end)
print(doc._.coref_clusters[-1].mentions[-1].end_char)
print(doc._.coref_clusters[-1].mentions[-1]._.coref_cluster.main)

gingerAZ. Bronson's character plans to live anonymously but when he finds his fiance was a victim of a protection racket one by one the criminals are hunted down in the style of Bronson's character
gingerAZ. Bronson's character plans to live anonymously but when Bronson's character finds Bronson's character fiance was a victim of a protection racket one by one the criminals are hunted down in the style of Bronson's character
[Bronson's character: [Bronson's character, he, his]]
Bronson's character
[Bronson's character, he, his]
his
13
74
14
77
Bronson's character


In [627]:
doc = nlp(text)

doc._.has_coref
doc._.coref_clusters[0].main

Bronson's character

In [628]:
class Token:
    """A piece of text together with the span it occupies.

    `start` and `end` are stored as given; `Ref` fills them with the
    character offsets of a coreference mention.
    """

    # Class-level defaults, overridden per instance by __init__.
    word = ""
    start = -1
    end = -1

    def __init__(self, word, start, end) :
        self.word, self.start, self.end = word, start, end

    def __str__(self) :
        # A token prints as its bare text.
        return self.word

    def __repr__(self) :
        # Same as __str__, so lists of tokens display as lists of words.
        return self.word

    def get_word(self) :
        """Return the token text."""
        return self.word

    def get_start(self) :
        """Return the start offset."""
        return self.start

    def get_end(self) :
        """Return the end offset."""
        return self.end


class Ref:
    """One coreference cluster: its main mention plus all of its mentions.

    Attributes:
      name           - string form of the cluster's main mention
      reference      - one `Token` per mention (text and character offsets)
      reference_list - the mention texts only, in the same order
    """

    # Class-level defaults; __init__ always rebinds them on the instance.
    name = ""
    reference = []
    reference_list = []

    def __init__(self, name, reference) :
        self.name = str(name)
        self.reference = []
        self.reference_list = []
        for mention in reference :
            mention_text = mention.text
            self.reference_list.append(mention_text)
            self.reference.append(
                Token(mention_text, mention.start_char, mention.end_char)
            )

    def _describe(self) :
        # Shared rendering for __str__ and __repr__.
        return self.name + ": " + str(self.reference_list)

    def __str__(self) :
        return self._describe()

    def __repr__(self) :
        return self._describe()

    def get_name(self) :
        """Return the text of the main mention."""
        return self.name

    def get_reference(self) :
        """Return the list of mention tokens."""
        return self.reference

    def is_male(self) :
        """Return True when "He" or "he" is one of the mentions."""
        return any(pronoun in self.reference_list for pronoun in ("He", "he"))

    def is_female(self) :
        """Return True when "She" or "she" is one of the mentions."""
        return any(pronoun in self.reference_list for pronoun in ("She", "she"))

In [632]:
doc._.coref_clusters[-1].mentions[0]

Bronson's character

In [633]:
i = 0
r = Ref(doc._.coref_clusters[i].main, doc._.coref_clusters[i].mentions)
r

Bronson's character: ["Bronson's character", 'he', 'his']

In [634]:
r.is_male()

True

In [692]:
# Replacement first names for each gender.
mnames = ["James", "John", "Robert", "Michael", "Richard"]
fnames = ["Jessica", "Patricia", "Jennifer", "Linda", "Elizabeth"]

# masculine pronouns
mp = ["He", "he", "him", "his", "himself"]

# feminine pronouns, index-aligned with mp ("her" pairs with both "him" and "his")
fp = ["She", "she","her", "her", "herself"]

# fc: masculine pronoun -> feminine counterpart
fc = dict(zip(mp, fp))
# mc: feminine pronoun -> masculine counterpart (a repeated key keeps its last pairing)
mc = dict(zip(fp, mp))

In [693]:
class Coref:
    """Coreference analysis of one text, used to build gender-swapped mutants.

    On construction the text is run through the module-level `nlp` pipeline
    (which must have the neuralcoref component installed) and every
    coreference cluster is wrapped in a `Ref`.

    Attributes:
      original          - the input text
      resolved          - the text with mentions replaced by their main mention
      refs              - one `Ref` per coreference cluster
      one_subject       - True when exactly one gendered cluster was found
      subject_reference - that cluster's `Ref`, or None
      is_male           - True/False for the subject's gender, None when
                          there is no single subject
    """

    # Class-level defaults; __init__ rebinds all of them on the instance.
    original = ""
    resolved = ""
    refs = []
    one_subject = False
    is_male = False
    subject_reference = None

    def __init__(self, text):
        self.original = str(text)
        doc = nlp(text)
        clusters = doc._.coref_clusters
        self.resolved = str(doc._.coref_resolved)
        self.refs = [Ref(cluster.main, cluster.mentions) for cluster in clusters]

        self.one_subject, self.subject_reference, self.is_male = self.check_one_subject()

    def get_original(self):
        """Return the input text."""
        return self.original

    def get_resolved(self):
        """Return the coreference-resolved text."""
        return self.resolved

    def get_refs(self):
        """Return the list of `Ref` clusters."""
        return self.refs

    def is_one_subject(self) :
        """Return True when the text has exactly one gendered subject."""
        return self.one_subject

    def check_one_subject(self) :
        """Look for a single gendered coreference cluster.

        Every cluster counts once if it is male and once if it is female,
        so a cluster that is both counts twice and is never a valid subject.

        Returns `(True, ref, is_male)` when the total count is exactly one,
        otherwise `(False, None, None)`.
        """
        gendered = 0
        subject_reference = None
        is_male = None
        for ref in self.refs :
            if ref.is_male() :
                gendered += 1
                subject_reference = ref
                is_male = True
            if ref.is_female() :
                gendered += 1
                subject_reference = ref
                is_male = False

        if gendered == 1 :
            return True, subject_reference, is_male
        return False, None, None

    def get_subject_reference(self):
        """Return the `Ref` of the single subject, or None."""
        return self.subject_reference

    @staticmethod
    def _replace_mention(word, name, pronouns, swap) :
        """Return the replacement for one mention of the subject.

        A mention found in `pronouns` is a pronoun: it is kept as is when
        `swap` is None (same-gender mutant) or translated through `swap`
        otherwise. Pronouns are also recognised case-insensitively, so a
        sentence-initial "His" becomes "Her" instead of being mistaken for
        a name. Every other mention is treated as a name and replaced by
        `name`.
        """
        key = word if word in pronouns else word.lower()
        if key not in pronouns :
            return name
        if swap is None :
            return word
        swapped = swap[key]
        if key != word and word[:1].isupper() :
            # Matched through lowercasing: restore the leading capital.
            swapped = swapped[:1].upper() + swapped[1:]
        return swapped

    @classmethod
    def _render(cls, chunks, mentions, name, pronouns, swap) :
        """Interleave the untouched text chunks with the replaced mentions."""
        parts = [chunks[0]]
        for mention, tail in zip(mentions, chunks[1:]) :
            parts.append(cls._replace_mention(mention.word, name, pronouns, swap))
            parts.append(tail)
        return "".join(parts)

    def generate_mutant_text(self):
        """Build the male and female variants of the text for its subject.

        The original text is cut at the character offsets of the subject's
        mentions. For every name in `mnames` and then every name in
        `fnames`, the mentions are re-inserted with names replaced by that
        name and pronouns either kept (same gender as the subject) or
        swapped through `fc` / `mc` (opposite gender).

        Returns a list of `{"gender": "male" | "female", "text": str}`
        dicts, male mutants first. The list is empty when the text has no
        single gendered subject.

        NOTE: the feminine -> masculine table holds a single entry for
        "her", so object and possessive "her" are not told apart.
        """
        subject = self.subject_reference
        if subject is None :
            return []

        # The slicing below needs mentions in document order; sorting by
        # start offset makes that independent of how the cluster lists them.
        mentions = sorted(subject.get_reference(), key=lambda token: token.start)

        # chunks[0] precedes the first mention, chunks[i] follows mention i-1.
        chunks = []
        cursor = 0
        for mention in mentions :
            chunks.append(self.original[cursor:mention.start])
            cursor = mention.end
        chunks.append(self.original[cursor:])

        if self.is_male :
            pronouns, male_swap, female_swap = mp, None, fc
        else :
            pronouns, male_swap, female_swap = fp, mc, None

        mutant = []
        for gender, names, swap in (
            ("male", mnames, male_swap),
            ("female", fnames, female_swap),
        ) :
            for name in names :
                text = self._render(chunks, mentions, name, pronouns, swap)
                mutant.append({"gender": gender, "text": text})
        return mutant

In [694]:
c = Coref(text)

In [695]:
c.get_resolved()

"Homecoming; what a HUGE disappointment!! After reading the plot summary (the dead coming back to vote - AGAINST George W. Bush!!!!!) I couldn't wait to see this. this started off interesting and this immediately caught my attention. Unfortunately, though, this slowly descended into a boring political satire that I didn't need to see (I can just watch some good old Aussie comedy for that!). There was pretty much only one or two scenes of horror and they weren't even that scary. I couldn't believe this came from Joe Dante, who could easily have pulled this off with an equal balance of thrills and satire. The worst episode so far. 2/5."

In [696]:
for r in c.get_refs() :
    print(r.get_name())
    print(r.get_reference())

this
[this, It, it, it]
this
[this, it]


In [697]:
c.is_one_subject()

False

In [698]:
if c.is_one_subject() :
    print(c.get_subject_reference())
    print(c.generate_mutant_text())

In [699]:
print(text)

Homecoming; what a HUGE disappointment!! After reading the plot summary (the dead coming back to vote - AGAINST George W. Bush!!!!!) I couldn't wait to see this. It started off interesting and it immediately caught my attention. Unfortunately, though, it slowly descended into a boring political satire that I didn't need to see (I can just watch some good old Aussie comedy for that!). There was pretty much only one or two scenes of horror and they weren't even that scary. I couldn't believe this came from Joe Dante, who could easily have pulled it off with an equal balance of thrills and satire. The worst episode so far. 2/5.


### Generate Mutant from IMDB Data

In [700]:
df = pd.read_csv("imdb/test.csv", sep="\t", header=None, names=["label", "text"])
df = df[:100]
df.head()

Unnamed: 0,label,text
0,1,The Mascot is Ladislaw Starewicz's masterpiece...
1,1,this is one of the finest movies i have ever s...
2,1,"I have only see three episodes of Hack, starri..."
3,1,"Deepa Mehta's ""Fire"" is groundbreaking, bold, ..."
4,0,Anyone who sees this film will notice that the...


In [701]:
# Build one record per generated mutant for every review that has exactly
# one gendered subject. Each record keeps the original text and its label
# next to the mutant text and the gender the mutant was rewritten to.
mutants = []
for label, text in zip(df["label"], df["text"]):
    c = Coref(text)
    if not c.is_one_subject() :
        continue
    # The loop variable is deliberately not called `m`, so the module-level
    # feminine -> masculine dictionary `m` is left intact.
    for generated in c.generate_mutant_text() :
        mutants.append({
            "original": text,
            "mutant": generated["text"],
            "gender": generated["gender"],
            "label": label,
        })

In [711]:
df_mutant = pd.DataFrame(data=mutants)
df_mutant

Unnamed: 0,original,mutant,gender,label
0,"I have only see three episodes of Hack, starri...","I have only see three episodes of Hack, starri...",male,1
1,"I have only see three episodes of Hack, starri...","I have only see three episodes of Hack, starri...",male,1
2,"I have only see three episodes of Hack, starri...","I have only see three episodes of Hack, starri...",male,1
3,"I have only see three episodes of Hack, starri...","I have only see three episodes of Hack, starri...",male,1
4,"I have only see three episodes of Hack, starri...","I have only see three episodes of Hack, starri...",male,1
...,...,...,...,...
205,"When I was younger, this movie always aired on...","When I was younger, this movie always aired on...",female,1
206,"When I was younger, this movie always aired on...","When I was younger, this movie always aired on...",female,1
207,"When I was younger, this movie always aired on...","When I was younger, this movie always aired on...",female,1
208,"When I was younger, this movie always aired on...","When I was younger, this movie always aired on...",female,1


In [709]:
df_mutant[:1].values

array([["I have only see three episodes of Hack, starring David Morse, and it looks as though I've missed 37 episodes! well thats if ITV 3 are showing them in chronological order. I've just watched 'Misty Blue' (episode 38). I have really enjoyed the 3 episodes, but then I'm a David Morse fan, (esp St. Elsewhere). For any one reading this, Hack is excellent. Pity its being aired on ITV 3. The cast is strong, though I cant get used to the idea of David playing a bent cop, still we all know he's the good guy wrongly accused. I see Gary Cole has guest starred, what ever happened to 'Midnight Caller'? Just wonder if Hack is available on DVD (yet). Lovin it. Cheers.",
        "I have only see three episodes of Hack, starring David Morse, and it looks as though I've missed 37 episodes! well thats if ITV 3 are showing them in chronological order. I've just watched 'Misty Blue' (episode 38). I have really enjoyed the 3 episodes, but then I'm a David Morse fan, (esp St. Elsewhere). For any one 

In [710]:
df_mutant.to_csv("imdb_mutant/test.csv", index=False)