In [2]:
#Name: Kathiriya Ranjit (R00183586)
# https://github.com/jsvine/markovify - Install for runnning
import markovify
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from textblob import Word
import re

from spellchecker import SpellChecker
nltk.download('stopwords')
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ranjitsmac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Reading Datasets:


In [2]:
# reading the answer datasets
df_answer = pd.read_csv('./TrainingData/subtaskC_answers_all.csv',names=['id','posA','posB','posC'])
df_answer.head()

Unnamed: 0,id,posA,posB,posC
0,0,Orange juice doesn't taste good on cereal.,Orange juice is poured in a glass.,Orange juice does not taste good on cereal.
1,1,Apple can not be drunk,An apple is a whole food and unable to be drun...,He eats an apple
2,2,"100,000 miles is way to long for one person to...",This is not physically possible to achieve,"No way can someone run 100,000 miles in a day"
3,3,A human has not stings,I do not have a stinger.,"mosquitos sting people, not the other way around"
4,4,A giraffe is not a human being.,"A giraffe is an animal, not human.",A giraffe is an animal.


In [3]:
# reading the false sents
df_false = pd.read_csv('./TrainingData/subtaskC_data_all.csv')
df_false.head()

Unnamed: 0,id,FalseSent
0,0,He poured orange juice on his cereal.
1,1,He drinks apple.
2,2,"Jeff ran 100,000 miles today"
3,3,I sting a mosquito
4,4,A giraffe is a person.


In [4]:
# Merge both based on Id.
df = pd.merge(df_false,df_answer,on='id')

In [5]:
# print first rows including cols name.
df.head(1)

Unnamed: 0,id,FalseSent,posA,posB,posC
0,0,He poured orange juice on his cereal.,Orange juice doesn't taste good on cereal.,Orange juice is poured in a glass.,Orange juice does not taste good on cereal.


In [6]:
# info of all cols.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 5 columns):
id           10000 non-null int64
FalseSent    10000 non-null object
posA         10000 non-null object
posB         10000 non-null object
posC         10000 non-null object
dtypes: int64(1), object(4)
memory usage: 468.8+ KB


# Cleaning

    2.	Cleaning

        a.	Lower Case: Converting all 20,000 entries into lowercase.
        b.	Removing extra character: In this, the additional symbol like ?.'; is removed.
        c.	Stop words Removing: stop words like a the, and a terminated from the corpus.
        d.	Lemmatization: converting the text into the plural form. e.g., mice become a mouse.
        e.	Eliminating the repetitions: the word which occurs twice or more times it removes.
        f. Spelling chacker correct if it is wrong. eg. fihs --> fish


In [157]:
def de_repeat(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1",text)
spell = SpellChecker()

In [160]:
# lower case
df['FalseSent'] = df['FalseSent'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df['posA'] = df['posA'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df['posB'] = df['posB'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df['posC'] = df['posC'].apply(lambda x: " ".join(x.lower() for x in x.split()))

# Removing extra symbol.
df['FalseSent'] = df['FalseSent'].str.replace('[^\w\s]',' ')
df['posA'] = df['posA'].str.replace('[^\w\s]',' ')
df['posB'] = df['posB'].str.replace('[^\w\s]',' ')
df['posC'] = df['posC'].str.replace('[^\w\s]',' ')

# removing stopwords like the, a, an
df['FalseSent'] = df['FalseSent'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df['posA'] = df['posA'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df['posB'] = df['posB'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df['posC'] = df['posC'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

# Pural form through lemmatization eg. mice becomes mouse.
df['FalseSent'] = df['FalseSent'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
df['posA'] = df['posA'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
df['posB'] = df['posB'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
df['posC'] = df['posC'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

# correcting latter and repeatations.
df['FalseSent'] = df['FalseSent'].apply(lambda x: " ".join(de_repeat(x) for x in x.split())) 
df['posA'] = df['posA'].apply(lambda x: " ".join(de_repeat(x) for x in x.split())) 
df['posB'] = df['posB'].apply(lambda x: " ".join(de_repeat(x) for x in x.split())) 
df['posC'] = df['posC'].apply(lambda x: " ".join(de_repeat(x) for x in x.split())) 

# spell check for eg. fihs --> fish
df['FalseSent'] = df['FalseSent'].apply(lambda x: " ".join(x for x in spell.correction(x)))
df['posA'] = df['posA'].apply(lambda x: " ".join(x for x in spell.correction(x)))
df['posB'] = df['posB'].apply(lambda x: " ".join(x for x in spell.correction(x)))
df['posC'] = df['posC'].apply(lambda x: " ".join(x for x in spell.correction(x)))

In [177]:
spell.correction('fihs')

'fish'

# GENERATION OF NEW TEXT USING LIBREARY

# Library : https://github.com/jsvine/markovify
        
        
        -  The Markovify is a library based on text generation. It generates text based on the corpus or given sentences. Its documentation is well-formed, and i have tested this library. It is very accurate generates text rather than any other model. It creates text on a pre-trained deep learning model.
        - for installing this library pip install markovify
        - Here the text means the data set from corpus and I am also appending each falsesentences, OptionA, OptionB,and optionC for more accuracy of model. (string)
        - state_size: indicating the number of words in the model's state. (Int)

In [235]:
# Function for generating text
def func_newgeneration(text):
    text_model = markovify.Text(text,state_size=2,retain_original=False,well_formed=False)
    for i in range(1):
        return text_model.make_short_sentence(120)

In [236]:
# append function it appends new sentences into corpus from dataframe and then open corpus for generating sentences based on corpus.
def appendFile(text):
    f=open('corpus.txt','a')
    f.seek(0) #get to the first position
    f.write(text)
    with open("corpus.txt") as f:
        text_corpas = f.read()
    

In [237]:
#generating all sentences and then storing into the listwith i as id.
datas = []
    
for i in range(df.shape[0]):
    false_sec = df['FalseSent'][i]
    posa = df['posA'][i]
    posb = df['posB'][i]
    posc = df['posC'][i]
    final_text = false_sec + posa + posb + posc
    
#      #for big corpus insert text_corpas here 
#     appendFile(final_text) # 
    text = func_newgeneration(final_text)  # text_corpas for big courpus time taking.
    datas.append((i,text))

In [238]:
# creating dataframe named id and gen_sec as new generated sentences.
df_newsec = pd.DataFrame(datas, columns = ['id', 'gen_sec'])

In [239]:
# merge with respect to id into one dataframe.
df1 = pd.merge(df,df_newsec,on='id')

In [240]:
df1.head()

Unnamed: 0,id,FalseSent,posA,posB,posC,gen_sec
0,0,poured orange juice cereal,orange juice taste good cereal,orange juice poured glass,orange juice taste good cereal,poured orange juice cerealorange juice poured ...
1,1,drink apple,apple drunk,apple whole food unable drunk without juiced,eats apple,drink appleapple drunkapple whole food unable ...
2,2,jeff ran 100 00 mile today,100 00 mile way long one person able run one day,physically possible achieve,way someone run 100 00 mile day,jeff ran 100 00 mile way long one person able ...
3,3,sting mosquito,human sting,stinger,mosquito sting people way around,sting mosquitohuman stingstingermosquito sting...
4,4,giraffe person,giraffe human,giraffe animal human,giraffe animal,giraffe persongiraffe humangiraffe animal huma...


# Training the model with Bleu Score

    - Bilingual Evaluation Understudy is comparing a candidate with multiple or a single reference translations. Further, it also can be used for evaluation purposes. 
    - I am going to use the NLTK library to obtain a blue score. 

    - A perfect match result is 1.0, whereas an ideal mismatch results in a score of 0.0, or an average match, maybe 0.5 score.
    
    - with the halp of Bleu score we can achive this below tasks.
            Language generation.
            Image caption generation.
            Text summarization.
            Speech recognition.

In [241]:
# used Bleu Score library for obtaning the blue score.
from nltk.translate.bleu_score import sentence_bleu

In [242]:
df_final = df1

In [245]:
# time taking process so, picking 10 sentences and generating Bleu score for 10 sentences 0-9 and storing data into datas list.
datas = []
for i in range(0,10):
    reference = [df_final['posA'][i].split() + df_final['posB'][i].split() + df_final['posC'][i].split()]
    candidate = df_final['gen_sec'][i].split()
    # Seating weights means seaing n-grams for eg. (0, 1, 0, 0) means biagram as on.
    score = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0))
    datas.append(score)

In [246]:
# sum of all datas and divided by total number of datasets (in our case we have taken as 10).
sum(datas) / 10

0.5954028136291661

# END