In [27]:
import os
import numpy as np
import pandas as pd
pd.options.display.float_format = '{:,.2f}'.format
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression, LinearRegression
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, Pool
import matplotlib.pyplot as plt
from scipy import stats
from tqdm import trange
import re



from nltk.tokenize import RegexpTokenizer, word_tokenize, wordpunct_tokenize
from nltk.corpus import wordnet
import spacy 
  
# Load English tokenizer, tagger,  
# parser, NER and word vectors 
nlp = spacy.load("en_core_web_sm") 

#Feature importance
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA



#importing machine learning libraries
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, Input
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.python.framework.ops import disable_eager_execution

disable_eager_execution()
import pickle



#Secondary imports
import pandas_profiling as pp
from scipy.stats import pearsonr
import pickle


%matplotlib inline

np.set_printoptions(precision=2, suppress=True)

## Reading Datasets

In [28]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
result = pd.read_csv('results.csv')
sample = pd.read_csv('sample_submission.csv')

In [29]:
tokenizer = re.compile('([\s.,;:()]+)')

In [30]:
train.head()

Unnamed: 0,question,answer_text,distractor
0,Meals can be served,in rooms at 9:00 p. m.,"'outside the room at 3:00 p. m.', 'in the dini..."
1,It can be inferred from the passage that,The local government can deal with the problem...,"'If some tragedies occur again ', ' relevant d..."
2,The author called Tommy 's parents in order to,help them realize their influence on Tommy,"'blame Tommy for his failing grades', 'blame T..."
3,It can be inferred from the passage that,the writer is not very willing to use idioms,'idioms are the most important part in a langu...
4,How can we deal with snake wounds according to...,Stay calm and do n't move .,'Cut the wound and suck the poison out .'


In [31]:
test.head()

Unnamed: 0,question,answer_text
0,What 'S the main idea of the text ?,The lack of career -- based courses in US high...
1,"In the summer high season , Finland does nt se...",the sun is out at night
2,If you want to apply for Chinese Business Inte...,have to get confirmed at least twice
3,"That afternoon , the boy 's clothes were dry b...",nobody made room for him in the water .
4,Which of the following statements is NOT true ?,There are twelve countries in the World Wildli...


## This will be first approach

In [32]:
train['answer_text_broken'] = train['answer_text'].str.lower()
train['answer_text_broken'] = train['answer_text_broken'].apply(lambda x : re.split(tokenizer, x))


In [33]:
''.join(train['answer_text'][1])

'The local government can deal with the problem of lacking money by some means .'

## Commencing 3 stage prediction

In [59]:
centre = train['answer_text_broken'].values
done = [0 for _ in range(len(centre))]
dist = [[] for _ in range(len(centre))]

In [60]:
def syn_ant(word):
    ant = list()
    syn = list()

    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            syn.append(lemma.name())    #add the synonyms
            if lemma.antonyms():    #When antonyms are available, add them into the list
                ant.append(lemma.antonyms()[0].name())
    ant = list(set(ant))
    syn = list(set(syn))
    
    return syn[:4], ant

In [62]:
#Going to execute 3 stage approach for all the 2 distractors
for i in trange(len(centre)):
    #Stage 1: based on changing numbers
    for j in range(len(centre[i])):
        try:
            int(centre[i][j])
            centre[i][j] = str(int(centre[i][j]) + 1)
            dist[i].append(''.join(centre[i]))
            centre[i][j] = str(int(centre[i][j]) + 1)
            dist[i].append(''.join(centre[i]))
            dist[i].append('None of the above')
            done[i] += 3
            break
        except:
            pass
    #Stage 2: based on antonyms
    if done[i] == 0:
        st2 = nlp(train['answer_text'].values[i])
        for j in range(len(st2)):
            if st2[j].pos_ == 'ADJ':
                if len(syn_ant(str(st2[j]))[1])>2:
                    c1, c2 = str(st2), str(st2)
                    an1, an2 = syn_ant(str(st2[j]))[1][0], syn_ant(str(st2[j]))[1][1] 
                    c1 = c1.replace(str(st2[j]), an1)
                    c2 = c2.replace(str(st2[j]), an2)
                    dist[i].append(c1)
                    dist[i].append(c2)
                    dist[i].append('None of the above')
                    done[i]+=3
                    break
                    
                elif len(syn_ant(str(st2[j]))[1]) == 1:
                    c1, c2 = str(st2), str(st2)
                    an1, an2 = syn_ant(str(st2[j]))[1][0], syn_ant(str(st2[j]))[0][0] 
                    c1 = c1.replace(str(st2[j]), an1)
                    c2 = c2.replace(str(st2[j]), an2)
                    dist[i].append(c1)
                    dist[i].append(c2)
                    dist[i].append('None of the above')
                    done[i]+=3
                    break
                    
                elif len(syn_ant(str(st2[j]))[0]) >= 2:
                    c1, c2 = str(st2), str(st2)
                    an1, an2 = syn_ant(str(st2[j]))[0][0], syn_ant(str(st2[j]))[0][1] 
                    c1 = c1.replace(str(st2[j]), an1)
                    c2 = c2.replace(str(st2[j]), an2)
                    dist[i].append(c1)
                    dist[i].append(c2)
                    dist[i].append('None of the above')
                    done[i]+=3
                    break
                    
    if done[i] == 0:
        st2 = nlp(train['answer_text'].values[i])
        for j in range(len(st2)):
            if st2[j].pos_ == 'ADP':
                if len(syn_ant(str(st2[j]))[1])>2:
                    c1, c2 = str(st2), str(st2)
                    an1, an2 = syn_ant(str(st2[j]))[1][0], syn_ant(str(st2[j]))[1][1] 
                    c1 = c1.replace(str(st2[j]), an1)
                    c2 = c2.replace(str(st2[j]), an2)
                    dist[i].append(c1)
                    dist[i].append(c2)
                    dist[i].append('None of the above')
                    done[i]+=3
                    break
                    
                elif len(syn_ant(str(st2[j]))[1]) == 1:
                    c1, c2 = str(st2), str(st2)
                    an1, an2 = syn_ant(str(st2[j]))[1][0], syn_ant(str(st2[j]))[0][0] 
                    c1 = c1.replace(str(st2[j]), an1)
                    c2 = c2.replace(str(st2[j]), an2)
                    dist[i].append(c1)
                    dist[i].append(c2)
                    dist[i].append('None of the above')
                    done[i]+=3
                    break
                    
                elif len(syn_ant(str(st2[j]))[0]) >= 2:
                    c1, c2 = str(st2), str(st2)
                    an1, an2 = syn_ant(str(st2[j]))[0][0], syn_ant(str(st2[j]))[0][1] 
                    c1 = c1.replace(str(st2[j]), an1)
                    c2 = c2.replace(str(st2[j]), an2)
                    dist[i].append(c1)
                    dist[i].append(c2)
                    dist[i].append('None of the above')
                    done[i]+=3
                    break
    if done[i] == 0:
        st2 = nlp(train['answer_text'].values[i])
        for j in range(len(st2)):
            if st2[j].pos_ == 'NOUN':
                if len(syn_ant(str(st2[j]))[1])>2:
                    c1, c2 = str(st2), str(st2)
                    an1, an2 = syn_ant(str(st2[j]))[1][0], syn_ant(str(st2[j]))[1][1] 
                    c1 = c1.replace(str(st2[j]), an1)
                    c2 = c2.replace(str(st2[j]), an2)
                    dist[i].append(c1)
                    dist[i].append(c2)
                    dist[i].append('None of the above')
                    done[i]+=3
                    break
                    
                elif len(syn_ant(str(st2[j]))[1]) == 1:
                    c1, c2 = str(st2), str(st2)
                    an1, an2 = syn_ant(str(st2[j]))[1][0], syn_ant(str(st2[j]))[0][0] 
                    c1 = c1.replace(str(st2[j]), an1)
                    c2 = c2.replace(str(st2[j]), an2)
                    dist[i].append(c1)
                    dist[i].append(c2)
                    dist[i].append('None of the above')
                    done[i]+=3
                    break
                    
                elif len(syn_ant(str(st2[j]))[0]) >= 2:
                    c1, c2 = str(st2), str(st2)
                    an1, an2 = syn_ant(str(st2[j]))[0][0], syn_ant(str(st2[j]))[0][1] 
                    c1 = c1.replace(str(st2[j]), an1)
                    c2 = c2.replace(str(st2[j]), an2)
                    dist[i].append(c1)
                    dist[i].append(c2)
                    dist[i].append('None of the above')
                    done[i]+=3
                    break
    if done[i] == 0:
        st2 = nlp(train['answer_text'].values[i])
        for j in range(len(st2)):
            if st2[j].pos_ == 'VERB':
                if len(syn_ant(str(st2[j]))[1])>2:
                    c1, c2 = str(st2), str(st2)
                    an1, an2 = syn_ant(str(st2[j]))[1][0], syn_ant(str(st2[j]))[1][1] 
                    c1 = c1.replace(str(st2[j]), an1)
                    c2 = c2.replace(str(st2[j]), an2)
                    dist[i].append(c1)
                    dist[i].append(c2)
                    dist[i].append('None of the above')
                    done[i]+=3
                    break
                    
                elif len(syn_ant(str(st2[j]))[1]) == 1:
                    c1, c2 = str(st2), str(st2)
                    an1, an2 = syn_ant(str(st2[j]))[1][0], syn_ant(str(st2[j]))[0][0] 
                    c1 = c1.replace(str(st2[j]), an1)
                    c2 = c2.replace(str(st2[j]), an2)
                    dist[i].append(c1)
                    dist[i].append(c2)
                    dist[i].append('None of the above')
                    done[i]+=3
                    break
                    
                elif len(syn_ant(str(st2[j]))[0]) >= 2:
                    c1, c2 = str(st2), str(st2)
                    an1, an2 = syn_ant(str(st2[j]))[0][0], syn_ant(str(st2[j]))[0][1] 
                    c1 = c1.replace(str(st2[j]), an1)
                    c2 = c2.replace(str(st2[j]), an2)
                    dist[i].append(c1)
                    dist[i].append(c2)
                    dist[i].append('None of the above')
                    done[i]+=3
                    break
    if done[i] == 0:
        st2 = nlp(train['answer_text'].values[i])
        for j in range(len(st2)):
            if st2[j].pos_ == 'ADV':
                if len(syn_ant(str(st2[j]))[1])>2:
                    c1, c2 = str(st2), str(st2)
                    an1, an2 = syn_ant(str(st2[j]))[1][0], syn_ant(str(st2[j]))[1][1] 
                    c1 = c1.replace(str(st2[j]), an1)
                    c2 = c2.replace(str(st2[j]), an2)
                    dist[i].append(c1)
                    dist[i].append(c2)
                    dist[i].append('None of the above')
                    done[i]+=3
                    break
                    
                elif len(syn_ant(str(st2[j]))[1]) == 1:
                    c1, c2 = str(st2), str(st2)
                    an1, an2 = syn_ant(str(st2[j]))[1][0], syn_ant(str(st2[j]))[0][0] 
                    c1 = c1.replace(str(st2[j]), an1)
                    c2 = c2.replace(str(st2[j]), an2)
                    dist[i].append(c1)
                    dist[i].append(c2)
                    dist[i].append('None of the above')
                    done[i]+=3
                    break
                    
                elif len(syn_ant(str(st2[j]))[0]) >= 2:
                    c1, c2 = str(st2), str(st2)
                    an1, an2 = syn_ant(str(st2[j]))[0][0], syn_ant(str(st2[j]))[0][1] 
                    c1 = c1.replace(str(st2[j]), an1)
                    c2 = c2.replace(str(st2[j]), an2)
                    dist[i].append(c1)
                    dist[i].append(c2)
                    dist[i].append('None of the above')
                    done[i]+=3
                    break
    if done[i] == 0:
        st2 = nlp(train['answer_text'].values[i])
        for j in range(len(st2)):
            if st2[j].pos_ == 'PROPN':
                if len(syn_ant(str(st2[j]))[1])>2:
                    c1, c2 = str(st2), str(st2)
                    an1, an2 = syn_ant(str(st2[j]))[1][0], syn_ant(str(st2[j]))[1][1] 
                    c1 = c1.replace(str(st2[j]), an1)
                    c2 = c2.replace(str(st2[j]), an2)
                    dist[i].append(c1)
                    dist[i].append(c2)
                    dist[i].append('None of the above')
                    done[i]+=3
                    break
                    
                elif len(syn_ant(str(st2[j]))[1]) == 1:
                    c1, c2 = str(st2), str(st2)
                    an1, an2 = syn_ant(str(st2[j]))[1][0], syn_ant(str(st2[j]))[0][0] 
                    c1 = c1.replace(str(st2[j]), an1)
                    c2 = c2.replace(str(st2[j]), an2)
                    dist[i].append(c1)
                    dist[i].append(c2)
                    dist[i].append('None of the above')
                    done[i]+=3
                    break
                    
                elif len(syn_ant(str(st2[j]))[0]) >= 2:
                    c1, c2 = str(st2), str(st2)
                    an1, an2 = syn_ant(str(st2[j]))[0][0], syn_ant(str(st2[j]))[0][1] 
                    c1 = c1.replace(str(st2[j]), an1)
                    c2 = c2.replace(str(st2[j]), an2)
                    dist[i].append(c1)
                    dist[i].append(c2)
                    dist[i].append('None of the above')
                    done[i]+=3
                    break   
    if done[i] == 0:
        st2 = nlp(train['answer_text'].values[i])
        for j in range(len(st2)):
            if st2[j].pos_ == 'CCONJ' or st2[j].pos_ == 'CONJ':
                c1, c2 = str(st2), str(st2)
                chan = False
                if c1 == 'AND':
                    c1 = 'OR'
                    c2 = 'NOR'
                    chan = True
                elif c1 == 'OR':
                    c1 = 'AND'
                    c2 = 'NOR'
                    chan = True
                if c2 == 'AND':
                    c2 = 'OR'
                    c1 = 'NOR'
                    chan = True
                elif c2 == 'OR':
                    c2 = 'AND'
                    c1 = 'NOR'
                    chan = True
                if chan==False:
                    c1 ,c2= 'AND', 'OR'
                c1 = c1.replace(str(st2[j]), an1)
                c2 = c2.replace(str(st2[j]), an2)
                dist[i].append(c1)
                dist[i].append(c2)
                dist[i].append('None of the above')
                done[i]+=3
                break
    if done[i] == 0:
        dist[i].append(c1)
        dist[i].append(c2)
        dist[i].append('None of the above')
        done[i]+=3
        break
                 
#     if done[i] == 0:
#         st2 = nlp(train['answer_text'].values[i])                
#         len(syn_ant(str(st2[j]))[0]) >= 2:
#         c1, c2 = str(st2), str(st2)
#         an1, an2 = syn_ant(str(st2[j]))[0][0], syn_ant(str(st2[j]))[0][1] 
#         c1 = c1.replace(str(st2[j]), an1)
#         c2 = c2.replace(str(st2[j]), an2)
#         dist[i].append(c1)
#         dist[i].append(c2)
#         dist[i].append('None of the above')
#         done[i]+=3
    

100%|██████████| 31499/31499 [05:20<00:00, 105.70it/s]


In [63]:
np.unique(done, return_counts=True)

(array([0, 3]), array([  156, 31343]))

In [66]:
np.argsort(done)

array([  665, 20829,  9069, ..., 10529, 10527, 31498])

In [68]:
centre[665]

['traveline']

In [52]:
for i in range(len(st2)):
    print(st2[i], st2[i].pos_)

Sony PROPN
Ericsson PROPN
's PART
W800i PROPN
and CCONJ
Sumsung PROPN
's PART
SPH PROPN
- PUNCT
V5400 PROPN


In [22]:
for j in range(len(st2o)):
    if st2o[j].pos_ == 'ADP':
        print(st2o[j])

NameError: name 'st2o' is not defined

In [24]:
st2

'A discussion about what makes a good student .'

In [58]:
syn_ant('and')

([], [])

In [239]:
train['answer_text']

0                                   in rooms at 9:00 p. m.
1        The local government can deal with the problem...
2               help them realize their influence on Tommy
3             the writer is not very willing to use idioms
4                              Stay calm and do n't move .
5                        She missed her family very much .
6                    Chen Jianqing and one of her partners
7        Soccer is popular all over the world , but tru...
8                                                    panic
9        Five children died in a kindergarten bus accid...
10                     we have a fight with our classmates
11       The king tried to control the national parliam...
12       Wang Jinbi wants to transform her hukou back ,...
13       Poverty and family problems contribute to chil...
14       Fighting back tears may cause some health prob...
15       choosing food is related to Asian - American '...
16              make America get ahead in digital learni

In [139]:
np.unique(done, return_counts=True) # So for around 763 samples we can just change numbers and create different options

(array([0, 1]), array([30736,   763]))

In [135]:
str(int(centre[0][6]) + 1)

'10'

In [190]:
syn_ant('other')[1]

['same']

In [166]:
sample.head()

Unnamed: 0,question,answer_text,distractor
0,"Which city's landmarks include: The Pantheon, ...",Rome,"‘Barcelona’,’Athens’,’Istanbul’"
1,What is the color of Donald Duck's bowtie?,Red,"‘Blue’,’Yellow’,’Green’"
2,What’s a web browser?,A software program that allows you to access s...,"‘A kind of spider’,’A computer that stores WWW..."


In [167]:
sample['distractor'][0]

'‘Barcelona’,’Athens’,’Istanbul’'