In [45]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from text_utils import clean_text
import numpy as np

UNK = "UNK"
PAD = "PAD"
NAN = "NAN"

In [13]:
from gensim.models import KeyedVectors
filename = '/mnt/069A453E9A452B8D/Ram/Downloads/GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(filename, binary=True)

In [4]:
df = pd.read_csv("science_bot/train.csv")
df.head()

Unnamed: 0,id,answerA,answerB,answerC,answerD,correctAnswer,question
0,415,reflected sunlight,absorbed light from Earth's atmosphere,gases in the Moon's interior,volcanic eruptions on the Moon's surface,A,The Moon is visible to observers on Earth beca...
1,158,grasses -> trees -> bushes,trees -> bushes -> grasses,bushes -> grasses -> trees,grasses -> bushes -> trees,D,Which order of succession of natural communiti...
2,1959,an ion,a nucleus,a neutron,an electron,B,If the solar system were used as a model of an...
3,2542,Gravity converts solid matter into gases and l...,Gravity causes gases and dust particles to con...,Gravity cools gases and liquids until they bec...,Gravity pushes rocks and dust particles outwar...,B,Which of the following statements best describ...
4,1059,centimeters,grams,liters,degrees Celsius,A,Which unit of measurement can be used to descr...


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4610 entries, 0 to 4609
Data columns (total 7 columns):
id               4610 non-null object
answerA          4610 non-null object
answerB          4610 non-null object
answerC          4591 non-null object
answerD          4610 non-null object
correctAnswer    4610 non-null object
question         4610 non-null object
dtypes: object(7)
memory usage: 252.2+ KB


In [6]:
df.fillna("", inplace=True)
df["text"] = df.question +" "+ df.answerA + " " + df.answerC + " " + df.answerD

In [9]:
df.text = df.text.apply(clean_text)
df.answerA = df.answerA.apply(clean_text)
df.answerB = df.answerB.apply(clean_text)
df.answerC = df.answerC.apply(clean_text)
df.answerD = df.answerD.apply(clean_text)
df.question = df.question.apply(clean_text)
df.head()

Unnamed: 0,id,answerA,answerB,answerC,answerD,correctAnswer,question,text
0,415,reflected sunlight,absorbed light from earth s atmosphere,gas in the moon s interior,volcanic eruption on the moon s surface,A,the moon is visible to observer on earth becau...,the moon is visible to observer on earth becau...
1,158,grass tree bush,tree bush grass,bush grass tree,grass bush tree,D,which order of succession of natural community...,which order of succession of natural community...
2,1959,an ion,a nucleus,a neutron,an electron,B,if the solar system were used a a model of an ...,if the solar system were used a a model of an ...
3,2542,gravity convert solid matter into gas and ligh...,gravity cause gas and dust particle to condens...,gravity cool gas and liquid until they become ...,gravity push rock and dust particle outward fr...,B,which of the following statement best describe...,which of the following statement best describe...
4,1059,centimeter,gram,liter,degree celsius,A,which unit of measurement can be used to descr...,which unit of measurement can be used to descr...


In [10]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df.text)

In [49]:
len(vectorizer.vocabulary_)

7433

In [19]:
print(len(vectorizer.vocabulary_))
voc = [UNK, PAD, NAN]
for word in vectorizer.vocabulary_.keys():
    if word in model:
        voc.append(word)
    else:
        print(word)
print(len(voc))

7433
decomposer
betelgeuse
marta
rna
offit
aristotle
bunsen
hooke
mrna
hypha
bliesner
microfilaments
endosymbiotic
agnes
martina
heterotroph
spirogyra
niels
urey
mechanoreceptor
okaloosae
goeran
katia
linnaeus
raquel
rittman
pauling
roberta
plantae
ariana
antonia
francie
mgo
wegener
faraday
rossby
arno
feo
fossiliferous
dickson
gravitropism
matthias
helena
freda
terrence
stringlike
schwann
schleiden
gagarin
uniformitarianism
of
gemini
protista
glossopteris
lithification
hqt
winslow
weimer
agno
annette
integumentary
bohr
seismogram
chemiosmotic
metre
repare
nahco
eeeww
theodore
mendel
ammonification
epididymus
adp
nondisjunction
gregor
galapagos
moseley
notrons
catalogue
penzias
joann
argyra
punnett
halley
lystrosaurus
mesozoic
yuma
jenner
rutherford
eohippus
minamata
and
grey
centauri
saharan
sooj
mcclintock
granville
obsoleta
nonpermeable
surinam
galileo
phototropism
labium
euglena
mendoza
katelyn
dmitri
felicia
alva
lithobates
chesapeake
torricelli
supergrow
dimitri
permian
cambrian


In [28]:
vocab_map = {}
with open("vocab.txt", "w") as f:
    for i, w in enumerate(voc):
        f.write(w + " " + str(i) + "\n")
        vocab_map[w] = i

In [32]:
def text_to_index(text):
    text = text.strip()
    if text == "":
        return str(vocab_map[NAN])
    
    index = ""
    tokens = text.split()
    for t in tokens:
        if t in vocab_map:
            index = index + " " + str(vocab_map[t])
        else:
            index = index + " " + str(vocab_map[UNK])
    return index.lstrip()

def label_to_int(label):
    if label == "A":
        return 0
    elif label == "B":
        return 1
    elif label == "C":
        return 2
    elif label == "D":
        return 3
    else:
        raise ValueError()
        
print(text_to_index("my name sadasd"))
print(label_to_int("B"))

5396 2071 0
1


In [34]:
df.answerA = df.answerA.apply(text_to_index)
df.answerB = df.answerB.apply(text_to_index)
df.answerC = df.answerC.apply(text_to_index)
df.answerD = df.answerD.apply(text_to_index)
df.question = df.question.apply(text_to_index)
df.correctAnswer = df.correctAnswer.apply(label_to_int)

In [35]:
df.head()

Unnamed: 0,id,answerA,answerB,answerC,answerD,correctAnswer,question,text
0,415,5674 7018,2404 2826 500 3837 0 1937,3688 6494 3318 5330 0 5989,3684 5490 4618 3318 5330 0 4072,0,3318 5330 372 6723 0 2291 4618 3837 4013 0,the moon is visible to observer on earth becau...
1,158,3881 6824 3435,6824 3435 3881,3435 3881 6824,3881 3435 6824,3,6054 5168 0 1378 0 6152 130 2492 6206 4338 539...,which order of succession of natural community...
2,1959,426 1192,0 6026,0 5996,426 128,1,335 3318 2728 4385 7182 6895 0 0 2288 0 426 25...,if the solar system were used a a model of an ...
3,2542,1217 6655 3328 2414 6015 3688 0 2826 4654,1217 2470 3688 0 1176 4193 0 4929 6015 6634,1217 1257 3688 0 3444 6525 2782 1797 6753 3328...,1217 4289 7011 0 1176 4193 2272 500 0 5281 3723,1,6054 0 3318 3268 3482 6234 6529 3318 4023 0 12...,which of the following statement best describe...
4,1059,3095,641,2182,6338 6666,0,6054 1717 0 6732 2496 6259 6895 0 1131 3318 25...,which unit of measurement can be used to descr...


In [55]:
dftrain = df.iloc[:3600]
dfval = df.iloc[3600:]

In [58]:
dftrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600 entries, 0 to 3599
Data columns (total 8 columns):
id               3600 non-null object
answerA          3600 non-null object
answerB          3600 non-null object
answerC          3600 non-null object
answerD          3600 non-null object
correctAnswer    3600 non-null int64
question         3600 non-null object
text             3600 non-null object
dtypes: int64(1), object(7)
memory usage: 225.1+ KB


In [57]:
dfval.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010 entries, 3600 to 4609
Data columns (total 8 columns):
id               1010 non-null object
answerA          1010 non-null object
answerB          1010 non-null object
answerC          1010 non-null object
answerD          1010 non-null object
correctAnswer    1010 non-null int64
question         1010 non-null object
text             1010 non-null object
dtypes: int64(1), object(7)
memory usage: 63.2+ KB


In [59]:
dftrain.to_csv("train_processed.csv",index=False)
dfval.to_csv("val_processed.csv",index=False)

In [60]:
dftest = pd.read_csv("science_bot/test.csv")
dftest.head()

Unnamed: 0,id,answerA,answerB,answerC,answerD,question
0,2620,amphibians,birds,mammals,reptiles,"One type of animal hatches from an egg, breath..."
1,2187,Element 1 is polished to form a smooth surface.,Element 2 is heated and evaporates.,"Element 3 develops a white, powdery surface af...",Element 4 is separated from a mixture by filtr...,Which is a chemical change?
2,2275,seismograph,surface wave graph,magnitude graph,intensity graph,What type of instrument is used to record eart...
3,2510,digestive,excretory,immune,respiratory,"Which body system typically recognizes, attack..."
4,1093,watching television,smoking cigarettes,eating candy,exercising every day,Which activity is an example of a good health ...


In [61]:
dftest.info()
dftest.fillna("", inplace=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 516 entries, 0 to 515
Data columns (total 6 columns):
id          516 non-null int64
answerA     516 non-null object
answerB     516 non-null object
answerC     513 non-null object
answerD     516 non-null object
question    516 non-null object
dtypes: int64(1), object(5)
memory usage: 24.3+ KB


In [62]:
dftest.answerA = dftest.answerA.apply(clean_text)
dftest.answerB = dftest.answerB.apply(clean_text)
dftest.answerC = dftest.answerC.apply(clean_text)
dftest.answerD = dftest.answerD.apply(clean_text)
dftest.question = dftest.question.apply(clean_text)

dftest.answerA = dftest.answerA.apply(text_to_index)
dftest.answerB = dftest.answerB.apply(text_to_index)
dftest.answerC = dftest.answerC.apply(text_to_index)
dftest.answerD = dftest.answerD.apply(text_to_index)
dftest.question = dftest.question.apply(text_to_index)

dftest.head()

Unnamed: 0,id,answerA,answerB,answerC,answerD,question
0,2620,4900,6603,2505,2574,6753 834 0 4065 1344 500 426 189 6084 7181 471...
1,2187,78 372 3269 0 4297 0 4687 4072,78 372 1266 0 4885,78 6965 0 4417 6750 4072 453 4352 6494 2812,78 372 3012 500 0 6293 4112 7198,6054 372 0 6611 1880
2,2275,1763,4072 389 5878,2172 5878,1656 5878,3743 834 0 4161 372 6895 0 1703 83
3,2510,220,4553,5654,1351,6054 6243 4385 6788 7184 691 0 3973 5544 5004 ...
4,1093,4252 5428,79 1846,734 3419,731 6227 2259,6054 3880 372 426 2919 0 0 3943 1948 6358


In [63]:
dftest.to_csv("test_processed.csv",index=False)

In [None]:
embadding_mat = np.zeros(())