In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from text_utils import clean_text
import numpy as np

UNK = "UNK"
PAD = "PAD"
NAN = "NAN"

In [25]:
from gensim.models import KeyedVectors
filename = '/mnt/069A453E9A452B8D/Ram/Downloads/GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(filename, binary=True)

In [3]:
df = pd.read_csv("science_bot/train.csv")
df.head()

Unnamed: 0,id,answerA,answerB,answerC,answerD,correctAnswer,question
0,415,reflected sunlight,absorbed light from Earth's atmosphere,gases in the Moon's interior,volcanic eruptions on the Moon's surface,A,The Moon is visible to observers on Earth beca...
1,158,grasses -> trees -> bushes,trees -> bushes -> grasses,bushes -> grasses -> trees,grasses -> bushes -> trees,D,Which order of succession of natural communiti...
2,1959,an ion,a nucleus,a neutron,an electron,B,If the solar system were used as a model of an...
3,2542,Gravity converts solid matter into gases and l...,Gravity causes gases and dust particles to con...,Gravity cools gases and liquids until they bec...,Gravity pushes rocks and dust particles outwar...,B,Which of the following statements best describ...
4,1059,centimeters,grams,liters,degrees Celsius,A,Which unit of measurement can be used to descr...


In [4]:
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,id,answerA,answerB,answerC,answerD,correctAnswer,question
0,592,Freshwater rivers flow to the ocean.,Saltwater fish are found in some mountain stre...,Dinosaur bones have been discovered in the mou...,Marine fossils have been found on the peaks of...,D,How do scientists know that some mountains wer...
1,1008,a pan balance,a stopwatch,a thermometer,a graduated cylinder,D,Which instrument would be best to measure the ...
2,942,color,size,smoothness,thickness,D,Skyler is selecting the kind of paper he wants...
3,1010,put all non-reusable glass in the trash.,pour all used chemicals down the sink.,collect and store recyclable material.,return used samples to natural settings.,C,The correct procedure after completing a labor...
4,1482,eye color,an infection,knowledge of soccer,length of hair,A,Which characteristic is the best example of an...


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4610 entries, 0 to 4609
Data columns (total 7 columns):
id               4610 non-null object
answerA          4610 non-null object
answerB          4610 non-null object
answerC          4591 non-null object
answerD          4610 non-null object
correctAnswer    4610 non-null object
question         4610 non-null object
dtypes: object(7)
memory usage: 252.2+ KB


In [6]:
df.fillna("", inplace=True)
df["text"] = df.question +" "+ df.answerA + " " + df.answerC + " " + df.answerD

In [7]:
df.text = df.text.apply(clean_text)
df.answerA = df.answerA.apply(clean_text)
df.answerB = df.answerB.apply(clean_text)
df.answerC = df.answerC.apply(clean_text)
df.answerD = df.answerD.apply(clean_text)
df.question = df.question.apply(clean_text)
df.head()

Unnamed: 0,id,answerA,answerB,answerC,answerD,correctAnswer,question,text
0,592,freshwater river flow to the ocean,saltwater fish are found in some mountain stream,dinosaur bone have been discovered in the moun...,marine fossil have been found on the peak of s...,D,how do scientist know that some mountain were ...,how do scientist know that some mountain were ...
1,1008,a pan balance,a stopwatch,a thermometer,a graduated cylinder,D,which instrument would be best to measure the ...,which instrument would be best to measure the ...
2,942,color,size,smoothness,thickness,D,skyler is selecting the kind of paper he want ...,skyler is selecting the kind of paper he want ...
3,1010,put all non reusable glass in the trash,pour all used chemical down the sink,collect and store recyclable material,return used sample to natural setting,C,the correct procedure after completing a labor...,the correct procedure after completing a labor...
4,1482,eye color,an infection,knowledge of soccer,length of hair,A,which characteristic is the best example of an...,which characteristic is the best example of an...


In [8]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df.text)

In [9]:
len(vectorizer.vocabulary_)

7637

In [10]:
voc = [PAD, UNK, NAN]
for word in vectorizer.vocabulary_.keys():
    voc.append(word)
    #if word in model:
    #    voc.append(word)
    #else:
    #    print(word)
    
print(len(voc))

7640


In [11]:
vocab_map = {}
with open("vocab.txt", "w") as f:
    for i, w in enumerate(voc):
        f.write(w + " " + str(i) + "\n")
        vocab_map[w] = i

In [12]:
def text_to_index_pad(text, max_len):
    text = text.strip()
    #if text == "":
    #    return str(vocab_map[NAN])
    
    index = ""
    tokens = text.split()
    curr_len = 0
    for t in tokens:
        if t in vocab_map:
            index = index + " " + str(vocab_map[t])
        else:
            index = index + " " + str(vocab_map[UNK])
        curr_len = curr_len + 1
        
    for i in range(curr_len, max_len):
        index = index + " " + str(vocab_map[PAD])
        
    return index.lstrip()

def label_to_int(label):
    if label == "A":
        return 0
    elif label == "B":
        return 1
    elif label == "C":
        return 2
    elif label == "D":
        return 3
    else:
        raise ValueError()
        
print(text_to_index_pad("my name sadasd", 5))
print(label_to_int("B"))

5820 3170 1 0 0
1


In [13]:
Amax = df.answerA.map(lambda x: len(x.split())).max()
Bmax = df.answerB.map(lambda x: len(x.split())).max()
Cmax = df.answerC.map(lambda x: len(x.split())).max()
Dmax = df.answerD.map(lambda x: len(x.split())).max()
Qmax = df.question.map(lambda x: len(x.split())).max()
print(Amax, Bmax, Cmax, Dmax, Qmax)

30 24 39 28 405


In [14]:
df.answerA = df.answerA.apply(lambda x: text_to_index_pad(x, Amax))
df.answerB = df.answerB.apply(lambda x: text_to_index_pad(x, Bmax))
df.answerC = df.answerC.apply(lambda x: text_to_index_pad(x, Cmax))
df.answerD = df.answerD.apply(lambda x: text_to_index_pad(x, Dmax))
df.question = df.question.apply(lambda x: text_to_index_pad(x, Qmax))
df.correctAnswer = df.correctAnswer.apply(label_to_int)

In [15]:
df.head()

Unnamed: 0,id,answerA,answerB,answerC,answerD,correctAnswer,question,text
0,592,2322 5116 876 714 6475 6516 0 0 0 0 0 0 0 0 0 ...,1948 3000 1239 2337 3845 171 4432 1495 0 0 0 0...,2425 4778 2630 1352 899 3845 6475 4432 0 0 0 0...,191 4166 2630 1352 2337 5218 6475 2131 7534 17...,3,4489 7425 3614 4583 7057 171 4432 4302 175 440...,how do scientist know that some mountain were ...
1,1008,1 5391 6886 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,1 6730 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,1 6643 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,1 7061 6625 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,3,2380 1456 106 7601 1389 714 6869 6475 595 7534...,which instrument would be best to measure the ...
2,942,1373 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,3793 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,597 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,6958 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,3,499 1084 2885 6475 3925 7534 3642 563 3294 714...,skyler is selecting the kind of paper he want ...
3,1010,2079 328 4056 1616 6710 3845 6475 5171 0 0 0 0...,5991 328 468 757 5042 6475 1175 0 0 0 0 0 0 0 ...,3594 1954 7396 6284 878 0 0 0 0 0 0 0 0 0 0 0 ...,2984 468 3130 714 2905 5568 0 0 0 0 0 0 0 0 0 ...,2,6475 1195 4863 883 2243 1 3952 5475 1084 714 0...,the correct procedure after completing a labor...
4,1482,5086 1373 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,2238 2447 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,1181 7534 7367 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,4699 7534 3760 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,0,2380 7137 1084 6475 1389 6348 7534 2238 4071 2...,which characteristic is the best example of an...


In [16]:
dftrain = df.iloc[:3600]
dfval = df.iloc[3600:]

In [17]:
dftrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600 entries, 0 to 3599
Data columns (total 8 columns):
id               3600 non-null object
answerA          3600 non-null object
answerB          3600 non-null object
answerC          3600 non-null object
answerD          3600 non-null object
correctAnswer    3600 non-null int64
question         3600 non-null object
text             3600 non-null object
dtypes: int64(1), object(7)
memory usage: 225.1+ KB


In [18]:
dfval.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010 entries, 3600 to 4609
Data columns (total 8 columns):
id               1010 non-null object
answerA          1010 non-null object
answerB          1010 non-null object
answerC          1010 non-null object
answerD          1010 non-null object
correctAnswer    1010 non-null int64
question         1010 non-null object
text             1010 non-null object
dtypes: int64(1), object(7)
memory usage: 63.2+ KB


In [19]:
dftrain.to_csv("train_processed.csv",index=False)
dfval.to_csv("val_processed.csv",index=False)

### Test Data Processing

In [20]:
dftest = pd.read_csv("science_bot/test.csv")
dftest.head()

Unnamed: 0,id,answerA,answerB,answerC,answerD,question
0,2620,amphibians,birds,mammals,reptiles,"One type of animal hatches from an egg, breath..."
1,2187,Element 1 is polished to form a smooth surface.,Element 2 is heated and evaporates.,"Element 3 develops a white, powdery surface af...",Element 4 is separated from a mixture by filtr...,Which is a chemical change?
2,2275,seismograph,surface wave graph,magnitude graph,intensity graph,What type of instrument is used to record eart...
3,2510,digestive,excretory,immune,respiratory,"Which body system typically recognizes, attack..."
4,1093,watching television,smoking cigarettes,eating candy,exercising every day,Which activity is an example of a good health ...


In [21]:
dftest.info()
dftest.fillna("", inplace=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 516 entries, 0 to 515
Data columns (total 6 columns):
id          516 non-null int64
answerA     516 non-null object
answerB     516 non-null object
answerC     513 non-null object
answerD     516 non-null object
question    516 non-null object
dtypes: int64(1), object(5)
memory usage: 24.3+ KB


In [22]:
dftest.answerA = dftest.answerA.apply(clean_text)
dftest.answerB = dftest.answerB.apply(clean_text)
dftest.answerC = dftest.answerC.apply(clean_text)
dftest.answerD = dftest.answerD.apply(clean_text)
dftest.question = dftest.question.apply(clean_text)

Amax = dftest.answerA.map(lambda x: len(x.split())).max()
Bmax = dftest.answerB.map(lambda x: len(x.split())).max()
Cmax = dftest.answerC.map(lambda x: len(x.split())).max()
Dmax = dftest.answerD.map(lambda x: len(x.split())).max()
Qmax = dftest.question.map(lambda x: len(x.split())).max()
print(Amax, Bmax, Cmax, Dmax, Qmax)

dftest.answerA = dftest.answerA.apply(lambda x: text_to_index_pad(x, Amax))
dftest.answerB = dftest.answerB.apply(lambda x: text_to_index_pad(x, Bmax))
dftest.answerC = dftest.answerC.apply(lambda x: text_to_index_pad(x, Cmax))
dftest.answerD = dftest.answerD.apply(lambda x: text_to_index_pad(x, Dmax))
dftest.question = dftest.question.apply(lambda x: text_to_index_pad(x, Qmax))

dftest.head()

17 26 27 31 343


Unnamed: 0,id,answerA,answerB,answerC,answerD,question
0,2620,6242 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0,2977 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,1124 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,6278 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,1225 4735 7534 6668 4711 4562 2238 7220 5528 1...
1,2187,2590 1 1084 3308 714 4993 1 5205 5384 0 0 0 0 ...,2590 1 1084 3985 1954 6382 0 0 0 0 0 0 0 0 0 0...,2590 1 4607 1 384 711 5384 883 4831 3845 2616 ...,2590 1 1084 2128 4562 1 1823 691 5694 0 0 0 0 ...,2380 1084 1 757 1182 0 0 0 0 0 0 0 0 0 0 0 0 0...
2,2275,1835 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0,5384 2629 6894 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,2180 6894 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,4447 6894 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,4167 4735 7534 1456 1084 468 714 7374 6850 0 0...
3,2510,3333 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0,2953 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,355 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,3858 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,2380 2588 5073 540 2396 1759 1954 459 3831 763...
4,1093,981 1595 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0,6374 508 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,2609 5874 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,2519 288 1351 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,2380 5616 1084 2238 6348 7534 1 2219 6147 3794...


In [23]:
dftest.to_csv("test_processed.csv",index=False)

### Embedding Matrix

In [26]:
embadding_mat = np.zeros((len(voc), 300))
for i, word in enumerate(voc):
    if word in model:
        embadding_mat[i] = model[word]

In [27]:
import pickle
with open("emb_mat.pkl", "wb") as outfile:
    pickle.dump(embadding_mat, outfile)