In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from text_utils import clean_text
import numpy as np

UNK = "UNK"
PAD = "PAD"
NAN = "NAN"

In [2]:
from gensim.models import KeyedVectors
filename = '/mnt/069A453E9A452B8D/Ram/Downloads/GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(filename, binary=True)

In [3]:
df = pd.read_csv("science_bot/train.csv")
df.head()

Unnamed: 0,id,answerA,answerB,answerC,answerD,correctAnswer,question
0,415,reflected sunlight,absorbed light from Earth's atmosphere,gases in the Moon's interior,volcanic eruptions on the Moon's surface,A,The Moon is visible to observers on Earth beca...
1,158,grasses -> trees -> bushes,trees -> bushes -> grasses,bushes -> grasses -> trees,grasses -> bushes -> trees,D,Which order of succession of natural communiti...
2,1959,an ion,a nucleus,a neutron,an electron,B,If the solar system were used as a model of an...
3,2542,Gravity converts solid matter into gases and l...,Gravity causes gases and dust particles to con...,Gravity cools gases and liquids until they bec...,Gravity pushes rocks and dust particles outwar...,B,Which of the following statements best describ...
4,1059,centimeters,grams,liters,degrees Celsius,A,Which unit of measurement can be used to descr...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4610 entries, 0 to 4609
Data columns (total 7 columns):
id               4610 non-null object
answerA          4610 non-null object
answerB          4610 non-null object
answerC          4591 non-null object
answerD          4610 non-null object
correctAnswer    4610 non-null object
question         4610 non-null object
dtypes: object(7)
memory usage: 252.2+ KB


In [5]:
df.fillna("", inplace=True)
df["text"] = df.question +" "+ df.answerA + " " + df.answerC + " " + df.answerD

In [6]:
df.text = df.text.apply(clean_text)
df.answerA = df.answerA.apply(clean_text)
df.answerB = df.answerB.apply(clean_text)
df.answerC = df.answerC.apply(clean_text)
df.answerD = df.answerD.apply(clean_text)
df.question = df.question.apply(clean_text)
df.head()

Unnamed: 0,id,answerA,answerB,answerC,answerD,correctAnswer,question,text
0,415,reflected sunlight,absorbed light from earth s atmosphere,gas in the moon s interior,volcanic eruption on the moon s surface,A,the moon is visible to observer on earth becau...,the moon is visible to observer on earth becau...
1,158,grass tree bush,tree bush grass,bush grass tree,grass bush tree,D,which order of succession of natural community...,which order of succession of natural community...
2,1959,an ion,a nucleus,a neutron,an electron,B,if the solar system were used a a model of an ...,if the solar system were used a a model of an ...
3,2542,gravity convert solid matter into gas and ligh...,gravity cause gas and dust particle to condens...,gravity cool gas and liquid until they become ...,gravity push rock and dust particle outward fr...,B,which of the following statement best describe...,which of the following statement best describe...
4,1059,centimeter,gram,liter,degree celsius,A,which unit of measurement can be used to descr...,which unit of measurement can be used to descr...


In [7]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df.text)

In [8]:
len(vectorizer.vocabulary_)

7434

In [9]:
voc = [UNK, PAD, NAN]
for word in vectorizer.vocabulary_.keys():
    voc.append(word)
    #if word in model:
    #    voc.append(word)
    #else:
    #    print(word)
    
print(len(voc))

7434
7437


In [10]:
vocab_map = {}
with open("vocab.txt", "w") as f:
    for i, w in enumerate(voc):
        f.write(w + " " + str(i) + "\n")
        vocab_map[w] = i

In [11]:
def text_to_index(text):
    text = text.strip()
    if text == "":
        return str(vocab_map[NAN])
    
    index = ""
    tokens = text.split()
    for t in tokens:
        if t in vocab_map:
            index = index + " " + str(vocab_map[t])
        else:
            index = index + " " + str(vocab_map[UNK])
    return index.lstrip()

def label_to_int(label):
    if label == "A":
        return 0
    elif label == "B":
        return 1
    elif label == "C":
        return 2
    elif label == "D":
        return 3
    else:
        raise ValueError()
        
print(text_to_index("my name sadasd"))
print(label_to_int("B"))

7204 7203 0
1


In [12]:
df.answerA = df.answerA.apply(text_to_index)
df.answerB = df.answerB.apply(text_to_index)
df.answerC = df.answerC.apply(text_to_index)
df.answerD = df.answerD.apply(text_to_index)
df.question = df.question.apply(text_to_index)
df.correctAnswer = df.correctAnswer.apply(label_to_int)

In [13]:
df.head()

Unnamed: 0,id,answerA,answerB,answerC,answerD,correctAnswer,question,text
0,415,4270 1830,3457 4774 6022 1671 0 2923,5110 5350 6579 492 0 1731,7176 1869 2623 6579 492 0 2691,0,6579 492 1297 7312 6194 1556 2623 1671 6321 1126,the moon is visible to observer on earth becau...
1,158,2618 1377 381,1377 381 2618,381 2618 1377,2618 381 1377,3,3622 1305 1126 3590 1126 3891 4668 5109 42 325...,which order of succession of natural community...
2,1959,7436 7402,0 2998,0 7121,7436 4423,1,3253 6579 287 6999 539 3756 0 0 3137 1126 7436...,if the solar system were used a a model of an ...
3,2542,4595 5301 1575 5320 6958 5110 2317 4774 1242,4595 5645 5110 2317 2782 80 6194 4114 6958 7072,4595 5504 5110 2317 3427 4741 6118 6035 3131 1...,4595 5669 2832 2317 2782 80 1291 6022 0 2894 423,1,3622 1126 6579 5420 564 1231 5694 6579 1616 11...,which of the following statement best describe...
4,1059,2836,3964,35,6122 6117,0,3622 2063 1126 3340 1071 3117 3756 6194 144 65...,which unit of measurement can be used to descr...


In [14]:
dftrain = df.iloc[:3600]
dfval = df.iloc[3600:]

In [15]:
dftrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600 entries, 0 to 3599
Data columns (total 8 columns):
id               3600 non-null object
answerA          3600 non-null object
answerB          3600 non-null object
answerC          3600 non-null object
answerD          3600 non-null object
correctAnswer    3600 non-null int64
question         3600 non-null object
text             3600 non-null object
dtypes: int64(1), object(7)
memory usage: 225.1+ KB


In [16]:
dfval.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010 entries, 3600 to 4609
Data columns (total 8 columns):
id               1010 non-null object
answerA          1010 non-null object
answerB          1010 non-null object
answerC          1010 non-null object
answerD          1010 non-null object
correctAnswer    1010 non-null int64
question         1010 non-null object
text             1010 non-null object
dtypes: int64(1), object(7)
memory usage: 63.2+ KB


In [17]:
dftrain.to_csv("train_processed.csv",index=False)
dfval.to_csv("val_processed.csv",index=False)

### Test Data Processing

In [18]:
dftest = pd.read_csv("science_bot/test.csv")
dftest.head()

Unnamed: 0,id,answerA,answerB,answerC,answerD,question
0,2620,amphibians,birds,mammals,reptiles,"One type of animal hatches from an egg, breath..."
1,2187,Element 1 is polished to form a smooth surface.,Element 2 is heated and evaporates.,"Element 3 develops a white, powdery surface af...",Element 4 is separated from a mixture by filtr...,Which is a chemical change?
2,2275,seismograph,surface wave graph,magnitude graph,intensity graph,What type of instrument is used to record eart...
3,2510,digestive,excretory,immune,respiratory,"Which body system typically recognizes, attack..."
4,1093,watching television,smoking cigarettes,eating candy,exercising every day,Which activity is an example of a good health ...


In [19]:
dftest.info()
dftest.fillna("", inplace=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 516 entries, 0 to 515
Data columns (total 6 columns):
id          516 non-null int64
answerA     516 non-null object
answerB     516 non-null object
answerC     513 non-null object
answerD     516 non-null object
question    516 non-null object
dtypes: int64(1), object(5)
memory usage: 24.3+ KB


In [20]:
dftest.answerA = dftest.answerA.apply(clean_text)
dftest.answerB = dftest.answerB.apply(clean_text)
dftest.answerC = dftest.answerC.apply(clean_text)
dftest.answerD = dftest.answerD.apply(clean_text)
dftest.question = dftest.question.apply(clean_text)

dftest.answerA = dftest.answerA.apply(text_to_index)
dftest.answerB = dftest.answerB.apply(text_to_index)
dftest.answerC = dftest.answerC.apply(text_to_index)
dftest.answerD = dftest.answerD.apply(text_to_index)
dftest.question = dftest.question.apply(text_to_index)

dftest.head()

Unnamed: 0,id,answerA,answerB,answerC,answerD,question
0,2620,7246,2414,3232,6185,3131 1756 1126 4082 5891 6022 7436 4173 2356 4...
1,2187,1283 1297 3177 6194 2215 0 6345 2691,1283 1297 6063 2317 1772,1283 924 0 4747 3642 2691 799 2464 5350 1069,1283 1297 2901 6022 0 1819 2749 6296,3622 1297 0 5185 6519
2,2275,1946,2691 1002 4340,2927 4340,882 4340,4879 1756 1126 3202 1297 3756 6194 4960 4765
3,2510,853,5102,2050,2907,3622 3587 6999 5328 5693 5041 2317 5187 2758 3...
4,1093,5539 955,1911 1364,6079 5944,7415 1202 6739,3622 6012 1297 7436 140 1126 0 6218 2566 5598


In [21]:
dftest.to_csv("test_processed.csv",index=False)

### Embedding Matrix

In [23]:
embadding_mat = np.zeros((len(voc), 300))
for i, word in enumerate(voc):
    if word in model:
        embadding_mat[i] = model[word]

In [33]:
import pickle
with open("emb_mat.pkl", "wb") as outfile:
    pickle.dump(embadding_mat, outfile)