In [1]:
from fastai import *
from fastai.text import *
from fastai.datasets import *
from pathlib import Path
import html
import fastai

In [2]:
torch.manual_seed(42)
torch.cuda.manual_seed(42)
np.random.seed(42)
random.seed(42)
torch.backends.cudnn.deterministic=True

In [3]:
# Check that the latest 1.0.0 build of PyTorch has been installed 
# alongside fastai
import torch
print("Cuda available" if torch.cuda.is_available() is True else "CPU")
print("PyTorch version: ", torch.__version__)

Cuda available
PyTorch version:  1.0.0.dev20190320


In [4]:
PATH=Path('/home/dell/Code/fastai_examples/pubmed-rct-200k/')
PATH.mkdir(exist_ok=True)
DATA_PATH=Path('/home/dell/Code/fastai_examples/data/pubmed-rct-200k')
DATA_PATH.mkdir(exist_ok=True)

In [5]:
#path = untar_data("http://files.fast.ai/data/examples/imdb_sample.tgz",dest=DATA_PATH)
#path = Path('/home/dell/Code/fastai_examples/data/imdb_sample/')

In [6]:
dataProcess = False
modelTrain = False

In [7]:
if dataProcess:
    testData = pd.read_csv(DATA_PATH/'test.txt', sep="\t", header=None, comment='#')
    testData.columns = ["label", "text"]
    testData.dropna(inplace=True)
    testData.head()

In [8]:
if dataProcess:
    trainData = pd.read_csv(DATA_PATH/'train.txt', sep="\t", header=None, comment='#')
    trainData.columns = ["label", "text"]
    trainData.dropna(inplace=True)
    trainData.head()

In [9]:
import pixiedust

Pixiedust database opened successfully


In [10]:
#%%pixie_debugger

if dataProcess:
  # Language model data
  data_lm = TextLMDataBunch.from_df(PATH,train_df=trainData, valid_df = testData)
  data_lm.save(PATH/'data_lm_export.pkl')
  # Classifier model data
  data_clas = TextClasDataBunch.from_df(PATH, train_df=trainData,valid_df = testData , \
                                        vocab=data_lm.train_ds.vocab, bs=32)
  data_clas.save(PATH/'data_clas_export.pkl')
else:
  data_lm = load_data(PATH, file='data_lm_export.pkl')
  data_clas = load_data(PATH, file='data_clas_export.pkl', bs=16)



In [11]:
if modelTrain:
    learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.5, pretrained=False)
    learn.lr_find()
    learn.recorder.plot()

In [12]:
if modelTrain:
    preTrainedWt103Path = Path('/home/dell/Code/fastai_examples/data/models/wt103')
    learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.5, pretrained=False)
    learn.load_pretrained(wgts_fname = preTrainedWt103Path/'fwd_wt103.h5', itos_fname = preTrainedWt103Path/'itos_wt103.pkl', strict=False )
    learn.fit_one_cycle(20, 1e-2)
    pathModel = learn.save(PATH/"trained_model")
else:
    learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.5, pretrained=False)
    learn.load(PATH/"trained_model")

In [13]:
print(fastai.__version__)

1.0.52.dev0


In [14]:
#%%pixie_debugger
#learn.predict("TP53 is an important gene ", n_words=30)

In [15]:
x_inp = ["i like apples",
         "i want to buy some apples",
         "where is your cell phone"]

In [182]:
x_inp2 = ["i like apples and orange",
          "i am going to buy some apples and orange",
          "you like apples and orange",
         "you hate all fruits especially apples and orange"
         ]

In [183]:
x_inp3 = ["let's talk about fruits for a second. Apples are nice. Oranges too. I kinda like them.",
         "i compared the prices of apples and oranges at walmart and kroger stores",
         "oh you wanna talk about apples. sure. i am not sure if i have said this before but i do like them and oranges."]

In [184]:
x_inp4 = ["acupuncture has no systemic hazard for pregnancy",
         "anxiety might have harmful effects for pregnancy",
         "anxiety is bad for pregnancy"]

In [185]:
x_inp5 = ["protein",
         "gene",
         "cell",
         "asthma",
         "lung"]

In [227]:
similarity_test_input = x_inp3

In [228]:
import multiprocessing

n_cpu = multiprocessing.cpu_count()


In [229]:
tok = Tokenizer().process_all(partition_by_cores(similarity_test_input,n_cpu-1))

In [230]:
for t in tok:
    print(t[2:-2])

['let', "'s", 'talk', 'about', 'fruits', 'for', 'a', 'second', '.', 'xxmaj', 'apples', 'are', 'nice', '.', 'xxmaj', 'oranges', 'too', '.', 'i', 'kinda', 'like', 'them', '.']
['i', 'compared', 'the', 'prices', 'of', 'apples', 'and', 'oranges', 'at', 'walmart', 'and', 'kroger', 'stores']
['oh', 'you', 'wanna', 'talk', 'about', 'apples', '.', 'sure', '.', 'i', 'am', 'not', 'sure', 'if', 'i', 'have', 'said', 'this', 'before', 'but', 'i', 'do', 'like', 'them', 'and', 'oranges', '.']


In [231]:
X = [[data_lm.vocab.stoi[o1] for o1 in o if data_lm.vocab.stoi[o1] != 0] for o in tok];
for truncX in X:
    print(truncX[2:-2])

[246, 11221, 459, 6401, 24, 19, 415, 9, 4, 25410, 92, 16975, 9, 4, 4527, 9, 300, 1624, 1204]
[300, 56, 10, 13808, 11, 25410, 12, 30, 12, 7925]
[2764, 11760, 11221, 459, 25410, 9, 21590, 9, 300, 2955, 45, 21590, 378, 300, 132, 10107, 40, 145, 89, 300, 1113, 1624, 1204, 12, 9]


In [232]:
m = learn.model

In [233]:
m[0]

AWD_LSTM(
  (encoder): Embedding(60003, 400, padding_idx=1)
  (encoder_dp): EmbeddingDropout(
    (emb): Embedding(60003, 400, padding_idx=1)
  )
  (rnns): ModuleList(
    (0): WeightDropout(
      (module): LSTM(400, 1150, batch_first=True)
    )
    (1): WeightDropout(
      (module): LSTM(1150, 1150, batch_first=True)
    )
    (2): WeightDropout(
      (module): LSTM(1150, 400, batch_first=True)
    )
  )
  (input_dp): RNNDropout()
  (hidden_dps): ModuleList(
    (0): RNNDropout()
    (1): RNNDropout()
    (2): RNNDropout()
  )
)

In [234]:
#%%pixie_debugger
# Set batch size to 1
m[0].bs=1
# Turn off dropout
m.eval()
# Reset hidden state
m.reset()

embeddings = []
with torch.no_grad():
    for sentence in X:
        #resizeLength = len(x_inp4)
        resizeLength = len(sentence[2:-2])
        input_sentence = tensor(sentence[2:-2])
        input_sentence.resize_(resizeLength,1)
        # sentence encoding 400 dims. -1 is the last element that's supposed to have the final encoded state
        tmpEmbded = m[0](input_sentence.cuda())
        embeddings.append(tmpEmbded[0][2].mean(0)[-1])

In [235]:
#kk0=m[0](input0.cuda())
#kk1=m[0](input1.cuda())
#kk2=m[0](input2.cuda())
#kk3=m[0](input3.cuda())

In [236]:
#kk0 = (kk0[0][2][0][-1]) # 1st sentence encoding 400 dims. -1 is the last element that's supposed to have the final encoded state
#kk1 = (kk1[0][2][0][-1]) # 2nd sentence encoding 400 dims
#kk2 = (kk2[0][2][0][-1]) # 3rd sentence encoding 400 dims
#kk3 = (kk3[0][2][0][-1]) # 4rd sentence encoding 400 dims

In [237]:
# cosine similarity - to check quality of our sentence encoder
def cos_sim(v1,v2):
    return F.cosine_similarity(tensor(v1).unsqueeze(0),tensor(v2).unsqueeze(0)).mean()

In [238]:
similarity_test_input

["let's talk about fruits for a second. Apples are nice. Oranges too. I kinda like them.",
 'i compared the prices of apples and oranges at walmart and kroger stores',
 'oh you wanna talk about apples. sure. i am not sure if i have said this before but i do like them and oranges.']

In [239]:
for i in range(len(embeddings)):
    for j in range(len(embeddings)):
        print(similarity_test_input[i] + "\n... with ....\n" + similarity_test_input[j])
        print(cos_sim(embeddings[i],embeddings[j]).item())
    print("\n")

let's talk about fruits for a second. Apples are nice. Oranges too. I kinda like them.
... with ....
let's talk about fruits for a second. Apples are nice. Oranges too. I kinda like them.
1.0
let's talk about fruits for a second. Apples are nice. Oranges too. I kinda like them.
... with ....
i compared the prices of apples and oranges at walmart and kroger stores
0.42818450927734375
let's talk about fruits for a second. Apples are nice. Oranges too. I kinda like them.
... with ....
oh you wanna talk about apples. sure. i am not sure if i have said this before but i do like them and oranges.
0.6155552268028259


i compared the prices of apples and oranges at walmart and kroger stores
... with ....
let's talk about fruits for a second. Apples are nice. Oranges too. I kinda like them.
0.42818450927734375
i compared the prices of apples and oranges at walmart and kroger stores
... with ....
i compared the prices of apples and oranges at walmart and kroger stores
1.0
i compared the prices o