In [1]:
import torch
import pathlib
import seaborn as sns

import pandas as pd
import pickle
from utils.fileIterator import fileIterator
from utils.tokenizer import wordPieceTokenizer
from utils.funcTrackers import trackFunctionCall

In [2]:
# Pick the compute device once: CUDA when PyTorch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
@trackFunctionCall
def getTokenizer(tokenizerFilepath:pathlib.Path,newTokenizer=False):
    """
        Load a pickled tokenizer, or build a new one from the vocab files and pickle it.

        Parameters:
            tokenizerFilepath : pathlib.Path of the pickle file. It is read when
                                newTokenizer is False and written when it is True.
            newTokenizer      : when True, build a wordPieceTokenizer from
                                datafiles/DSSM/vocab_Q.wl and datafiles/DSSM/vocab_A.wl
                                (one entry per line) instead of loading the pickle.

        Returns:
            The unpickled tokenizer, or the newly built one.
    """
    if not newTokenizer:
        # NOTE: pickle.load can execute arbitrary code; only load pickle files you trust.
        with tokenizerFilepath.open("rb") as pickleDumpFile:
            return pickle.load(pickleDumpFile)

    with open("datafiles/DSSM/vocab_Q.wl") as vocabFile:
        wordInpVocab = vocabFile.read().strip().split("\n")
    with open("datafiles/DSSM/vocab_A.wl") as vocabFile:
        wordOutVocab = vocabFile.read().strip().split("\n")

    tokenizer = wordPieceTokenizer(wordInpVocab,wordOutVocab)

    # Save to the path the caller passed in (not a hard-coded file name), so a later
    # call with newTokenizer=False and the same path loads exactly this tokenizer.
    with tokenizerFilepath.open("wb") as pickleDumpFile:
        pickle.dump(tokenizer,pickleDumpFile,pickle.HIGHEST_PROTOCOL)
    return tokenizer

In [4]:
@trackFunctionCall
def getGroupedData(dataFrame,toGroup):
    """
        Collapse every group of rows into a single (inpSeq, outSeq) row.

        Rows of dataFrame are grouped by the column toGroup. Within each group the
        "inpSeq" values are joined with single spaces and stripped, and likewise
        the "outSeq" values. Both columns must therefore contain strings.

        Parameters:
            dataFrame : pandas DataFrame with "inpSeq" and "outSeq" columns plus
                        the grouping column.
            toGroup   : name of the column to group by.

        Returns:
            A new DataFrame with columns ["inpSeq","outSeq"], one row per group
            (groups in pandas' default sorted key order), indexed 0..n-1.
    """
    # Collect all rows first and build the frame once. Appending with .loc inside the
    # loop re-allocates the frame on every row. Iterating the groups directly also
    # avoids the label-based .loc lookup, which duplicates rows when the index has
    # repeated labels.
    groupedRows = [
        (" ".join(groupDf["inpSeq"]).strip()," ".join(groupDf["outSeq"]).strip())
        for _,groupDf in dataFrame.groupby(toGroup)
    ]
    return pd.DataFrame(groupedRows,columns=["inpSeq","outSeq"])

In [5]:
@trackFunctionCall
def convertCtf2Csv(filename:pathlib.Path):
    """
        Convert a "|"-delimited CTF file into a two-column CSV of sentence pairs.

        The file is read as three columns (pairTag, inpSeq, outSeq). Missing
        sequence cells are replaced by placeholder entries, each cell is reduced
        to the text between its last space and the following ":", and rows
        sharing a pairTag are merged with getGroupedData.

        Parameters:
            filename : pathlib.Path of the CTF file.

        Output:
            Writes <folder of filename>/pytorch/<stem of filename>.csv without
            header or index column. Returns nothing.
    """
    dataFrame = pd.read_csv(str(filename.resolve()),delimiter="|",header=None,names=["pairTag","inpSeq","outSeq"])
    # Assign the filled frame back instead of calling fillna(inplace=True) on a column
    # selection: that form is chained assignment and leaves dataFrame unchanged when
    # pandas Copy-on-Write is active.
    dataFrame = dataFrame.fillna({"inpSeq":"S0 \t:1\t","outSeq":"S1 \t:1"})

    # Keep the text after the last space and before the first ":" of that piece.
    funcToRemoveTags = lambda x:x.split(" ")[-1].split(":")[0]
    dataFrame["inpSeq"] = dataFrame["inpSeq"].apply(funcToRemoveTags)
    dataFrame["outSeq"] = dataFrame["outSeq"].apply(funcToRemoveTags)

    basepath     = filename.parent
    filestem     = filename.stem
    subFolder    = "pytorch"
    parentFolder = basepath      / subFolder
    filepath     = parentFolder / f"{filestem}.csv"

    # to_csv does not create missing directories, so make sure the output folder exists.
    parentFolder.mkdir(parents=True,exist_ok=True)

    cleanDfTrain = getGroupedData(dataFrame,"pairTag")
    cleanDfTrain.to_csv(str(filepath.resolve()),header=False,index=False)

In [6]:
#trainDataFramePath = pathlib.Path("./datafiles/DSSM/train.pair.tok.ctf")
#validDataFramePath = pathlib.Path("./datafiles/DSSM/valid.pair.tok.ctf")

#convertCtf2Csv(trainDataFramePath)
#convertCtf2Csv(validDataFramePath)

In [7]:
class dataSet(torch.utils.data.Dataset):
    """
        Map-style dataset that tokenizes sentence pairs on access.

        pairIterator must support len() and integer indexing, with each item
        unpacking into (inpSeq, outSeq). tokenizer must expose
        tokenize(sequence, tag); the tag is "inpSeq" or "outSeq".
    """
    def __init__(self,pairIterator,tokenizer):
        self.pairIterator,self.tokenizer = pairIterator,tokenizer

    def __len__(self):
        # Number of sentence pairs available in the underlying iterator.
        return len(self.pairIterator)

    def __getitem__(self,index):
        # Fetch the raw pair, tokenize each side with its own tag, wrap both as tensors.
        rawInp,rawOut = self.pairIterator[index]
        inpTokens     = self.tokenizer.tokenize(rawInp,"inpSeq")
        outTokens     = self.tokenizer.tokenize(rawOut,"outSeq")
        return torch.tensor(inpTokens),torch.tensor(outTokens)

In [8]:
class modelDSSM(torch.nn.Module):
    """
        Sentence encoder: embedding -> dropout -> GRU -> FC1 (ReLU, dropout) -> FC2 (tanh).

        TODO: Think of how we can implement attention in this!!

        Parameters:
            vocabSize        : number of rows in the token embedding table.
            wordEmbeddingDim : size of each token embedding.
            hiddenStateDim   : GRU hidden size per direction.
            sentenceVecDim   : size of the returned sentence vector.
            fc1Dim           : output size of the first fully connected layer.
            dropout1         : dropout probability applied to the embeddings.
            dropout2         : dropout probability applied after FC1.
            numRnnLayers     : number of stacked GRU layers.
            bidirectional    : whether the GRU is bidirectional.
            batchFirst       : True for (batch, seq) inputs, False for (seq, batch).
            debug            : when True, print tensor shapes during forward/initHidden.
    """
    def __init__(self,vocabSize,wordEmbeddingDim,hiddenStateDim,sentenceVecDim,fc1Dim,dropout1=0.5,dropout2=0.5,numRnnLayers=1,bidirectional=True,batchFirst=True,debug=True):
        super(modelDSSM,self).__init__()
        self.debug              = debug
        self.wordEmbeddingDim   = wordEmbeddingDim
        self.vocabSize          = vocabSize
        self.sentenceVecDim     = sentenceVecDim

        self.hiddenStateDim     = hiddenStateDim
        self.numRnnLayers       = numRnnLayers
        self.bidirectional      = bidirectional
        self.batchFirst         = batchFirst
        self.directions         = 2 if self.bidirectional else 1
        self.initHidenStateVar  = self.numRnnLayers*self.directions #Used in initializing hidden states

        self.fc1Dim             = fc1Dim
        # The GRU output concatenates both directions, so FC1 sees hiddenStateDim*directions features
        self.fc1InpDim          = self.hiddenStateDim*self.directions

        self.dropOut1Th         = dropout1
        self.dropOut2Th         = dropout2

        self.wordPieceEmbedding = torch.nn.Embedding(self.vocabSize,self.wordEmbeddingDim)
        self.rnn                = torch.nn.GRU(self.wordEmbeddingDim,self.hiddenStateDim,num_layers=self.numRnnLayers,bidirectional=self.bidirectional,batch_first=self.batchFirst)

        self.fc1                = torch.nn.Linear(self.fc1InpDim,self.fc1Dim)
        self.fc2                = torch.nn.Linear(self.fc1Dim,self.sentenceVecDim)

        self.dropOut1           = torch.nn.Dropout(self.dropOut1Th)
        self.dropOut2           = torch.nn.Dropout(self.dropOut2Th)

    def forward(self,input,rnnInitialHiddenStates=None):
        """
            Encode a batch of token-id sequences into sentence vectors.

            Parameters:
                input                  : integer token ids, (batch, seq) when batchFirst
                                         is True, otherwise (seq, batch).
                rnnInitialHiddenStates : initial GRU hidden state as produced by
                                         initHidden, or None to let the GRU start
                                         from zeros.

            Returns:
                Tensor of shape (batch, sentenceVecDim) with values in [-1, 1].
        """
        # Compute token embeddings(Wordpiece in this case)
        wordPieceEmbeddings = self.wordPieceEmbedding(input)
        wordPieceEmbeddings = self.dropOut1(wordPieceEmbeddings)
        self.printTensorShape("Wordpiece embeddings",wordPieceEmbeddings)

        # Calculate RNN Outputs
        rnnOutputs,_        = self.rnn(wordPieceEmbeddings,rnnInitialHiddenStates)
        self.printTensorShape("RNN",rnnOutputs)

        # Take the output of the last time step. The time axis is dim 1 only when
        # batchFirst is True; otherwise it is dim 0 and indexing dim 1 would pick
        # the last batch element instead.
        # NOTE(review): with bidirectional=True the backward half of this slice has only
        # seen the final token; consider using the GRU's final hidden state instead.
        if self.batchFirst:
            fc1Inpts        = rnnOutputs[:,-1,:]
        else:
            fc1Inpts        = rnnOutputs[-1,:,:]

        # Compute FC1 outputs and apply relu activation and dropout
        fc1Outputs          = self.fc1(fc1Inpts)
        fc1Outputs          = torch.nn.functional.relu(fc1Outputs)
        fc1Outputs          = self.dropOut2(fc1Outputs)
        self.printTensorShape("Fully Connected1",fc1Outputs)

        # Compute sentenceVectors and apply tanh activation.
        # torch.tanh computes the same values as torch.nn.functional.tanh, which some
        # PyTorch releases flag as deprecated.
        sentenceEmbeddings  = self.fc2(fc1Outputs)
        sentenceEmbeddings  = torch.tanh(sentenceEmbeddings)
        self.printTensorShape("Final sentence embeddings",sentenceEmbeddings)

        return sentenceEmbeddings


    def printTensorShape(self,layer,tensorUnderObservation):
        """Print the shape of a layer's output tensor when debug is enabled."""
        if self.debug:
            print(f"{layer} output shapes is: {tensorUnderObservation.shape}")

    def initHidden(self,batchSize):
        """
            Draw a random initial GRU hidden state from N(0, 0.5).

            Returns a tensor of shape (numRnnLayers*directions, batchSize, hiddenStateDim).
            It is created on the default (CPU) device; callers move it as needed.
        """
        hiddenStateShape = (self.initHidenStateVar,batchSize,self.hiddenStateDim)
        if self.debug:
            print (f"initial hidden dimension shape is: {hiddenStateShape}")
        return torch.normal(0,0.5,size=hiddenStateShape)

In [9]:
# Rebuild the tokenizer from the vocab files on every run (newTokenizer=True).
tokenizer     = getTokenizer(
                    pathlib.Path("./tokenizerIndexer.pkl"),
                    newTokenizer=True,
                )
# Keyword arguments passed to the DataLoader constructors.
dataParams    = dict(shuffle=True,batch_size=1)

Making a call to function: getTokenizer
Finished executing function: getTokenizer in 0.007960111999636865 seconds


In [10]:
# Iterators over the converted CSV files (tokenization is done later, in dataSet).
trainIterator    = fileIterator(pathlib.Path("datafiles/DSSM/pytorch/train.pair.tok.csv"),tokenize=False)
valIterator      = fileIterator(pathlib.Path("datafiles/DSSM/pytorch/valid.pair.tok.csv"),tokenize=False)

# Datasets that tokenize each pair on access.
trainDataIter    = dataSet(trainIterator,tokenizer)
valDataIter      = dataSet(valIterator,tokenizer)

# One DataLoader per phase, all built with the same dataParams.
dataLoaders      = {
                    phase : torch.utils.data.DataLoader(phaseDataSet,**dataParams)
                    for phase,phaseDataSet in (("train",trainDataIter),("val",valDataIter))
                   }
# Number of samples per phase.
dataLoadersLens  = dict(train=len(trainDataIter),val=len(valDataIter))

In [11]:
def noZeroDivision(dividend,divisor,rounOffVal):
    """
        Return dividend/divisor rounded to rounOffVal decimal places.

        If the division raises ZeroDivisionError the result is 0.
    """
    try:
        quotient = dividend/divisor
    except ZeroDivisionError:
        return 0
    return round(quotient,rounOffVal)

In [12]:
@trackFunctionCall
def train(model,optimizer,criterion,phaseList,numStepsBeforeVal,dataLoaders,dataLoadersLen,epochs=1):
    """
        Run the train/validation schedule and collect losses.

        The outer loop iterates range(0, epochs, numStepsBeforeVal). On each
        iteration every phase in phaseList is run once, and each phase is one
        full pass over dataLoaders[phase]. Every (inpSeq, outSeq) batch is
        treated as a matching pair: both are encoded by model and criterion is
        called with a target of ones.

        Parameters:
            model             : encoder exposing initHidden(batchSize) and
                                callable as model(sequence, hiddenState).
            optimizer         : optimizer over model's parameters.
            criterion         : loss called as criterion(vecInp, vecOut, target).
            phaseList         : sequence of "train" / "val" phase names.
            numStepsBeforeVal : stride of the outer epoch loop.
            dataLoaders       : dict phase -> iterable of (inpSeq, outSeq) batches.
            dataLoadersLen    : dict phase -> number of samples in that phase.
            epochs            : upper bound of the outer epoch loop.

        Returns:
            (model, phaseLossDict, stepLossDict): phaseLossDict holds one
            averaged loss per executed phase, stepLossDict one loss per batch.
    """
    # Use the device the model lives on instead of a notebook-level global.
    firstParam  = next(model.parameters(),None)
    modelDevice = firstParam.device if firstParam is not None else torch.device("cpu")

    phaseLossDict = {"train":[],"val":[]}
    stepLossDict  = {"train":[],"val":[]}
    for batchedEpoch in range(0,epochs,numStepsBeforeVal):
        for subEpochCounter, phase in enumerate(phaseList,start=1):
            isTraining = phase=="train"
            if isTraining:
                model.train()
            if phase=="val":
                model.eval()
            phaseLoss = 0
            for inpSeq,outSeq in dataLoaders[phase]:
                with torch.set_grad_enabled(isTraining):

                    if isTraining:
                        optimizer.zero_grad()

                    inpSeq = inpSeq.to(modelDevice)
                    outSeq = outSeq.to(modelDevice)

                    # Read the batch size from the batch itself (default collation stacks
                    # samples along dim 0), so a smaller last batch or a different loader
                    # configuration still works.
                    batchSize = inpSeq.size(0)
                    # One label per pair: for (batch, dim) inputs the cosine embedding
                    # loss expects a target of shape (batch,).
                    target = torch.ones(batchSize,device=modelDevice)

                    initialHiddenStates = model.initHidden(batchSize).to(modelDevice)
                    sentVectInp = model(inpSeq,initialHiddenStates)

                    initialHiddenStates = model.initHidden(batchSize).to(modelDevice)
                    sentVectOut = model(outSeq,initialHiddenStates)
                    loss        = criterion(sentVectInp,sentVectOut,target)
                    stepLoss    = loss.item()
                    # Weight by batch size so that dividing by the sample count below gives
                    # a per-sample average (assumes criterion returns the batch mean).
                    phaseLoss  += stepLoss*batchSize
                    stepLossDict[phase].append(stepLoss)

                    if isTraining:
                        loss.backward()
                        optimizer.step()
            totalLoss = noZeroDivision(phaseLoss,dataLoadersLen[phase],6)
            phaseLossDict[phase].append(totalLoss)
            print (f"Phase:{phase} --- Step: {batchedEpoch+subEpochCounter} --- epoch loss: {totalLoss}")
    return model,phaseLossDict,stepLossDict

In [13]:
# Model dimensions
vocabSize          = len(tokenizer.wordPieceMeta["vocab"])
wordEmbeddingDim, hiddenStateDim = 300, 512
sentenceVecDim, fc1Dim           = 150, 256

# Phase schedule: numStepsBeforeVal "train" phases followed by one "val" phase
numStepsBeforeVal  = 25
phaseList          = ["train"]*numStepsBeforeVal + ["val"]

In [14]:
# Encoder (shape printing disabled), moved to the selected device
dssmModel          = modelDSSM(
                        vocabSize,
                        wordEmbeddingDim,
                        hiddenStateDim,
                        sentenceVecDim,
                        fc1Dim,
                        debug=False,
                     ).to(device)
# Cosine-similarity loss between the two sentence vectors
criterion          = torch.nn.CosineEmbeddingLoss(margin=0.5)
# RMSprop over all model parameters
optimizer          = torch.optim.RMSprop(dssmModel.parameters(), lr=0.0001)

In [15]:
# Run the full schedule; keep the trained model plus per-phase and per-step losses.
dssmModel,phaseLossDict,stepLossDict = train(
    dssmModel,
    optimizer,
    criterion,
    phaseList,
    numStepsBeforeVal,
    dataLoaders,
    dataLoadersLens,
    epochs=50,
)

Making a call to function: train




Phase:train --- Step: 1 --- epoch loss: 0.015527
Phase:train --- Step: 2 --- epoch loss: 0.0
Phase:train --- Step: 3 --- epoch loss: 0.0
Phase:train --- Step: 4 --- epoch loss: -0.0
Phase:train --- Step: 5 --- epoch loss: -0.0
Phase:train --- Step: 6 --- epoch loss: -0.0
Phase:train --- Step: 7 --- epoch loss: -0.0
Phase:train --- Step: 8 --- epoch loss: -0.0
Phase:train --- Step: 9 --- epoch loss: -0.0
Phase:train --- Step: 10 --- epoch loss: -0.0
Phase:train --- Step: 11 --- epoch loss: -0.0
Phase:train --- Step: 12 --- epoch loss: -0.0
Phase:train --- Step: 13 --- epoch loss: -0.0
Phase:train --- Step: 14 --- epoch loss: -0.0
Phase:train --- Step: 15 --- epoch loss: 0.0
Phase:train --- Step: 16 --- epoch loss: -0.0
Phase:train --- Step: 17 --- epoch loss: -0.0
Phase:train --- Step: 18 --- epoch loss: -0.0
Phase:train --- Step: 19 --- epoch loss: -0.0
Phase:train --- Step: 20 --- epoch loss: 0.0
Phase:train --- Step: 21 --- epoch loss: -0.0
Phase:train --- Step: 22 --- epoch loss: -0

RuntimeError: CUDA error: unspecified launch failure