In [None]:
import * as Utils from 'causal-net.utils';
import * as Log from 'causal-net.log';
import * as Storage from 'causal-net.storage';
import * as Preprocessing from 'causal-net.preprocessing';
import * as Memory from 'causal-net.memory';
import { causalNetCore } from 'causal-net.core';
import { causalNetSGDOptimizer } from 'causal-net.optimizers';
import { causalNetModels } from "causal-net.models";
import * as Sampling from 'causal-net.sampling';
import * as fs from 'fs';
// Short aliases for the causal-net building blocks used in the cells below.
var R = causalNetCore.CoreFunction;   // functional helpers (range, map, filter, ...)
var T = causalNetCore.CoreTensor;     // tensor factory / ops
var Loss = causalNetModels.skipGram().Loss;
var Stream = Utils.Stream;
var termLogger = Log.termLogger;
var indexDBStorage = Storage.indexDBStorage;
var nlpPreprocessing = Preprocessing.nlpPreprocessing;
var causalNetMemory = Memory.causalNetMemory;
var causalNetSampling = Sampling.causalNetSampling;
var optimizers = causalNetSGDOptimizer;

In [None]:
// var docs = ['He is the king','The king is royal', 'She is the royal queen'];

In [None]:
// Load the small text8 sample as UTF-8 text and treat every line as one document.
var corpus = fs.readFileSync('../datasets/text8/small.txt', 'utf8');
var docs = corpus.split('\n');

In [None]:
/**
 * Lazily yields skip-gram training pairs.
 *
 * For every token of every document, each neighbour that lies at most
 * `windowSize` positions to the left OR to the right is paired with it.
 * Neighbours equal to the target word itself are skipped, so a word is never
 * paired with itself.
 *
 * @param {Iterable<Array>} docTokens - documents, each an array of tokens.
 * @param {number} windowSize - neighbours to look at on each side.
 * @yields {[*, *]} `[target, context]` pairs, in document order.
 */
function* skipGramContext(docTokens, windowSize){
    for(const tokens of docTokens){
        for(let index = 0; index < tokens.length; index += 1){
            const target = tokens[index];
            // Clamp the window to the document. The upper bound is inclusive so
            // that the neighbour at +windowSize is visited, mirroring -windowSize.
            const first = Math.max(0, index - windowSize);
            const last = Math.min(tokens.length - 1, index + windowSize);
            for(let pos = first; pos <= last; pos += 1){
                const context = tokens[pos];
                if(context !== undefined && target !== context){
                    yield [target, context];
                }
            }
        }
    }
}
// Tokenise every document.
var docTokens = docs.map((raw)=>nlpPreprocessing.tokenize(raw));
// Vocabulary = distinct tokens; word2int / int2word are the two lookup directions.
var vocab = R.compose(R.uniq, R.flatten)(docTokens);
var word2int = R.compose(R.fromPairs, R.addIndex(R.map)((w,i)=>[w,i]))(vocab);
var int2word = R.compose(R.fromPairs, R.addIndex(R.map)((w,i)=>[i,w]))(vocab);
// Parallel arrays: entry i holds the target id, the context id and the list of
// negative (non-context) ids of the i-th skip-gram pair.
var trainTargets = [], trainContexts = [], trainNegContexts = [];
var allWordIds = R.range(0, vocab.length);
for(let [target, context] of skipGramContext(docTokens, 2)){
    let targetId = word2int[target], contextId = word2int[context];
    // Negatives must be derived per pair: every word id except the pair's own
    // target and context. (Computing this outside the loop referenced
    // `target`/`context` before they existed.)
    let nonContext = R.filter((id)=>id !== targetId && id !== contextId)(allWordIds);
//     console.log([target, context, targetId, contextId, nonContext]);
    trainTargets.push(targetId);
    trainContexts.push(contextId);
    trainNegContexts.push(nonContext);
}
console.log(vocab.length);
console.log(trainNegContexts);

In [None]:
// Embedding tables. Each has one row per vocabulary word plus one extra row,
// which the trainLabel cells split off via split([VocabLen, 1], 0).
var VocabLen = vocab.length;
var EmbeddingSize = 50;
var UVecs = T.variable(T.randomNormal([VocabLen + 1, EmbeddingSize]));
var VVecs = T.variable(T.randomNormal([VocabLen + 1, EmbeddingSize]));

In [None]:
var adam = optimizers.adam({learningRate: 0.2});
/**
 * Runs one optimiser step of the full-softmax skip-gram objective over all
 * (target, context) pairs and returns whatever `adam.fit` returns.
 *
 * @param {number[]} targetId - vocabulary ids of the target words.
 * @param {number[]} contextId - vocabulary ids of the matching context words.
 */
function trainLabel(targetId, contextId){
    const softmaxLoss = ()=>{
        // Last row of each table is split off and used as an additive bias row.
        const [UEmbed, Ubias] = UVecs.split([VocabLen, 1], 0);
        const [VEmbed, Vbias] = VVecs.split([VocabLen, 1], 0);
        const hidden = T.oneHot(targetId, VocabLen).matMul(UEmbed).add(Ubias);
        const logits = hidden.add(Vbias).matMul(VEmbed.transpose());
        // log-softmax over the vocabulary axis
        const logSoftmax = logits.sub(logits.logSumExp(1, true));
        // keep only the entries of the true context words, negate and average
        return T.oneHot(contextId, VocabLen).mul(logSoftmax).neg().mean();
    };
    return adam.fit(softmaxLoss, [UVecs, VVecs]);
}
// 500 optimiser steps over the whole pair set, printing the value returned by each step.
for(let epoch = 0; epoch < 500; epoch += 1){
    trainLabel(trainTargets, trainContexts).print();
}

/**
 * Logs, for every word in `words`, the words whose embedding rows score
 * highest against it.
 *
 * Rows are first standardised (row mean subtracted, divided by the row's
 * standard deviation); the score is the dot product of standardised rows.
 *
 * @param {string[]} words - words to look up (via `word2int`).
 * @param {*} embedding - embedding tensor, one row per vocabulary id.
 * @param {number} [k=3] - how many top-scoring ids to fetch per word; the
 *   first is printed as the word itself, the rest as its neighbours.
 */
async function checkTopMatch(words, embedding, k=3){
    const standardize = (rows)=>{
        const centered = rows.sub(rows.mean(1, true));
        const spread = centered.pow(2).mean(1, true).pow(0.5);
        return centered.div(spread);
    };
    const scoreAgainst = (slotIdxs, rows)=>rows.dot(rows.gather(slotIdxs).transpose());

    const standardized = standardize(embedding);
    for(const word of words){
        const scores = scoreAgainst([word2int[word]], standardized);
        const { indices } = scores.transpose().topk(k);
        const ranked = Array.from(await indices.data()).map((i)=>int2word[i]);
        const [targetW, ...similarWs] = ranked;
        console.log(`[${targetW}\t] is similar to: ${similarWs.join(', ')}`)
    }
}
// Drop the extra last row of UVecs and list the 7 top-scoring ids for every vocabulary word.
var [embed] = UVecs.split([VocabLen, 1], 0);
checkTopMatch(vocab, embed, 7);

In [None]:
// Scratch check: keeps the elements of [1,2,3] that do not occur in [1,2].
R.filter((x)=>R.indexOf(x,[1,2])===-1)([1,2,3])

In [None]:
var adam = optimizers.adam({learningRate: 0.1});
/**
 * Runs one optimiser step of skip-gram with explicit negative examples.
 *
 * The objective is mean(logSigmoid(u_t . v_c)) over the positive pairs plus,
 * for every pair, mean(logSigmoid(-(u_t . v_n))) over that pair's negatives;
 * the negated objective is minimised.
 *
 * @param {number[]} targetId - vocabulary ids of the target words.
 * @param {number[]} contextId - vocabulary ids of the matching context words.
 * @param {number[][]} negContextId - per pair, the ids used as negatives.
 * @returns whatever `adam.fit` returns for this step.
 */
function trainLabel(targetId, contextId, negContextId){
    console.log(negContextId.length);
    return adam.fit(()=>{
        // Only the first VocabLen rows are used here; the extra row is dropped.
        let [UEmbed] = UVecs.split([VocabLen, 1], 0);
        let [VEmbed] = VVecs.split([VocabLen, 1], 0);
        let posU = UEmbed.gather(targetId).reshape([-1, 1, EmbeddingSize]);
        let posV = VEmbed.gather(contextId).reshape([-1, EmbeddingSize, 1]);
        let objective = posU.matMul(posV).logSigmoid().mean();
        for(let bId of R.range(0, targetId.length)){
            let negU = UEmbed.gather(R.repeat(targetId[bId], negContextId[bId].length))
                                .reshape([-1, 1, EmbeddingSize]);
            let negV = VEmbed.gather(negContextId[bId])
                                .reshape([-1, EmbeddingSize, 1]);
            // `add` returns a new tensor rather than updating in place, so the
            // sum must be re-assigned or the negative term is silently lost.
            objective = objective.add(negU.matMul(negV).neg().logSigmoid().mean());
        }
        return objective.neg().mean();
    }, [UVecs, VVecs]);
}
// 500 optimiser steps with negative examples, printing the value returned by each step.
for(let epoch = 0; epoch < 500; epoch += 1){
    trainLabel(trainTargets, trainContexts, trainNegContexts).print();
}

/**
 * Logs, for every word in `words`, the words whose embedding rows score
 * highest against it.
 *
 * Rows are first standardised (row mean subtracted, divided by the row's
 * standard deviation); the score is the dot product of standardised rows.
 *
 * @param {string[]} words - words to look up (via `word2int`).
 * @param {*} embedding - embedding tensor, one row per vocabulary id.
 * @param {number} [k=3] - how many top-scoring ids to fetch per word; the
 *   first is printed as the word itself, the rest as its neighbours.
 */
async function checkTopMatch(words, embedding, k=3){
    const standardize = (rows)=>{
        const centered = rows.sub(rows.mean(1, true));
        const spread = centered.pow(2).mean(1, true).pow(0.5);
        return centered.div(spread);
    };
    const scoreAgainst = (slotIdxs, rows)=>rows.dot(rows.gather(slotIdxs).transpose());

    const standardized = standardize(embedding);
    for(const word of words){
        const scores = scoreAgainst([word2int[word]], standardized);
        const { indices } = scores.transpose().topk(k);
        const ranked = Array.from(await indices.data()).map((i)=>int2word[i]);
        const [targetW, ...similarWs] = ranked;
        console.log(`[${targetW}\t] is similar to: ${similarWs.join(', ')}`)
    }
}
// Drop the extra last row of UVecs and list the 7 top-scoring ids for every vocabulary word.
var [embed] = UVecs.split([VocabLen, 1], 0);
checkTopMatch(vocab, embed, 7);

In [None]:
/**
 * Runs one optimiser step that labels (target, positive) pairs +1 and
 * (target, negative) pairs -1 and scores them with the skip-gram `Loss`.
 *
 * @param {number[]|number|string} W - target slot id(s). An array supplies one
 *   target per pair (positives first, then negatives) and must therefore have
 *   `Wpos.length + Wneg.length` entries; a single id is reused for every pair.
 * @param {Array} Wpos - slot ids of the positive (co-occurring) words.
 * @param {Array} Wneg - slot ids of the negative words.
 * @param {Object} Vectors - maps slot id -> [1, Size] tensor.
 * @returns whatever `adam.fit` returns for this step.
 * @throws {Error} when an array `W` does not provide one target per pair.
 */
function WordCoOccurentTraining(W, Wpos, Wneg, Vectors){
    var nce = ()=>{
        const posVecs = Wpos.map((w)=>Vectors[w]);
        const negVecs = Wneg.map((w)=>Vectors[w]);
        const PosLen = posVecs.length, NegLen = negVecs.length;
        const PairLen = PosLen + NegLen;
        if(Array.isArray(W) && W.length !== PairLen){
            throw new Error(`WordCoOccurentTraining: expected ${PairLen} targets, got ${W.length}`);
        }
        // One target vector per pair.
        const targetVecs = Array.isArray(W)
            ? W.map((w)=>Vectors[w])
            : Array.from({length: PairLen}, ()=>Vectors[W]);

        var Wa = T.concat(targetVecs);
        const Size = Wa.shape[1];
        console.log(Wa.shape, PosLen , NegLen,Size);
        // [pairs, 1, Size] x [pairs, Size, 1] -> one dot product per pair
        Wa = Wa.reshape([PairLen, 1, Size]);
        var Wb = T.concat([ T.concat(posVecs), T.concat(negVecs) ]);
        Wb = Wb.reshape([PairLen, Size, 1]);
        var label = T.concat([T.ones([PosLen]), T.ones([NegLen]).neg()]);
        return Loss(Wa.matMul(Wb).reshape([PairLen]), label).neg();
    };
    return adam.fit(nce);
};

In [None]:
// Toy run of WordCoOccurentTraining on three hand-made 5-dimensional vectors.
(async ()=>{
    var vector = {0: T.variable(T.tensor([1,2,3,4,5]).reshape([1, 5])),
                  1: T.variable(T.tensor([1,2,3,4,5]).reshape([1, 5])),
                  2: T.variable(T.tensor([1,2,3,4,5]).reshape([1, 5]))}
    // One target id per pair: 3 positive pairs followed by 2 negative pairs.
    var W =    [0, 0, 0, 0, 0];
    var Wc1 =  [1, 1, 1];
    var Wnc1 = [2, 2];
    // Pass the ids defined above (the previous call used an undefined `vids`).
    WordCoOccurentTraining(W, Wc1, Wnc1, vector).print();
    for(let v of Object.keys(vector)){
        vector[v].print();
    }
//     const CheckResult = async ()=>{
//             let targetWords = [0,1,2,3,4,5];
//             let topKTensor = await memory.getTopKSimilar(targetWords, 10);
//             topKTensor.print();
//     }
//     await CheckResult();
})();

In [None]:
// Shared memory handle used by the following cells; initialised with shape [15, 2].
var memory;
(async ()=>{
    memory = causalNetMemory;
    await memory.initMemory([15, 2]);
})();

In [None]:
// Three-word toy vocabulary used to smoke-test the helper methods.
var Vocab;
(async ()=>{
    Vocab = {
        words: {'a':0,'b':1,'c':2},
        iwords: {'0':'a','1':'b','2':'c'},
        wCounts: [1,2,3],
        /** Maps an array-like of indices to their words. */
        indexToWord(idxs){
            // copy into a plain array first (typed arrays cannot hold strings)
            return Array.from(idxs).map((idx)=>this.iwords[idx]);
        },
        /** Maps words to indices; unknown words map to undefined. */
        wordToIndex(ws){
            return ws.map((w)=>this.words[w]);
        },
        /** Derives per-word weights from the counts and caches them in wProbs. */
        countToProb(){
            const countTotal = R.sum(this.wCounts);
            const wFracs = R.map((count)=>count / countTotal, this.wCounts);
            // NOTE(review): resembles word2vec's keep-probability
            // (sqrt(f/t) + 1) * t/f with t = 0.001, but here the +1 sits inside
            // the square root -- confirm which form is intended.
            const weightOf = (frac)=>Math.sqrt(frac / 0.001 + 1)*(0.001 / frac);
            this.wProbs = R.map(weightOf, wFracs);
            return this.wProbs;
        },
        /** Delegates to causalNetSampling.negSampling with the cached wProbs. */
        samplingNegIndexs(positives, size){
            return causalNetSampling.negSampling(size, positives, this.wProbs);
        }
    };
    termLogger.log(Vocab.indexToWord([1,2]));
    termLogger.log(Vocab.wordToIndex(['a','b']));
    termLogger.log(Vocab.countToProb());
    termLogger.log(Vocab.samplingNegIndexs([1,2,3], 6));
})();

In [None]:
// Re-initialise the shared memory with shape [10, 5] for the toy training below.
(async ()=>{
    await memory.initMemory([10, 5]);
})();

In [None]:
// Toy training: ids 0-4 and ids 5-9 are trained as two groups, each using the
// other group's ids as negatives, then the nearest neighbours are printed.
(async ()=>{
    // Load the slots involved, run one training step, write the slots back.
    const trainOnGroup = async (uids, nuid, nvid)=>{
        const vecId = new Set([...uids, ...nuid, ...nvid]);
        const vectors = {};
        for(const v of vecId){
            vectors[v] = await memory.readSlots([v]);
        }
        WordCoOccurentTraining(uids, nuid, nvid, vectors).print();
        for(const v of vecId){
            await memory.writeSlots([v], vectors[v]);
        }
    };
    const lower = R.range(0, 5);
    const upper = R.range(5, 10);
    // for...of so that `epoch` is the numeric value, not a string key.
    for(const epoch of R.range(0, 501)){
        termLogger.log({epoch});
        // targets are listed twice: once for the positive and once for the negative pairs
        await trainOnGroup([...lower, ...lower], R.reverse(lower), upper);
        await trainOnGroup([...upper, ...upper], R.reverse(upper), lower);
    }
    const CheckResult = async ()=>{
        let targetWords = [0,1,4,3,2];
        // Return values are unused; the calls are kept in case they prepare
        // state inside `memory` -- TODO confirm whether they are required.
        await memory.normalize();
        await memory.getMatchScore(targetWords);
        let topKTensor = await memory.getTopKSimilar(targetWords, 5);
        topKTensor.print();
    };
    await CheckResult();
})();

In [None]:
// State shared across chunks: the unfinished tail of the previous chunk, the
// running word-frequency table and the number of non-empty lines seen so far.
var remainingChars = '';
var wordFreqCount = {};
var lineIndex = 0;
/**
 * Stream transform step: splits the incoming chunk into lines, tokenises every
 * complete non-empty line and passes the resulting records downstream.
 *
 * The text after the last newline is held back in `remainingChars` and
 * prepended to the next chunk.
 *
 * @param {*} chunkData - chunk of corpus text (coerced to a string).
 * @param {*} chunkEncoding - unused.
 * @param {Function} afterTransformFn - called as `(null, records)` where each
 *   record is `{lineIndex, tokens}`.
 */
function tranformFn(chunkData, chunkEncoding, afterTransformFn){
    const pieces = (remainingChars + chunkData).split('\n');
    // the final piece has no terminating newline yet
    remainingChars = pieces.pop();
    const records = [];
    for(const text of pieces){
        if(text.length === 0){
            continue;
        }
        const tokens = nlpPreprocessing.tokenize(text);
        wordFreqCount = nlpPreprocessing.WordFreqCount(tokens, wordFreqCount);
        lineIndex += 1;
        records.push({lineIndex, tokens});
    }
    afterTransformFn(null, records);
};

In [None]:
/**
 * Stream write step: stores every record's tokens as JSON under
 * `/corpus/line_<lineIndex>`, one record after the other.
 *
 * @param {Array<{lineIndex: number, tokens: Array}>} transformedData - records to store.
 * @param {*} chunkEncoding - unused.
 * @param {Function} afterWriteFn - called with no argument once all records are
 *   written, or with the error when a write fails.
 */
function writeTokens(transformedData, chunkEncoding, afterWriteFn){
    const WriteTokensToFile = async (records)=>{
        for(const {lineIndex, tokens} of records){
            console.log({lineIndex});
            await indexDBStorage.writeFile(`/corpus/line_${lineIndex}`, JSON.stringify(tokens));
        }
    };
    // Report failures through the callback too; otherwise a rejected write
    // leaves the stream waiting forever on an unhandled rejection.
    WriteTokensToFile(transformedData).then(
        ()=>afterWriteFn(),
        (err)=>afterWriteFn(err)
    );
}

In [None]:
var allTokens = [], tokenMatrix = null, Words=[], wordCounts=[];
// Filled in once the corpus has been streamed (see the end of the async block).
var wordMapper = {}, iwordMapper = {};
// Streams the corpus through tranformFn -> writeTokens, then keeps the 10000
// most frequent words and builds the word <-> index mappers.
(async ()=>{
    var corpusReader = fs.createReadStream('../datasets/text8/text8.txt');
    let writer = Stream.makeWritable(writeTokens);
    let transformer = Stream.makeTransform(tranformFn);
    // start from an empty /corpus/ prefix
    let deletedFiles = await indexDBStorage.deleteFileByPrefix('/corpus/');
    termLogger.log({deletedFiles});
    const DataProgress = (dataBuffer)=> termLogger.log({'data length': dataBuffer.length});
    await Stream.makePipeline([corpusReader, transformer, writer], DataProgress);

    // most frequent first
    const SortByFreq = R.sortBy(([w,f])=>-f);
    let vocabFreqPairs = R.filter(([w,f])=>f>0)(SortByFreq(R.toPairs(wordFreqCount)));
    let [choosePairs, filterPairs] = R.splitAt(10000, vocabFreqPairs);
    termLogger.log({'keep length': choosePairs.length, 'discard': filterPairs.length})
    const GetVocab = ([v,f]) => v;
    const GetFreq = ([v,f]) => f;
    Words = R.map(GetVocab, choosePairs);
    wordCounts = R.map(GetFreq, choosePairs);

    // Build the mappers here, after Words is populated. Doing it outside this
    // async block ran before streaming finished and always saw an empty list.
    termLogger.log([Words.length, wordCounts.length]);
    wordMapper = R.compose( R.fromPairs, R.addIndex(R.map)((v,i)=>[v, i]) )(Words);
    iwordMapper = R.compose( R.fromPairs, R.addIndex(R.map)((v,i)=>[i, v]) )(Words);
})();

In [None]:
// (Re)build the lookup tables from the current Words list and report their sizes.
var wordMapper = R.fromPairs(R.addIndex(R.map)((word, idx)=>[word, idx])(Words));   // word -> index
var iwordMapper = R.fromPairs(R.addIndex(R.map)((word, idx)=>[idx, word])(Words));  // index -> word
termLogger.log(Object.keys(wordMapper).length);
termLogger.log(Object.keys(iwordMapper).length);

In [None]:
// Vocabulary helper backed by the mappers and counts built from the corpus.
var Vocab = {
    words: wordMapper,
    iwords: iwordMapper,
    wCounts: wordCounts,
    /** Maps an array-like of indices to their words. */
    indexToWord(idxs){
        // copy into a plain array first (typed arrays cannot hold strings)
        return Array.from(idxs).map((idx)=>this.iwords[idx]);
    },
    /** Maps words to indices; unknown words map to undefined. */
    wordToIndex(ws){
        return ws.map((w)=>this.words[w]);
    },
    /** Derives per-word weights from the counts and caches them in wProbs. */
    countToProb(){
        const countTotal = R.sum(this.wCounts);
        console.log(countTotal);
        const wFracs = R.map((count)=>count / countTotal, this.wCounts);
        // Rebalancing for rare words.
        // NOTE(review): resembles word2vec's keep-probability
        // (sqrt(f/t) + 1) * t/f with t = 0.001, but here the +1 sits inside
        // the square root -- confirm which form is intended.
        const weightOf = (frac)=>Math.sqrt(frac / 0.001 + 1)*(0.001 / frac);
        this.wProbs = R.map(weightOf, wFracs);
        return this.wProbs;
    },
    /** Delegates to causalNetSampling.negSampling with the cached wProbs. */
    samplingNegIndexs(positives, size){
        return causalNetSampling.negSampling(size, positives, this.wProbs);
    }
};
var _prob = Vocab.countToProb();
_prob

In [None]:
// Smoke test of the Vocab helpers.
// indices -> words
termLogger.log(Vocab.indexToWord([0, 1, 2]));
// words -> indices; a word missing from Vocab.words comes back as undefined
termLogger.log(Vocab.wordToIndex(['a','the111']));
// 6 negative samples requested with [1,2,3] passed as the positives
termLogger.log(Vocab.samplingNegIndexs([1,2,3], 6));

In [None]:
// Builds an s x s matrix of zeros as nested arrays; every row is its own array.
var MatrixInit = (s)=>{
    const indices = R.range(0, s);
    return indices.map(()=>indices.map(()=>0));
};
// Curried reducer: given a count matrix and a list of [row, col] pairs,
// increments matrix[row][col] once per pair and returns the same matrix.
// A pair whose increment throws (e.g. its row does not exist) is reported on
// stderr and skipped.
var CooccurenceMatrixReducer = R.reduce((matrix, pair)=>{
    const [row, col] = pair;
    try{
        matrix[row][col] += 1;
    }
    catch(e){
        console.error(pair)
    }
    return matrix;
});

In [None]:
/**
 * For every position in `tokens`, collects the distinct tokens found within
 * `windownSize` positions on either side, starting with the token itself.
 *
 * @param {Array} tokens - token sequence.
 * @param {number} windownSize - neighbours to look at on each side.
 * @param {number} [batchSize=2] - unused.
 * @returns {{tokenContext: Array, cooccurents: Array<Array>}} parallel arrays:
 *   `tokenContext[i]` is the i-th token and `cooccurents[i]` its distinct
 *   window members in first-seen order (nearest offsets first, left before right).
 */
function getTokenContext(tokens, windownSize, batchSize=2){
    const tokenContext = [];
    const cooccurents = [];
    for(let index = 0; index < tokens.length; index += 1){
        const token = tokens[index];
        const seen = new Set([token]);
        for(let offset = 1; offset < windownSize + 1; offset += 1){
            const left = index - offset;
            const right = index + offset;
            if(left >= 0){
                seen.add(tokens[left]);
            }
            if(right < tokens.length){
                seen.add(tokens[right]);
            }
        }
        tokenContext.push(token);
        cooccurents.push([...seen]);
    }
    return {tokenContext, cooccurents};
}
// Demo: print each token next to its window members (window of 3).
// The three statements were repeated verbatim; once is enough.
var tokens = [9,2,3,45,5,6];
var {tokenContext, cooccurents } = getTokenContext(tokens, 3);
console.log(R.zip(tokenContext, cooccurents));

In [None]:
// Initialise the memory with one slot per vocabulary word, 100 values per slot.
(async ()=>{
    const vocabSize = Vocab.wCounts.length;
    console.log(vocabSize);
//     var tokenMatrix = MatrixInit(vocabSize, vocabSize);
    await memory.initMemory([vocabSize, 100]);
})();

In [None]:
// Single hand-picked training step against the memory, followed by a
// nearest-neighbour report for the first six word ids.
(async ()=>{
    // Keep the id lists in one place so every slot that is read is also the
    // slot that is written back (the write-back previously used [1,2] and
    // [8,9] for tensors that had been read from 6 and 12 slots).
    const targetIds = [0];
    const posIds = [1, 2, 3, 4, 5, 6];
    const negIds = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19];
    let uTensor = await memory.readSlots(targetIds);
    let nuTensor = await memory.readSlots(posIds);
    let nvTensor = await memory.readSlots(negIds);
    // NOTE(review): WordCoOccurentTraining is declared in this file as
    // (W, Wpos, Wneg, Vectors) taking id arrays plus an id -> vector map;
    // this call passes three tensors instead -- confirm which API is current.
    WordCoOccurentTraining(uTensor, nuTensor, nvTensor);
    await memory.writeSlots(targetIds, uTensor);
    await memory.writeSlots(posIds, nuTensor);
    await memory.writeSlots(negIds, nvTensor);
    const CheckResult = async ()=>{
        let targetWords = [0,1,2,3,4,5];
        let topKTensor = await memory.getTopKSimilar(targetWords, 10);
        topKTensor.print();
        // one row of 10 neighbour ids per target word
        let topks = R.splitEvery(10, Array.from(await topKTensor.data()));
        for(let [w, sim] of R.zip(targetWords, topks)){
            let [tw, ...sws] = Vocab.indexToWord(sim);
            termLogger.log(`[${tw}] is similar to ${sws.join(',')}`);
        }
    }
    await CheckResult();
})();

In [None]:
// Bare expression: evaluates to memory.writeSlots without calling it
// (presumably to inspect the method in the notebook output).
memory.writeSlots

In [None]:
// Trains on the stored corpus lines for 100 epochs and prints the nearest
// neighbours of the first six word ids after every epoch.
(async ()=>{
    const listFiles = await indexDBStorage.getFileList('/corpus/');
    // Only the first stored line is used; widen the slice to train on more.
    const getlist = listFiles.slice(0,1);
    const startTime = new Date();
    const FilterUndefined = R.filter((v)=>v!==undefined);
    // Defined once instead of once per epoch; it captures no per-epoch state.
    const CheckResult = async ()=>{
        let targetWords = [0,1,2,3,4,5];
        let topKTensor = await memory.getTopKSimilar(targetWords, 10);
        topKTensor.print();
        // one row of 10 neighbour ids per target word
        let topks = R.splitEvery(10, Array.from(await topKTensor.data()));
        for(let [w, sim] of R.zip(targetWords, topks)){
            let [tw, ...sws] = Vocab.indexToWord(sim);
            termLogger.log(`[${tw}] is similar to ${sws.join(',')}`);
        }
    };
    // for...of so that `epoch` is the numeric value, not a string key.
    for(const epoch of R.range(0, 100)){
        for(const lfile of getlist){
            termLogger.log({lfile, elapse: new Date() - startTime});
            const tokens = JSON.parse(await indexDBStorage.readFile(lfile));
            // words outside the vocabulary map to undefined and are dropped
            const seletedTokens = FilterUndefined(Vocab.wordToIndex(tokens));
            const loss = [];
            const {tokenContext, cooccurents} = getTokenContext(seletedTokens, 2);
            for(const [w, posWs] of R.zip(tokenContext, cooccurents)){
                const negWs = Vocab.samplingNegIndexs(posWs, 10);
                const uTensor = await memory.readSlots([w]);
                const nuTensor = await memory.readSlots(posWs);
                const nvTensor = await memory.readSlots(negWs);

                // NOTE(review): WordCoOccurentTraining is declared in this file
                // as (W, Wpos, Wneg, Vectors) taking id arrays plus an
                // id -> vector map; this call passes three tensors instead --
                // confirm which API is current.
                const l = await WordCoOccurentTraining(uTensor, nuTensor, nvTensor).data();
                // append in place instead of re-copying the whole array each step
                loss.push(...l);
                await memory.writeSlots([w], uTensor);
                await memory.writeSlots(posWs, nuTensor);
                await memory.writeSlots(negWs, nvTensor);
            }
            console.log({epoch, lfile, loss: R.mean(loss)});
        }
        await CheckResult();
    }
    // tokenMatrix = CooccurenceMatrixReducer(tokenMatrix)(tokenContexts);
})();

In [None]:
// Persist the co-occurrence matrix and the word -> index table as JSON.
// NOTE(review): tokenMatrix is only ever initialised to null in this file (the
// reducer call that would fill it is commented out), so the first file
// currently contains "null" -- confirm before relying on it.
fs.writeFileSync('./cooccurent.matrix.json', JSON.stringify(tokenMatrix));
// `mapper` is not defined anywhere; the word -> index table is `wordMapper`.
fs.writeFileSync('./tokenMapper.json', JSON.stringify(wordMapper));