## This script is meant to generate the dictionary automatically using the N-Gram Analysis
--------------

### Algorithm steps:
1. Read the cases, stop words, domain words
2. Normalize the cases by removing the stop words and the punctuations
3. Using the countVectorizer find out the tokens and the frequency
4. Take the tokens which are quite frequent
5. These are the unigrams
6. Now break the normalised cases in the step 2 and create the bigrams
7. take the most frequent bigrams
8. Repeat the step 6 and 7 to get the trigrams
9. Now normalize the unigrams so that increasingly, increased, increases, increase all map to increase
10. Use the domain words to further normalise the unigrams so that rot is mapped to rotor
11. Using the normaized unigrams we need to normalize the bigrams and trigrams so that "brg mtl temp" becomes "bearing metal temperature"
---------

In [89]:
from nltk.tokenize import WordPunctTokenizer
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures
from nltk.metrics import BigramAssocMeasures
import numpy as np

### Initialize the data (later it should be through a database connection)

In [90]:
def initialize():

    #prefixpath="/Users/305015992/pythonProjects/wordcloud/"
    
    #Read teh cases file. This is the raw cases containing the full data and not split at the sentence level
    fname="all_jim_case_large.txt"
    with open(fname, 'r') as myfile:
        data=myfile.read()
    data=data.split('-------BREAK--------')
    cases=[case.strip() for case in data]
    print(cases[1:10])
    print(len(cases))
    lengthCases=len(cases)

    #read the stop words list
    stopwordsFile = open('stopwordsss.txt', 'r')
    stopwords=stopwordsFile.read()
    stopwordList=stopwords.split(",")

    #list of the domain words
    domainFileName='domainss.txt'
    domainLines = [domainLine.rstrip('\n').split(',') for domainLine in open(domainFileName)]
    domainDict={}
    for dl in domainLines:
            key=dl[0].replace('"','')
            value=dl[1].replace('"','')
            domainDict[key]=value.strip()
   
    return cases,stopwordList,domainDict


### Read the data and initialize the varaible

In [91]:
cases,stopwordList,domainDict=initialize()

['It appears while the turbine is active power is moving around the wind speed sensor is returning a flatline value.\n\nUpdate 1-3-14: Appears the Anemometer issues have been resolved as of 1-2-14 at 7:00am', 'Since 11/25/14, the following tags are indicating intermittent flatlines: IDO WTG001 MDK030 BT001:XQ01.M_HSRot_Tmp, IDO WTG001 MDK030 BT002:XQ01.M_IMSGen_Tmp, and IDO WTG001 MDK030 BT003:XQ01.M_IMSRot_Tmp Actual temperatures are acceptable when tags are not flatlining (~65 degC). Update 1/26/15: Flatlines cleared. Closing case.', 'Beginning Aug 8, wind turbine 1 appears curtailed at ~2700 kW with wind speed between ~4-13 m/s.', 'During higher wind conditions the turbine is currently limited to 1.8 MW', 'Since 2/23/15, wind turbine 2 appears to be curtailed at ~2700 kW with wind speed between ~5-21 m/s.  Update 4/13/15: Wind turbine 2 appears to be curtailed since 4/10. Currently, active power is ~2700 kW.  Curtailment cleared.', 'On 11/27/14, generator bearing temperature remaine

### cleanup the cases 

In [92]:
import re
def cleaupCaseData(cases):
    lengthCases=len(cases)
    cleanedUpCases=[]
    for count in range(0,lengthCases):
        # print("before {}",case)
        case=cases[count]
        case=case.lower()
        case = case.strip();
        case = re.sub('/[^A-Za-z0-9 _\-\+\&\,\#]/', '', case)
        case = case.replace('"', ' ')
        case = case.replace('\"', ' ')
        case = case.replace('>', ' ')
        case = case.replace('@', ' ')
        case = case.replace('<', ' ')
        case = case.replace(':', ' ')
        case = case.replace('.', ' ')
        case = case.replace('(', ' ')
        case = case.replace(')', ' ')
        case = case.replace('[', ' ')
        case = case.replace(']', ' ')
        case = case.replace('_', ' ')
        case = case.replace(',', ' ')
        case = case.replace('#', ' ')
        case = case.replace('-', ' ')
        case = case.replace('/', ' ')
        case = case.replace('"', ' ')
        case = case.replace('\n', ' ')
        case = case.replace('~', ' ')
        case = re.sub(r'\d+', ' ', case)
        cleanedUpCases.append(case)
    return cleanedUpCases

In [93]:
cleanedUpCases=cleaupCaseData(cases)
print(cleanedUpCases[1:10])
print(len(cleanedUpCases))

['it appears while the turbine is active power is moving around the wind speed sensor is returning a flatline value   update        appears the anemometer issues have been resolved as of       at    am', 'since        the following tags are indicating intermittent flatlines  ido wtg  mdk  bt  xq  m hsrot tmp  ido wtg  mdk  bt  xq  m imsgen tmp  and ido wtg  mdk  bt  xq  m imsrot tmp actual temperatures are acceptable when tags are not flatlining     degc   update        flatlines cleared  closing case ', 'beginning aug    wind turbine   appears curtailed at    kw with wind speed between      m s ', 'during higher wind conditions the turbine is currently limited to     mw', 'since        wind turbine   appears to be curtailed at    kw with wind speed between      m s   update        wind turbine   appears to be curtailed since      currently  active power is    kw   curtailment cleared ', 'on        generator bearing temperature remained above the model estimate for    hours the maximum

### Create a dictioary of the words and the frequency count
Vocabulary is list of the features obtained using the countVectorizer

Using the dtm to do this
Word->count across all the cases

In [94]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(min_df=0.001,stop_words=stopwordList,strip_accents='unicode',binary=False)
rawDtm = count_vect.fit_transform(cleanedUpCases)
print("Data dimensions: {}".format(rawDtm.shape))

#vocabulary is the list of the feature names
vocab=count_vect.get_feature_names()
print(len(vocab))

#convert the dtm to np array this will enable us to perform the colsum and rowsum
countDtm = rawDtm.toarray()
countDtm=np.array(countDtm)
#calculate the frequecny sum for all the tokens
freqsum=np.sum(countDtm,axis=0)

# np.amax(freqsum)
# np.amin(freqsum)


freqDict={}
for idx,v in enumerate(vocab):
    #freqDict.append({'word':v,'count':freqsum[idx]})
    freqDict[v]=freqsum[idx]
#print(freqDict)



Data dimensions: (11901, 582)
582


### Addding POS words based on maxent algorithm 
1. read the pos distribution file (currently this is already supplied)
2. We will pick only the nouns and pronouns
3. we will compare the list of pos with the unigrams that we already have
4. add the new words to the unigrams
5. we will keep track of the frequency of the new unigrams as well

In [95]:
import csv
posWords=[]
posWordsDict={}
with open("posDist.csv", 'r') as csvfile:
    posReader = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in posReader:
        ch=row[7].replace('"','')
        if(ch=="N"):
            str=row[1].replace('"', '')
            posWords.append(str)
            posWordsDict[str]=row[3].replace('"','')
        if(ch=="J"):
            str = row[1].replace('"', '')
            posWords.append(str)
            posWordsDict[str] = row[2].replace('"', '')


In [96]:
#the utility function to check for the existence of the word in the list
def my_in_array(word,wordlist):
    # print(type(word))
     for key in wordlist:
        if(key==word):
            return(True)
     return(False)


In [97]:
posWordsToBeAdded=[]
#keep the threshold to the currently minimum frequency of the unigrams
minCount=np.amin(freqsum)
#print(minCount)
for k in posWordsDict:
    if (my_in_array(k,unigrams)==False):
        if(my_in_array(k,stopwordList)==False):
            if(int(posWordsDict[k])>minCount):
                print(k,posWordsDict[k])
                posWordsToBeAdded.append(k)
                #add to the exiting frequecy dictionary of the unigram
                freqDict[k]=int(posWordsDict[k])

len(posWordsToBeAdded)

ho 21
angle 20
desuperheater 15
srv 18
wsao 17
inho 39
brgx 13
behaviour 14
teg 15
device 16
damper 27
cell 15
fcv 15
pulverizer 15
calibration 13
lcv 18
reservoir 20
trucks 14
bed 16
charge 16
mark 14
intensity 17
horiz 14


23

#### Creating the sorted list of the freqdict
This is based on the suggestions given at http://stackoverflow.com/questions/613183/sort-a-python-dictionary-by-value


In [98]:
from operator import itemgetter
sortedFreqDict=sorted(freqDict.items(), key=itemgetter(1),reverse=True)

#### utility functions to access the frequency of a keyword or given a frequecny which all keywords are mapped

In [99]:

def getFreqOfKeyword(freqDict,keyword):
    for key in freqDict:
        if(key==keyword):
            return(freqDict[key])


def getKeywordBasedOnFrequency(freqDict,freq):
    for key in freqDict:
        if (freqDict[key]==freq):
            return (key)



print(getFreqOfKeyword(freqDict,"disch"))
print(getKeywordBasedOnFrequency(freqDict,81))
print(getKeywordBasedOnFrequency(freqDict,np.amin(freqsum)))
print(getKeywordBasedOnFrequency(freqDict,np.amax(freqsum)))

81
disch
noise
bearing


#### The unigrams are same as the keys of the freqDist


In [100]:
unigrams=[]
for k in freqDict:
    unigrams.append(k)
len(unigrams)

605

### Function to find the bigrams
We are using the nltk bigramcollocator for this purpose .. this is based on 
http://stackoverflow.com/questions/21844546/forming-bigrams-of-words-in-list-of-sentences-with-python

Basically for the given case it will find out the bigrams in that case

In [101]:
def get_bigrams(myString):
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(myString)
    ''' stopwordlist is the one that we read in the initialize phase'''
    cleanedTokens=[x for x in tokens if x.lower() not in stopwordList]
    # print(tokens)
    #stemmer = PorterStemmer()
    bigram_finder = BigramCollocationFinder.from_words(cleanedTokens)
    #bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5)
    #print(bigrams)

    '''Returns the list of the bigrams for the case in the form of a dictioary of bigram and its count
    e.g. dict_items([(('power', 'wind'), 1), (('speed', 'sensor'), 1), (('anemometer', 'issues'), 1)]),
    '''
    return bigram_finder.ngram_fd.items()
    

#### Function to find the trigrams 
this is again based on http://stackoverflow.com/questions/21844546/forming-bigrams-of-words-in-list-of-sentences-with-python

In [102]:
def get_trigrams(myString):
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(myString)
    ''' stopwordlist is the one that we read in the initialize phase'''
    cleanedTokens=[x for x in tokens if x.lower() not in stopwordList]
    trigram_finder = TrigramCollocationFinder.from_words(cleanedTokens)
    #trigrams = trigram_finder.nbest(TrigramAssocMeasures.chi_sq, 5)
   
    '''Returns the list of the trigrams for the case in the form osf a dictioanry of trigram and its count'''
    return trigram_finder.ngram_fd.items()
   

#### Now find the bigrams using the get_bigrams()

In [103]:
'''for all the cases find out the bigrams'''
bigramFeatures=[]
for case in cleanedUpCases:
    #need to make sure that the case does not have stop words
    bigramFeatures.append(get_bigrams(case))

print(len(bigramFeatures))


'''Now combine the individual dictioary items together..also count the occurences of the same bigrams'''
bigramCounts={}
for bigramFeature in bigramFeatures:
    for k, v in bigramFeature:
        key=(' '.join(k))
        if key not in bigramCounts.keys():
            bigramCounts[key]=v
        else:
            bigramCounts[key]+=v

print(len(bigramCounts))


'''
Now we will create teh final list of bigrams..they should have the following property
1. The frequency of the bigram should be more than 20
2. both the parts should be unigrams individually
3. both the parts should not be same 'temp temp' is not valid
'''
finalBigramFeatures=[]
for key in bigramCounts:
    if(bigramCounts[key]>20):
        #print(key)
        arrBigrams=key.split()
        if(arrBigrams[0]!=arrBigrams[1]):
            #iff the individual words are part of unigrams then only add it
            if((arrBigrams[0] in unigrams) and (arrBigrams[1] in unigrams)):
                finalBigramFeatures.append(key)

#finalBigramFeatures
print(len(finalBigramFeatures))

11901
31653
732


#### similarly for the trigrams

In [104]:
'''Get the dictioanries for individual cases'''
trigramFeatures=[]
for case in cleanedUpCases:
    trigramFeatures.append(get_trigrams(case))
print(len(trigramFeatures))

'''Combine the dictionaries to get the counts of individual trigrams'''
trigramCounts={}
for trigramFeature in trigramFeatures:
    for k, v in trigramFeature:
        key=(' '.join(k))
        if key not in trigramCounts.keys():
            trigramCounts[key]=v
        else:
            trigramCounts[key]+=v

print(len(trigramCounts))


'''
Now we will create teh final list of trigrams..they should have the following property
1. The frequency of the trigrams should be more than 20
2. all the parts should be unigrams individually
3. all the parts should not be same 'temp temp temp' is not valid
'''
finalTrigramFeatures=[]
for key in trigramCounts:
    if(trigramCounts[key]>20):
        #print(key)
        arrTrigrams=key.split()
        if(arrTrigrams[0]!=arrTrigrams[1]!=arrTrigrams[2]):
            #iff the individual words are part of unigrams then only add it
            if((arrTrigrams[0] in unigrams) and (arrTrigrams[1] in unigrams) and (arrTrigrams[2] in unigrams)):
                finalTrigramFeatures.append(key)

#finalBigramFeatures
print(len(finalTrigramFeatures))



11901
60634
225


## Add the spacy code for the bigrams and trigrams here

In [106]:
'''Utility function to cleanup the nounphrases'''

def cleanupNounPhrase(str):
    str = str.strip();
    str = re.sub('/[^A-Za-z0-9 _\-\+\&\,\#]/', '', str)
    str = str.replace('"', ' ')
    str = str.replace('\"', ' ')
    str = str.replace(')', ' ')
    str = str.replace('(', ' ')
    str = str.replace('>', ' ')
    str = str.replace('@', ' ')
    str = str.replace('<', ' ')
    str = str.replace(':', ' ')
    str = str.replace('.', ' ')
    str = str.replace('[', ' ')
    str = str.replace(']', ' ')
    str = str.replace('_', ' ')
    str = str.replace(',', ' ')
    str = str.replace('#', ' ')
    str = str.replace('-', ' ')
    str = str.replace('/', ' ')
    str = str.replace('"', ' ')
    str = str.replace('\n', ' ')
    str = str.replace('~', ' ')
    str = re.sub(r'\d+', ' ', str)
    word_sent = [word for word in str.lower().split(" ") if word not in stopwordList and len(word)>1]
    if (len(word_sent) > 1):
        finalSent = ' '.join(word_sent)
    else:
        finalSent=''

    return(finalSent.strip())





In [107]:

from spacy.en import English
nlp = English()

docs = [nlp(d) for d in cases]

'''Get the noun phrases'''
arrNounPhrase=[]
for idx, doc in enumerate(docs):
    testDoc=doc
    token_nounPhrases = [np.text for np in testDoc.noun_chunks]
    updatedNounPhrases=[]
    for tt in token_nounPhrases:
        finalSent=cleanupNounPhrase(tt)
        if(len(finalSent)>0):
            updatedNounPhrases.append(finalSent)
    arrNounPhrase.append(updatedNounPhrases)
print(len(arrNounPhrase))

'''
Flattening the arrNounPhrase
'''
flatten = lambda l: [item for sublist in l for item in sublist]
flatNounPhraseList=flatten(arrNounPhrase)
print("after flatteing noun phrases",len(flatNounPhraseList))

'''Now we need to tabulate the results'''
nounPhraseDictCount={}
import re
#now we can apply the cleanup
for tt in flatNounPhraseList:
    finalSent=tt
    if (nounPhraseDictCount.get(finalSent)):
        nounPhraseDictCount[finalSent] += 1
    else:
        nounPhraseDictCount[finalSent] = 1

print(len(nounPhraseDictCount))

'''out of all the nounphrases pick up the once which are more frequncet'''
finalPOSList={}
for dd in nounPhraseDictCount:
    ddArr=dd.split(" ")
    if(len(ddArr)>1):
        #atelaset 10 occurence of the phrase
        if(nounPhraseDictCount[dd]>9):
            #print(dd,nounPhraseDictCount[dd])
            finalPOSList[dd]=nounPhraseDictCount[dd]
            #finalPOSList.append()

print(len(finalPOSList))


11901
after flatteing noun phrases 22785
8182
probe sample 10
turb brg 11
wheelspace differential 10
rr notification 11
hydrogen pressure 10
diff press 11
positive residuals 36
lp drum 35
steam silica 14
bearing temps 68
brg vib 24
drive generator bearing temperature 11
suction temp 10
condensate pump 13
wind speed 89
wind turbines 16
seal gas filter 15
lube oil temp 40
brg mtl temp 17
turbine temperature wheelspace 16
deep cut 19
model prediction 317
sh dsh outlet temperature 10
gt load 10
bearing vib 13
motor ob bearing temp 12
duct burner spread 27
inlet temp 18
vibration issues 15
lo pressure 22
temperature issues 16
lube oil 22
lp vib 12
chip detectors 12
condition persists 20
nde bearing temperature 24
axial position sensor 15
water injection flow 20
gen bearing 11
unit start 76
ib bearing temp 16
motor winding temp 12
metal temp 91
erratic values 25
thrust bearing temperature 19
performance tag 25
fw flow 19
cylinder exhaust temperature 14
lo temperature 22
gen brg 13
nde vibrat

In [117]:
'''
need to further check 2 things 
1. Each of the individual word is a unigrams
2. The nounphrase is not already present in the bigrams or trigrams 
'''

listPOSNounPhrases=[]
for k in finalPOSList:
    listPOSNounPhrases.append(k)

finallyCheckedNounPhrases=[] 
finalQuadgramFeatures=[]
for key in listPOSNounPhrases:
    arrNgrams=key.split()
    flag=True
    length=len(arrNgrams)
    #print(key)
    for i in range(length):
        #print(i)
        if(arrNgrams[i] not in unigrams):
            flag=False
                
    if (flag==True):
        if(length==2):
            #check the presence in bigrams
            if(key not in finalBigramFeatures):
                finallyCheckedNounPhrases.append(key)
                finalBigramFeatures.append(key)
        if(length==3):
            #check the presence in trigrams
            if(key not in finalTrigramFeatures):
                finallyCheckedNounPhrases.append(key)
                finalTrigramFeatures.append(key)
        if(length==4):
            finallyCheckedNounPhrases.append(key)
            finalQuadgramFeatures.append(key)

print(len(finallyCheckedNounPhrases))
print(finallyCheckedNounPhrases)

85
['elevated vibrations', 'pressure steam tag', 'nde seal outlet flow', 'cylinder exhaust temperature', 'seal oil pressure', 'rr notification', 'vibration issues', 'turbine outlet temp', 'inlet position', 'agb chip detector', 'driver thrust bearing', 'gen slipring bearing vib', 'gt vibration transducer', 'lube oil filter dp', 'rotor vib brg', 'casing vibrations', 'mechanical model', 'de seal outlet flow', 'exhaust duct pressure', 'rear gen bearing temp', 'observed temperature', 'power turbine positions', 'nde vibration', 'air compressor', 'pump ib brg temp', 'driver load', 'turbine exhaust pressure', 'water jacket inlet pressure', 'probe sample', 'intercooler coolant pressure', 'wheelspace differential', 'generator de bearing temperature', 'compressor outlet temp range', 'fw pmp axial probe', 'gearbox oil temperature', 'current conditions', 'motor winding temperature', 'rotor speed', 'compressor speed', 'motor ob bearing temp', 'axial positions', 'intercooler outlet temperature', 'thr

In [118]:
'''Following is the length of variosu ngrams'''
print("unigram length->",len(unigrams))
print("bigrams length->",len(finalBigramFeatures))
print("trigrams length->",len(finalTrigramFeatures))
print("quadgrams length->",len(finalQuadgramFeatures))

unigram length-> 605
bigrams length-> 769
trigrams length-> 252
quadgrams length-> 21


#### We now have to normalize the ngrams using their stemmed version

The steps are as follows
1. First create a map of stem and all the words that match this stem 
e.g. 'increas': ('increase', 895, 'increasing', 676, 'increases', 313)

2. Create a word to stem mapping. 
e.g increase: increas

3. Create a map of final stem to word map that is all the words that match this stem should be replaced by this word e.g.

increas : increase

decreas: decreased

    This will use the map that was created in step 1

4. now use the maps in step 2 and 3 to find the final dictionary

word1->stem
stem->normalizedword

will give

word1->nomralized word

increasing : increase
increases : increase



In [120]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

''' 
Create a dictioanry of stem-> all words
{ 'decreas': ('decreased', 1149, 'decrease', 290, 'decreasing', 203, 'decreases', 91),...}
'''
stemmingDictBasedOnFreq={}
for k,v in sortedFreqDict:
    stemKey=stemmer.stem(k)
    if (stemKey not in stemmingDictBasedOnFreq):
        stemmingDictBasedOnFreq[stemKey]=(k,v)
    else:
        stemmingDictBasedOnFreq[stemKey]+=(k,v)

print(len(stemmingDictBasedOnFreq))

'''
Create teh stem to word mapping. For all the stems that were found earlier we need to find which word will replace it 
THis word will be selected from the list of words that map to the same stem
increas : increase

'''
stemmingDictFinal={}
for k in stemmingDictBasedOnFreq:
    #print(k)
    #take the fisrst entry from the list that is stored for each stem value
    stemmingDictFinal[k]=list(stemmingDictBasedOnFreq[k])[0]

print(len(stemmingDictFinal))

'''
Create a word to stem mapping. For each unigram it will be created
'''
#word->stem mapping
wordToStemMapping={}
for k in unigrams:
    #get the stem of the word
    stemKey=stemmer.stem(k)
    #create a map of word to its stem
    wordToStemMapping[k]=stemKey

print(len(wordToStemMapping))


479
479
605


#### Now get the normalized version of each unigram using the stems map
We will also make use of the somain information to cheange some of the unigrams like rot to rotor that cannot be done using the stem mapping

In [122]:
normalizedUnigramsDict={}
for k in wordToStemMapping:
    stem=wordToStemMapping[k]
   #print(k,stem,stemmingDictFinal[stem])
    normalizedUnigramsDict[k]=stemmingDictFinal[stem]

print(len(normalizedUnigramsDict))

#bring in the domain information as well
for k in normalizedUnigramsDict:
    for j in domainDict:
        if (k==j):
           # print(k,j,domainDict[j])
            normalizedUnigramsDict[k]=domainDict[j]

print(normalizedUnigramsDict['pmp'])

605
pump


#### Normalize bigrams and trigrams
For the bigrams and trigrams it is basically normalizing the individual parts of the ngram
Create a generic function for the same


In [123]:
'''
Takes the following
ngramListToNormalize: The bigram or trigram input list to normalize
normalizedUnigramDict: The unigram normalized dictionary containing raw->normalized unigram
N: ngrams n
'''
def createNormalizedWordDict(ngramListToNormalize,normalizedUnigramDict,N):
    normalizedNgramsDict={}
    for k in ngramListToNormalize:
        arrgrams=k.split()
        str=''
        for i in range(N):
            str+=' '+normalizedUnigramDict[arrgrams[i]]
        normalizedNgramsDict[k]=str.strip()
    return normalizedNgramsDict


In [124]:
normalizedBigramsDict=createNormalizedWordDict(finalBigramFeatures,normalizedUnigramsDict,2)
normalizedTrigramsDict=createNormalizedWordDict(finalTrigramFeatures,normalizedUnigramsDict,3)
normalizedQuadgramsDict=createNormalizedWordDict(finalQuadgramFeatures,normalizedUnigramsDict,4)

print(len(finalBigramFeatures), len(normalizedBigramsDict))
print(len(finalTrigramFeatures), len(normalizedTrigramsDict))
print(len(finalQuadgramFeatures), len(normalizedQuadgramsDict))

769 769
252 252
21 21


#### Combine the dictioaries together

In [125]:
finalNgramDict=normalizedUnigramsDict.copy()
finalNgramDict.update(normalizedBigramsDict)
finalNgramDict.update(normalizedTrigramsDict)
finalNgramDict.update(normalizedQuadgramsDict)

print(len(finalNgramDict))

1647


### Dump the dictionary to a file

#### Option1: pickle the file

In [126]:
import pickle

output = open('outputDict.txt', 'ab+')
pickle.dump(finalNgramDict, output)
output.close()

#### Option2: write to a csv file

In [127]:
import csv
with open('dict.csv', 'w') as csv_file:
    writer = csv.writer(csv_file)
    for key in finalNgramDict:
        writer.writerow([key,finalNgramDict[key]])

In [86]:
# for k,v in finalNgramDict.items():
#     if (k=="metal temp reading"):
#         print(k,v)
