In [1]:
import os
import sklearn
import pandas as pd
import unidecode
import csv

In [2]:
# n-gram size setting.
# NOTE(review): no other visible cell reads this name; preprocessing() carries
# its own literal 5 -- presumably the two are meant to agree. Confirm.
n_gram_value = 5

In [3]:
def get_ngrams(input_list, n):
    """Return every run of ``n`` consecutive items of ``input_list`` as tuples.

    The result is whatever ``zip`` produces over the ``n`` progressively
    shifted slices of ``input_list``; a sequence shorter than ``n`` (or
    ``n == 0``) yields nothing. ``input_list`` must support slicing.
    """
    shifted_views = []
    for offset in range(n):
        # View number `offset` starts `offset` items into the sequence.
        shifted_views.append(input_list[offset:])
    return zip(*shifted_views)


In [4]:
def getAllPossibleWords(file_path):
    """Read ``file_path`` and return its cleaned whitespace-separated tokens.

    Cleaning applied to every token, in this order:
      1. remove each of  `  [  ]  "  .  ''  ,
      2. replace ``&`` with ``And``
      3. drop the token if nothing is left

    Returns a real ``list`` (callers take ``len()`` of it and index into it).

    The work is done eagerly, token by token. The previous implementation
    chained ``map(lambda x: x.replace(c, ""), ...)`` inside a ``for c`` loop;
    with a lazy ``map`` every lambda sees only the final value of ``c``, so
    only "," was ever stripped, and the returned ``filter`` object could be
    consumed just once.
    """
    strip_chars = ['`', "[", "]", "\"", ".", "''", ","]

    possible_words = []
    with open(file_path) as f:
        for line in f:
            for word in line.split():
                for c in strip_chars:
                    word = word.replace(c, "")
                word = word.replace('&', 'And')
                # Tokens made only of stripped punctuation end up empty.
                if len(word) > 0:
                    possible_words.append(word)

    return possible_words

In [5]:
def getListFromFiles(filename):
    """Return all whitespace-separated tokens of a file as a flat list.

    ``filename`` is joined onto the current working directory before being
    opened; tokens keep the order in which they appear in the file.
    """
    with open(os.path.join(os.getcwd(), filename)) as handle:
        return [token for text_line in handle for token in text_line.split()]

In [6]:
def writeToCsv(processed_list):
    """Write ``processed_list`` (an iterable of rows) to ``output.csv``.

    The file is created in the current working directory, overwriting any
    existing one, using the csv module's 'excel' dialect.

    The open mode depends on the interpreter: Python 3's csv writer emits
    text and must be given a text-mode file opened with ``newline=''``
    (the original ``'wb'`` raises ``TypeError`` there), while Python 2's csv
    writer expects a binary-mode file.
    """
    import sys

    if sys.version_info[0] >= 3:
        resultFile = open("output.csv", 'w', newline='')
    else:
        resultFile = open("output.csv", 'wb')

    with resultFile:
        wr = csv.writer(resultFile, dialect='excel')
        wr.writerows(processed_list)

In [7]:
def generateFeatures(processed_list,possible_words):
    """Append three binary company-name features to every candidate row, in place.

    Each row of ``processed_list`` is a list whose first element is the
    candidate name. ``[hasCompanyid, hasCompanyNameFirst, hasNameSubstring]``
    is appended to each row with ``extend``; nothing is returned.

    Features (rows are processed in order):
      hasCompanyid        -- the last word is listed in company-suffixes.txt
                             or the first word is listed in
                             company-prefixes.txt. Such names are registered
                             as "known companies" for the rows that follow.
      hasCompanyNameFirst -- a non-prefix word of the name equals a whole
                             known company name (with its leading prefix
                             word removed), or the name's first word equals
                             the first word of a known company.
      hasNameSubstring    -- the name shares at least one word with some
                             known company.

    A row whose name contains no words gets ``[0, 0, 0]`` (it used to raise
    ``IndexError``).

    ``possible_words`` is not used; it is kept so existing callers keep working.

    The known companies are tracked in sets that grow as rows are registered,
    instead of rebuilding and rescanning a list for every row; the resulting
    flags are the same.
    """
    company_suffixes = set(getListFromFiles("company-suffixes.txt"))
    company_prefixes = set(getListFromFiles("company-prefixes.txt"))

    known_stripped_names = set()  # known names with a leading prefix word removed
    known_first_words = set()     # first word of every known name
    known_words = set()           # every word of every known name

    for company in processed_list:
        name = company[0]
        words = name.split()

        if not words:
            # Nothing to look up for an empty name.
            company.extend([0, 0, 0])
            continue

        # Feature 1: name carries a company suffix or prefix.
        hasCompanyid = 0
        if words[-1] in company_suffixes or words[0] in company_prefixes:
            hasCompanyid = 1
            # NOTE(review): the name is registered BEFORE features 2 and 3 are
            # evaluated, so a name flagged here always matches itself below.
            # This mirrors the original behaviour -- confirm it is intended,
            # since the feature comments say "seen before".
            if words[0] in company_prefixes:
                known_stripped_names.add(" ".join(words[1:]))
            else:
                known_stripped_names.add(name)
            known_first_words.add(words[0])
            known_words.update(words)

        # Feature 2: a word equals a known (prefix-stripped) company name, or
        # the first word matches the first word of a known company.
        hasCompanyNameFirst = 0
        non_prefix_words = [word for word in words if word not in company_prefixes]
        if any(word in known_stripped_names for word in non_prefix_words):
            hasCompanyNameFirst = 1
        elif words[0] in known_first_words:
            hasCompanyNameFirst = 1

        # Feature 3: at least one word in common with a known company.
        hasNameSubstring = 0
        if not known_words.isdisjoint(words):
            hasNameSubstring = 1

        company.extend([hasCompanyid,hasCompanyNameFirst,hasNameSubstring])

In [8]:
def _strip_markup(ngram):
    """Join the words of ``ngram`` with spaces after removing the markup tags."""
    return ' '.join(
        word.replace("<markup>", "").replace("</markup>", "") for word in ngram
    )


def preprocessing(possible_words,doc_id,max_ngram=5):
    """Build labelled candidate rows from the tokens of one document.

    Every n-gram of ``possible_words`` is examined, from ``max_ngram`` words
    down to single words. Each accepted candidate becomes a row
    ``[name, doc_id, start_index, end_index, label]`` where the indices are
    positions in ``possible_words``.

    Label 1 (positive):
      * multi-word n-grams whose first word contains ``<markup>``, whose last
        word contains ``</markup>``, that contain exactly one ``<markup>``,
        and whose every word starts upper-case or contains "markup";
      * single words containing both ``<markup>`` and ``</markup>``.
      The tags are removed from the stored name.

    Label 0 (negative): n-grams in which every word starts upper-case and
    none contains "markup", provided that
      * for single words: the word is in neither company-suffixes.txt nor
        common-words.txt, is not the last token, and the following token
        starts lower-case;
      * for longer n-grams: at least one word is not in common-words.txt.

    Parameters
    ----------
    possible_words : iterable of str
        Document tokens. It is copied into a list because ``len()`` and
        positional look-ahead are needed.
    doc_id : any
        Stored unchanged in every row.
    max_ngram : int, optional
        Longest n-gram considered (default 5, the previously hard-coded value).

    Returns
    -------
    list of list
        Rows ordered by n-gram length (longest first), then by position.
    """
    possible_words = list(possible_words)
    suffix_list = set(getListFromFiles("company-suffixes.txt"))
    common_words = set(getListFromFiles("common-words.txt"))
    preprocessed_list = []
    word_number = len(possible_words)

    for i in range(max_ngram, 0, -1):
        # word_count is the position of the n-gram's first word.
        for word_count, ngram in enumerate(get_ngrams(possible_words, i)):
            last_index = word_count + i - 1

            # Marked-up multi-word name: tags open on the first word and close
            # on the last, with no second opening tag in between.
            if (len(ngram) > 1 and '<markup>' in ngram[0] and '</markup>' in ngram[-1]
                    and all((word[:1].isupper() or "markup" in word) for word in ngram)
                    and sum("<markup>" in word for word in ngram) == 1):
                # Tags are stripped eagerly; a lazy map() over lambdas closing
                # on the loop variable would only remove the last tag.
                preprocessed_list.append([_strip_markup(ngram), doc_id, word_count, last_index, 1])

            # Marked-up single word, e.g. <markup>Microsoft</markup>.
            elif len(ngram) == 1 and '</markup>' in ngram[0] and '<markup>' in ngram[0]:
                preprocessed_list.append([_strip_markup(ngram), doc_id, word_count, last_index, 1])

            # Negative candidates: capitalised words with no markup at all.
            # word[:1] (rather than word[0]) treats an empty token as
            # "not upper-case" instead of raising.
            elif all(word[:1].isupper() and "markup" not in word for word in ngram):
                if i == 1:
                    keep = (
                        ngram[0] not in suffix_list
                        and word_number > word_count + 1
                        and ngram[0] not in common_words
                        # keep the unigram only if the next word is lower-case
                        and possible_words[word_count + 1][:1].islower()
                    )
                else:
                    # drop n-grams made up entirely of common words
                    keep = any(word not in common_words for word in ngram)

                if keep:
                    preprocessed_list.append([' '.join(ngram), doc_id, word_count, last_index, 0])

    return preprocessed_list

In [9]:
# Driver: turn every document in <parent of cwd>/Dataset/Rahul/Final into
# feature rows and write them all to output.csv.
base_directory = os.path.dirname(os.getcwd())
data_directory = os.path.join(base_directory,"Dataset","Rahul","Final")
# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of output.csv reproducible from run to run.
file_list = sorted(os.listdir(data_directory))
possible_words=[]
processed_list=[]
count=0  # not used by the visible cells; kept in case a later cell reads it
for file_name in file_list:
    # The first three characters of the file name are taken as the document id.
    doc_id = file_name[0:3]
    possible_words = getAllPossibleWords(os.path.join(data_directory,file_name))

    # Rows have the structure [word, doc_id, word_count_start, word_count_end, label];
    # generateFeatures() then appends the three feature flags to each row in place.
    processed_list_for_doc = preprocessing(possible_words,doc_id)
    generateFeatures(processed_list_for_doc,possible_words)
    processed_list.extend(processed_list_for_doc)

# Write once, after all documents are processed, instead of rewriting the whole
# cumulative file on every iteration. As before, nothing is written when the
# directory is empty.
if file_list:
    writeToCsv(processed_list)
  

In [11]:
# Disabled scratch check of the <markup> tag stripping on a sample bigram.
# The code is wrapped in a string literal, so none of it runs; evaluating the
# cell only echoes the string.
'''
preprocessed_list1=[]
ngram = ('<markup>Barclays', 'Plc</markup>')
doc_id=1
word_count=1
i=2
for string in ["<markup>","</markup>"]:
    ngram = map(lambda x : x.replace(string,""),ngram)

print(' '.join(ngram))
preprocessed_list1.append([' '.join(ngram),doc_id,word_count,word_count+i-1,1])
preprocessed_list1
'''

'\npreprocessed_list1=[]\nngram = (\'<markup>Barclays\', \'Plc</markup>\')\ndoc_id=1\nword_count=1\ni=2\nfor string in ["<markup>","</markup>"]:\n    ngram = map(lambda x : x.replace(string,""),ngram)\n\nprint(\' \'.join(ngram))\npreprocessed_list1.append([\' \'.join(ngram),doc_id,word_count,word_count+i-1,1])\npreprocessed_list1\n'