# In[ ]:
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import inflect
import os
import time
import nltk
from nltk.corpus import stopwords
from os.path import expanduser as ospath


# In[ ]:
def read_and_save_text_file(year):
    """Build the word2vec text corpus for one year of USPTO patents.

    Every workbook in the USPTO data directory whose file name ends with
    ``<year>.xlsx`` is loaded (sheet ``Sheet1``, first column as index),
    filtered with ``eliminate_na`` and tokenised with
    ``pre_processing_text_w2v``.  The abstracts and claims gathered from all
    matching workbooks are then handed to ``text_doc`` in a single call.

    Parameters
    ----------
    year : int or str
        Year suffix used to select the workbooks; also forwarded to
        ``text_doc``.

    Returns
    -------
    None
        The elapsed wall-clock time is printed when the run finishes.
    """
    start = time.time()

    path = ospath('~/final_project/C_Feature_extraction/Data_for_w2v/USPTO/')
    suffix = str(year) + '.xlsx'
    abstracts, claims = [], []

    for file_name in os.listdir(path):
        if not file_name.endswith(suffix):
            continue
        print(file_name)
        # os.listdir() returns bare file names, so the directory has to be
        # joined back on; otherwise the workbook is only found when the
        # current working directory happens to be the data directory.
        dataset = pd.read_excel(
            os.path.join(path, file_name), sheet_name='Sheet1', index_col=0
        )
        dataset = eliminate_na(dataset)
        abstract, claim = pre_processing_text_w2v(dataset)
        abstracts.extend(abstract)
        claims.extend(claim)

    text_doc(abstracts, claims, year)
    end = time.time()
    print('time to complete all the passages ', end - start)


# Left-over markup tags, e.g. "<p>" or "</b>".
_TAG_RE = re.compile(r'<.*?>')
# Hexadecimal character references such as "&#x2014;", "&#x201c;" or "&#xFC;".
# This is a superset of the former r'&#x[a-z]*[0-9]+;', which missed
# references ending in a letter or written in upper case.
_CHAR_REF_RE = re.compile(r'&#x[0-9a-z]+;', re.IGNORECASE)
# Words, optionally joined once by a hyphen or a backslash.  findall() on this
# pattern yields the same tokens as nltk's RegexpTokenizer built from it.
_TOKEN_RE = re.compile(r'\w+(?:[-\\]\w+)?')
# The raw text carries line breaks as the two characters backslash + "n".
_LITERAL_NEWLINE = '\\n'
_CLAIM_SEPARATOR = '\\n\\n\\n'
_ASCII_DIGITS = frozenset('0123456789')


def _blank_noise_tokens(tokens):
    """Replace tokens of at most two characters or starting with a digit by ''."""
    return [
        '' if (len(tok) <= 2 or tok[0] in _ASCII_DIGITS) else tok
        for tok in tokens
    ]


def pre_processing_text_w2v(dataset):
    """Tokenise the abstracts and claims of a patent table.

    Abstracts: markup tags and hexadecimal character references are
    removed, literal ``\\n`` sequences become spaces, the text is
    lower-cased and split into word tokens (hyphenated words are kept whole).

    Claims: the text is lower-cased and split into individual claims on the
    literal ``\\n\\n\\n`` separator; each claim is cleaned the same way and
    its first token (the claim number) is dropped.

    In both cases tokens that start with a digit or are at most two
    characters long are replaced by ``''`` (they are blanked, not removed).
    No stemming or stop-word removal is performed.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide the columns ``"abstract"`` and ``"claims_text"``.

    Returns
    -------
    abstracts : list of list of str
        One token list per row of ``dataset``.
    claims : list of list of str
        One token list per individual claim, over all rows.  Rows whose
        ``claims_text`` is not a string (e.g. NaN) contribute no claims.
    """
    start = time.time()
    abstracts, claims = [], []

    print(dataset.shape[0])

    rows = zip(dataset["abstract"], dataset["claims_text"])
    for i, (raw_abstract, raw_claims) in enumerate(rows):
        print(i, end=' ')

        # parse abstract
        text = _TAG_RE.sub('', str(raw_abstract))
        text = _CHAR_REF_RE.sub('', text)
        # A space (not '') keeps the words on both sides of a line break apart.
        text = text.replace(_LITERAL_NEWLINE, ' ').lower()
        abstracts.append(_blank_noise_tokens(_TOKEN_RE.findall(text)))

        # parse claims
        if not isinstance(raw_claims, str):
            # Missing cells come back from pandas as NaN/None, which have no
            # .lower(); such a row simply has no claims to add.
            continue
        for claim_text in raw_claims.lower().split(_CLAIM_SEPARATOR):
            claim_text = claim_text.replace(_LITERAL_NEWLINE, ' ')
            claim_text = _CHAR_REF_RE.sub('', claim_text)
            # [1:] drops the leading claim number.
            tokens = _TOKEN_RE.findall(claim_text)[1:]
            claims.append(_blank_noise_tokens(tokens))

    end = time.time()
    print('time to complete the pre_processing_text_w2v ', end - start)

    return abstracts, claims

    


def text_doc(abstracts, claims, year, output_dir=None):
    """Write abstracts and claims to ``text_us<year>.txt``, one document per line.

    Each document is a list of tokens; empty tokens are dropped and the rest
    are joined with single spaces.  A document is written only when the
    joined text contains a space, which skips empty documents and documents
    reduced to a single token.

    Parameters
    ----------
    abstracts : list of list of str
        Tokenised abstracts; written first.
    claims : list of list of str
        Tokenised claims; written after the abstracts.
    year : int or str
        Used to build the output file name.
    output_dir : str, optional
        Directory for the output file.  Defaults to the USPTO data directory
        under the user's home.

    Returns
    -------
    None
        The elapsed time and the number of tokens written are printed.
    """
    start = time.time()

    if output_dir is None:
        output_dir = os.path.expanduser(
            '~/final_project/C_Feature_extraction/Data_for_w2v/USPTO/'
        )
    out_path = os.path.join(output_dir, 'text_us' + str(year) + '.txt')

    num_tokens = 0
    # "with" closes the file even if a write fails; an explicit UTF-8 encoding
    # keeps non-ASCII characters in patent text from breaking the write on
    # platforms whose default encoding is narrower.
    with open(out_path, 'w', encoding='utf-8') as text_file:
        for tokens in abstracts + claims:
            kept = [tok for tok in tokens if tok]  # drop blanked tokens
            doc = ' '.join(kept)
            if " " not in doc:
                continue
            text_file.write(doc + '\n')
            # Count tokens (not characters) of the documents actually written.
            num_tokens += len(kept)

    end = time.time()
    print('time to complete the text_doc function ', end - start)

    print('done', num_tokens)



def eliminate_na(dataset):
    """Drop patents that have no abstract or exactly one claim.

    A row is kept only when ``number_of_claims`` is not 1 and ``abstract``
    is not null.  The input frame is left untouched.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide the columns ``"number_of_claims"`` and ``"abstract"``.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, renumbered 0..n-1.
    """
    keep = (dataset['number_of_claims'] != 1) & pd.notnull(dataset["abstract"])
    # reset_index() returns a new frame; its result used to be discarded, so
    # the index was never actually reset.
    return dataset[keep].reset_index(drop=True)


    
# Script entry: build the corpus file for the 2013 workbooks.
read_and_save_text_file(year=2013)