# In [None]:  (notebook cell marker left over from the export)
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import inflect
import os
import time
from nltk.corpus import stopwords
from os.path import expanduser as ospath



def open_and_save_text_file(path=None):
    """Run the extraction pipeline on every file of an input directory.

    Each regular file is read as UTF-8, filtered with
    ``select_english_text``, tokenised with ``pre_processing_text_w2v`` and
    written out with ``text_doc``.  The name handed to ``text_doc`` is the
    input file name without its first two and its last four characters.

    Parameters
    ----------
    path : str, optional
        Directory holding the input files; a leading ``~`` is expanded.
        Defaults to the EPO sample directory used so far.

    Returns
    -------
    None
    """
    if path is None:
        path = '~/code_final_project/A_Data_Collection/EPO/Sample_Bulk_full_text_EPO/'
    directory = ospath(path)

    for file_name in os.listdir(directory):
        full_path = os.path.join(directory, file_name)
        # os.listdir also returns sub-directories, which open() cannot read.
        if not os.path.isfile(full_path):
            continue

        with open(full_path, encoding="utf8") as f:
            data = f.readlines()

        name = file_name[2:-4]
        list_text = select_english_text(data)
        abstracts, claims = pre_processing_text_w2v(list_text)
        text_doc(abstracts, claims, name)

def select_english_text(data):
    """Keep the English records that are not of an excluded text type.

    Every line is whitespace-normalised and then cut at fixed character
    offsets into patent number, kind, language, text type and text.  English
    records are kept unless their type is TITLE, DESCR, PDFEP, SRPRT or
    AMEND, which is intended to leave only abstracts and claims.

    Parameters
    ----------
    data : iterable of str
        Lines of the bulk text file, one record per line.

    Returns
    -------
    list of list
        One ``[pat_num, type_A_B, language, types, text]`` entry per kept
        record, in input order.
    """
    excluded_types = ('TITLE', 'DESCR', 'PDFEP', 'SRPRT', 'AMEND')

    list_text = []
    for patent in data:
        # Collapse tabs/newlines into single spaces *before* slicing, so the
        # fixed offsets below refer to the normalised line.
        patent = re.sub(r'\s+', ' ', str(patent))

        pat_num = patent[3:10]
        type_A_B = patent[11:13]
        language = patent[25:27]
        types = patent[28:33]
        text = patent[34:]

        if language == 'en' and types not in excluded_types:
            list_text.append([pat_num, type_A_B, language, types, text])

    return list_text

   

def _blank_or_lemmatize(tokens, stop_words, lemmatizer):
    """Blank out numeric and stop-word tokens, lemmatize the rest as verbs.

    Tokens whose first character is an ASCII digit, or that are in
    ``stop_words``, are replaced by ``''`` (positions are kept so the caller
    can filter them later); every other non-empty token is passed through
    ``lemmatizer.lemmatize(token, pos="v")``.
    """
    digits = '0123456789'
    result = []
    for token in tokens:
        if not token:
            result.append(token)
        elif token[0] in digits or token in stop_words:
            result.append('')
        else:
            result.append(lemmatizer.lemmatize(token, pos="v"))
    return result


def pre_processing_text_w2v(list_text):
    """Tokenise and normalise abstracts and claims for word2vec training.

    For every record the markup is replaced by spaces, the text is
    lower-cased and tokenised (words joined by one hyphen or back slash stay
    together).  Tokens starting with a digit and English stop words are
    replaced by empty strings; the remaining tokens are lemmatized as verbs.
    Claim records are first split into individual claims using the numbers
    found in their ``num="..."`` attributes.

    Parameters
    ----------
    list_text : list of list
        Records as produced by ``select_english_text``; index 3 holds the
        text type (``'ABSTR'`` or ``'CLAIM'``) and index 4 onwards the text.

    Returns
    -------
    abstracts : list of list of str
        One token list per abstract.
    claims : list of list of str
        One token list per individual claim.
    """
    start = time.time()

    stop_words = set(stopwords.words('english'))
    word_lemmatizer = WordNetLemmatizer()
    # keeps words with hyphen and words with back slash
    tokenizer = RegexpTokenizer(r'\w+(?:[-\\]\w+)?')

    abstracts, claims = [], []

    for record in list_text:
        text_type = record[3]

        # NOTE(review): the regexes below run on the str() of a list slice
        # (i.e. its repr), not on the bare text.  Kept as is so the produced
        # tokens do not change; confirm whether this is intended.
        if text_type == 'ABSTR':
            raw = str(record[4:])
            cleaned = re.sub(r'(<.*?>)|(-->)', " ", raw).lower()
            tokens = tokenizer.tokenize(cleaned)
            abstracts.append(
                _blank_or_lemmatize(tokens, stop_words, word_lemmatizer))

        elif text_type == 'CLAIM':
            raw = str(record[4:])
            # claim numbers taken from the num="..." attributes
            claim_numbers = re.findall(r'num="([0-9]+)"', raw)

            for idx, number in enumerate(claim_numbers):
                if idx + 1 < len(claim_numbers):
                    # text between this claim number and the next one
                    pattern = number + '(.*?)' + claim_numbers[idx + 1]
                else:
                    # last claim: everything up to the end of the record
                    pattern = number + '(.*?)$'
                fragments = re.findall(pattern, raw)

                claim = re.sub(r'<sub>(.*?)</sub>', "", str(fragments))  # subscript
                claim = re.sub(r'<sup>(.*?)</sup>', "", claim)  # superscript
                claim = re.sub(r'(<.*?>)|(-->)|(\\)|\bnum\b', " ", claim)
                # NOTE(review): the input is the repr of a list and therefore
                # starts with '[', so this anchored pattern looks like it can
                # never match; left untouched to preserve the output.
                claim = re.sub(r'^\s*?[0-9]+.\s*', "", claim).lower()

                tokens = tokenizer.tokenize(claim)
                claims.append(
                    _blank_or_lemmatize(tokens, stop_words, word_lemmatizer))

    end = time.time()
    print('time to complete the pre_processing_text_w2v', end - start)

    return abstracts, claims


def text_doc(abstracts, claims, name, output_dir=None):
    """Write abstracts and claims to a text file, one document per line.

    The file is called ``text_epo<name>.txt``.  Empty tokens are dropped and
    the rest joined with single spaces; a document is only written when the
    joined text contains a space, i.e. when at least two tokens are left.

    Parameters
    ----------
    abstracts : list of list of str
        Token lists of the abstracts.
    claims : list of list of str
        Token lists of the claims, written after the abstracts.
    name : str
        Suffix inserted into the output file name.
    output_dir : str, optional
        Directory receiving the file; a leading ``~`` is expanded.  Defaults
        to the ``Data_for_w2v`` directory used so far.

    Returns
    -------
    None
    """
    start = time.time()

    if output_dir is None:
        output_dir = '~/code_final_project/C_Feature_extraction/Data_for_w2v/'
    target = os.path.join(ospath(output_dir), 'text_epo' + name + '.txt')

    with open(target, 'w', encoding="utf-8") as text_file:
        for tokens in abstracts + claims:
            # filter(None, ...) removes the empty placeholder tokens
            doc = ' '.join(filter(None, tokens))
            # a space implies a non-empty text made of more than one token
            if " " in doc:
                text_file.write(doc + '\n')

    end = time.time()
    print('time to complete the text_doc function ', end - start)

    print('done')




    
# Entry point: runs the pipeline as soon as this cell/module is executed
# (there is no __main__ guard, so importing the module triggers it as well).
open_and_save_text_file()