### Code for identifying pages that contain keywords, finding the questions, extracting and translating them 

In [6]:
from collections import defaultdict
import pdfplumber
import pandas as pd
from deep_translator import GoogleTranslator
import PyPDF2
import re
import operator 
import numpy as np

def flatten_list(_2d_list):
    flat_list = []
    # Iterate through the outer list
    for element in _2d_list:
        if type(element) is list:
            # If the element is of type list, iterate through the sublist
            for item in element:
                flat_list.append(item)
        else:
            flat_list.append(element)
    return flat_list




#function to test different regex expressions
#so findall definitely works for matching aceented characters 
#problem is with the pdf extractor that we are using
def test_regex():
    matched = (re.findall('é','hé'))
    print("matched",matched)



#translator function - given an array of lines, translate each line in the array, add to array of translated lines,
#and return 
def translator(lines):
    translated_array  = []
    for i in lines:
        to_translate = i 
        translated  = GoogleTranslator(source='auto',target='en').translate(i)
        translated_array.append(translated)
    return translated_array 






#cleans text from any whitesace and can later be used to remove punctuation if necessary 
def clean(text):
    text = re.sub('\n','',str(text))
    text = re.sub('\n',' ',str(text))
    #removing punctuation 
    #text = re.sub(r'[^\w\s]','',text)
    return text 


#contains the kwyrods for each poverty question and translates them into the target language - change later so that 
#you can change the language we are using 
def keywords():
    questions_to_keywords={
    "holiday":"Can your whole household afford to go for a week’s annual holiday, away from home?",
    "vegetarian":"Can your household afford a meal with meat, chicken, fish(or vegetarian equivalent)?",
    "expense": "Can your household afford an unexpected required expense(amount to be filled) and pay through its own resources?",
    "telephone":"Does your household have a telephone(fixed landline or mobile)?",
    "colour TV":"Does your household have a color TV?",
    "washing machine":"Does the household have a washing machine? ",
    "van":"Does your household have a car/van for private use? ",
    "dwelling":"Do you have any of the following problems with your dwelling/accommodation? ",
    "warm":"Can your household afford to keep its home adequately warm?  "
    } 
    translated_keywords_dict = defaultdict()
    for key in questions_to_keywords.keys():
        translated_keywords_dict[GoogleTranslator(source='en', target='french').translate(key)] = []
    return translated_keywords_dict



    
#iterates through an array which contains page numbers, extracts each quesiton from that page, translates them into english,
#adds to an array, cleans data and adds to final array 
def translate_document(pages):
    pdf1 = pdfplumber.open("france.pdf")
    translated_array = []
    #pages = list(pages)
    # writing page 161 will translate page 162
    pages= pages
    for number in pages:
        p1 = pdf1.pages[number]
        im = p1.to_image()
        text = p1.extract_text()
        text = clean(text)
        #text = re.split('[?]',text)  
        text = re.findall('(?<=[\?\.\!]\s)[^\?\n\.]+?\?',text)
        clean_sent  = []
        for sent in text:
            clean_sent.append(sent) 
        #translated_array.append(clean_sent)
        translated_array.append(translator(clean_sent))
    return translated_array

 

def main():
    translated_keywords = keywords().keys()
    pages = []
    pdf = pdfplumber.open("france.pdf")
    for word in translated_keywords:
        word = word.lower()
        for i in range(0,len(pdf.pages)):
            page_number = pdf.pages[i]
            Text = page_number.extract_text()
            if re.findall(word,Text,re.IGNORECASE):
                pages.append(i)
    print(pages)
    clean_translations = flatten_list(translate_document(pages))
    #d = {'Translated Questions':translate_document(pages)}
    d = {'Translated Questions':clean_translations}
    DftranslatedDoc=pd.DataFrame(data =d)
    display(DftranslatedDoc) 
    DftranslatedDoc.to_csv('out_translation.csv',index=False) 


main()

        
    

[6, 8, 14, 18, 29, 31, 32, 41, 43, 45, 48, 55, 57, 58, 184, 188, 254, 274, 334, 336, 183, 186, 151, 185, 254, 258, 259, 261, 262, 3, 4, 12, 34, 107, 108, 109, 191, 332, 106, 107, 2, 4, 5, 6, 7, 14, 17, 18, 23, 25, 28, 30, 31, 32, 34, 35, 36, 37, 39, 40, 41, 43, 44, 45, 46, 48, 52, 55, 56, 57, 58, 63, 64, 69, 70, 71, 72, 73, 74, 77, 83, 84, 85, 86, 87, 88, 90, 91, 97, 98, 99, 105, 117, 124, 125, 126, 127, 128, 130, 131, 132, 134, 153, 157, 164, 165, 170, 173, 177, 179, 180, 185, 189, 194, 196, 197, 198, 199, 200, 201, 203, 206, 212, 218, 227, 228, 232, 241, 251, 252, 254, 255, 257, 258, 259, 260, 261, 262, 263, 270, 271, 272, 273, 274, 275, 279, 281, 283, 287, 288, 290, 291, 292, 294, 297, 298, 300, 302, 303, 307, 309, 311, 314, 321, 322, 323, 325, 326, 327, 330, 331, 337, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 62, 63, 6

Unnamed: 0,Translated Questions
0,The address sheet was created by mistake If RE...
1,(Answer in clear)……………………………… Go to VALIDF If ...
2,Unusual and uncertain until the end of data co...
3,"» if BS > 0 (Y14) According to you, is the acc..."
4,A leisure or holiday residence?
...,...
731,"If SITIND = (B), (C), (D) or (SITIND = (A) and..."
732,"No NSP REFUSAL If EXRMI=1 and if CAISSALAR =1,..."
733,Has this benefit been paid?
734,To a person outside the current household DK R...


### Beginning of NLP Code


In [None]:
import pdfplumber
import pandas as pd
from deep_translator import GoogleTranslator
import PyPDF2
import re
import operator
import numpy as np

import nltk
nltk.download('punkt')
nltk.download('wordnet')

from nltk.corpus import stopwords
    
pdf = pdfplumber.open("france.pdf")

nltk.download('stopwords')
stop_words= stopwords.words('english') # can be changed to ('french')


from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()



def flatten_list(_2d_list):
    flat_list = []
    # Iterate through the outer list
    for element in _2d_list:
        if type(element) is list:
            # If the element is of type list, iterate through the sublist
            for item in element:
                flat_list.append(item)
        else:
            flat_list.append(element)
    return flat_list




#function to test different regex expressions
#so findall definitely works for matching aceented characters 
#problem is with the pdf extractor that we are using
def test_regex():
    matched = (re.findall('é','hé'))
    print("matched",matched)



#translator function - given an array of lines, translate each line in the array, add to array of translated lines,
#and return 
def translator(lines):
    translated_array  = []
    for i in lines:
        to_translate = i 
        translated  = GoogleTranslator(source='auto',target='en').translate(i)
        translated_array.append(translated)
    return translated_array 






#cleans text from any whitesace and can later be used to remove punctuation if necessary 
def clean(text):
    text = re.sub('\n','',str(text))
    text = re.sub('\n',' ',str(text))
    #removing punctuation 
    #text = re.sub(r'[^\w\s]','',text)
    return text 


#contains the kwyrods for each poverty question and translates them into the target language - change later so that 
#you can change the language we are using 
def keywords():
    questions_to_keywords={
    "holiday":"Can your whole household afford to go for a week’s annual holiday, away from home?",
    "vegetarian":"Can your household afford a meal with meat, chicken, fish(or vegetarian equivalent)?",
    "expense": "Can your household afford an unexpected required expense(amount to be filled) and pay through its own resources?",
    "telephone":"Does your household have a telephone(fixed landline or mobile)?",
    "colour TV":"Does your household have a color TV?",
    "washing machine":"Does the household have a washing machine? ",
    "van":"Does your household have a car/van for private use? ",
    "dwelling":"Do you have any of the following problems with your dwelling/accommodation? ",
    "warm":"Can your household afford to keep its home adequately warm?  "
    } 
    translated_keywords_dict = defaultdict()
    for key in questions_to_keywords.keys():
        translated_keywords_dict[GoogleTranslator(source='en', target='french').translate(key)] = []
    return translated_keywords_dict



    
#iterates through an array which contains page numbers, extracts each quesiton from that page, translates them into english,
#adds to an array, cleans data and adds to final array 
def translate_document(pages):
    pdf1 = pdfplumber.open("france.pdf")
    translated_array = []
    #pages = list(pages)
    # writing page 161 will translate page 162
    pages= pages
    for number in pages:
        p1 = pdf1.pages[number]
        im = p1.to_image()
        text = p1.extract_text()
        text = clean(text)
        #text = re.split('[?]',text)  
        text = re.findall('(?<=[\?\.\!]\s)[^\?\n\.]+?\?',text)
        clean_sent  = []
        for sent in text:
            clean_sent.append(sent) 
        #translated_array.append(clean_sent)
        translated_array.append(translator(clean_sent))
    return translated_array

 

def main():
    translated_keywords = keywords().keys()
    pages = []
    pdf = pdfplumber.open("france.pdf")
    for word in translated_keywords:
        word = word.lower()
        for i in range(0,len(pdf.pages)):
            page_number = pdf.pages[i]
            Text = page_number.extract_text()
            if re.findall(word,Text,re.IGNORECASE):
                pages.append(i)
    print(pages)
    clean_translations = flatten_list(translate_document(pages))
    #d = {'Translated Questions':translate_document(pages)}
    d = {'Translated Questions':clean_translations}
    DftranslatedDoc=pd.DataFrame(data =d)
    display(DftranslatedDoc) 
    DftranslatedDoc.to_csv('out_translation.csv',index=False) 


main()


lemmatizier= WordNetLemmatizer()
for index,row in DftranslatedDoc.iterows():
    filttered_sentence=[]
    sentence = row['col1']
    sentence = re.sub(r'[^\w\s]','',sentence)
    words=nltk.word_tokenize(sentence)
    words=[ w for rw in words if not w in stop_words]
    for word in words:
        filttered_sentence.append(lemmatizer.lemmatize(word))
    #print(filttered_sentence)
    data.ix[index,'col1'] = filttered_sentence
        

      
    

### after finding questions on relevant pages, translates only the qusetions that contain the keywords we're looking for 

In [1]:
from collections import defaultdict
import pdfplumber
import pandas as pd
from deep_translator import GoogleTranslator
import PyPDF2
import re
import operator 
import numpy as np

def flatten_list(_2d_list):
    flat_list = []
    # Iterate through the outer list
    for element in _2d_list:
        if type(element) is list:
            # If the element is of type list, iterate through the sublist
            for item in element:
                flat_list.append(item)
        else:
            flat_list.append(element)
    return flat_list




#function to test different regex expressions
#so findall definitely works for matching aceented characters 
#problem is with the pdf extractor that we are using
def test_regex():
    matched = (re.findall('é','hé'))
    print("matched",matched)



#translator function - given an array of lines, translate each line in the array, add to array of translated lines,
#and return 
def translator(lines):
    translated_array  = []
    for i in lines:
        to_translate = i 
        translated  = GoogleTranslator(source='auto',target='en').translate(i)
        translated_array.append(translated)
    return translated_array 






#cleans text from any whitesace and can later be used to remove punctuation if necessary 
def clean(text):
    text = re.sub('\n','',str(text))
    text = re.sub('\n',' ',str(text))
    #removing punctuation 
    #text = re.sub(r'[^\w\s]','',text)
    return text 


#contains the kwyrods for each poverty question and translates them into the target language - change later so that 
#you can change the language we are using 
def keywords():
    questions_to_keywords={
    "holiday":"Can your whole household afford to go for a week’s annual holiday, away from home?",
    "vegetarian":"Can your household afford a meal with meat, chicken, fish(or vegetarian equivalent)?",
    "expense": "Can your household afford an unexpected required expense(amount to be filled) and pay through its own resources?",
    "telephone":"Does your household have a telephone(fixed landline or mobile)?",
    "colour TV":"Does your household have a color TV?",
    "washing machine":"Does the household have a washing machine? ",
    "van":"Does your household have a car/van for private use? ",
    "dwelling":"Do you have any of the following problems with your dwelling/accommodation? ",
    "warm":"Can your household afford to keep its home adequately warm?  "
    } 
    translated_keywords_dict = defaultdict()
    for key in questions_to_keywords.keys():
        translated_keywords_dict[GoogleTranslator(source='en', target='french').translate(key)] = []
    return translated_keywords_dict



    
#iterates through an array which contains page numbers, extracts each quesiton from that page, translates them into english,
#adds to an array, cleans data and adds to final array 
def translate_document(pages):
    pdf1 = pdfplumber.open("france.pdf")
    translated_array = []
    #pages = list(pages)
    # writing page 161 will translate page 162
    pages= pages
    keyword_list = keywords().keys()
    for number in pages:
        p1 = pdf1.pages[number]
        im = p1.to_image()
        text = p1.extract_text()
        text = clean(text)
        #text = re.split('[?]',text)  
        text = re.findall('(?<=[\?\.\!]\s)[^\?\n\.]+?\?',text)
        for sentence in text:
            y = any(x in sentence for x in keyword_list)
            if not y:
                text.remove(sentence)
        clean_sent  = []
        for sent in text:
            clean_sent.append(sent) 
        #translated_array.append(clean_sent)
        translated_array.append(translator(clean_sent))
    return translated_array




def main():
    translated_keywords = keywords().keys()
    pages = []
    pdf = pdfplumber.open("france.pdf")
    for word in translated_keywords:
        word = word.lower()
        for i in range(0,len(pdf.pages)):
            page_number = pdf.pages[i]
            Text = page_number.extract_text()
            if re.findall(word,Text,re.IGNORECASE):
                pages.append(i)
    print(pages)
    clean_translations = flatten_list(translate_document(pages))
    #d = {'Translated Questions':translate_document(pages)}
    d = {'Translated Questions':clean_translations}
    DftranslatedDoc=pd.DataFrame(data =d)
    display(DftranslatedDoc) 
    DftranslatedDoc.to_csv('out_translation.csv',index=False) 

main()



[6, 8, 14, 18, 29, 31, 32, 41, 43, 45, 48, 55, 57, 58, 184, 188, 254, 274, 334, 336, 183, 186, 151, 185, 254, 258, 259, 261, 262, 3, 4, 12, 34, 107, 108, 109, 191, 332, 106, 107, 2, 4, 5, 6, 7, 14, 17, 18, 23, 25, 28, 30, 31, 32, 34, 35, 36, 37, 39, 40, 41, 43, 44, 45, 46, 48, 52, 55, 56, 57, 58, 63, 64, 69, 70, 71, 72, 73, 74, 77, 83, 84, 85, 86, 87, 88, 90, 91, 97, 98, 99, 105, 117, 124, 125, 126, 127, 128, 130, 131, 132, 134, 153, 157, 164, 165, 170, 173, 177, 179, 180, 185, 189, 194, 196, 197, 198, 199, 200, 201, 203, 206, 212, 218, 227, 228, 232, 241, 251, 252, 254, 255, 257, 258, 259, 260, 261, 262, 263, 270, 271, 272, 273, 274, 275, 279, 281, 283, 287, 288, 290, 291, 292, 294, 297, 298, 300, 302, 303, 307, 309, 311, 314, 321, 322, 323, 325, 326, 327, 330, 331, 337, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 62, 63, 6

Unnamed: 0,Translated Questions
0,(Answer in clear)……………………………… Go to VALIDF If ...
1,"» if BS > 0 (Y14) According to you, is the acc..."
2,A leisure or holiday residence?
3,Occasional accommodation for studies or work?
4,Otherwise: TYPMEN Occupation of the same accom...
...,...
418,Other benefits in kind (free or preferential r...
419,"» If AVNAT=5, NSP or REFUSAL, go to FINSAL If ..."
420,"If SITIND = (B), (C), (D) or (SITIND = (A) and..."
421,"No NSP REFUSAL If EXRMI=1 and if CAISSALAR =1,..."
