# Project about 'en'
This notebook converts PDF files to plain-text files, cleans the text by removing unnecessary annotations, and collects instances of 'en' into a table for annotation.

## Converting PDF to TXT format

In [None]:
# PDF converter
!pip install PyPDF2

In [None]:
# Run this cell when working in Google Colab with the data stored on Google Drive:
# it mounts the drive at /content/drive, which is the prefix used by the
# folder paths in the conversion and cleaning cells.
from google.colab import drive

drive.mount('/content/drive')

In [None]:
import PyPDF2
import re
import os
import pandas as pd

In [None]:
def pdf_to_txt(pdf_path, output_txt):
    """Extract the text of every page of a PDF and save it as a UTF-8 text file.

    Args:
        pdf_path: path of the PDF file to read.
        output_txt: path of the text file to write (overwritten if it exists).
    """
    # Extraction stays inside the `with` block so the PDF stream is still
    # open while the pages are being read.
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        # `or ''` is a defensive guard for a page that yields no text, so the
        # join below never sees a non-string value.
        page_texts = [page.extract_text() or '' for page in pdf_reader.pages]

    # Separate pages with a newline; gluing them together directly would fuse
    # the last word of one page with the first word of the next.
    text = '\n'.join(page_texts)

    # Write the extracted text to a text file
    with open(output_txt, 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)

In [None]:
# take pdf files from the corpus and convert them to txt files

corpus = '/content/drive/My Drive/en_project/corpus'

txt_dir = '/content/drive/My Drive/en_project/txt_files'

# make sure the output folder exists before anything is written into it
os.makedirs(txt_dir, exist_ok=True)

for entry in os.scandir(corpus):
  # skip sub-folders and anything that is not a PDF
  if not entry.is_file() or not entry.name.lower().endswith('.pdf'):
    continue

  # swap only the final extension; a regex like r'.pdf' has an unescaped dot
  # and no anchor, so it could also rewrite the middle of a file name
  stem, _ = os.path.splitext(entry.name)
  txt_path = os.path.join(txt_dir, stem + '.txt')

  pdf_to_txt(entry.path, txt_path)

## Cleaning up plain text files

In [None]:
# annotation conventions: https://applis.flsh.usherbrooke.ca/cfpq/index.php/site/afficher/verbal

def clean_data(old_file, new_file):
  """Strip transcription annotations from a raw text file and save the result.

  The whole file is read, line breaks are flattened to spaces, annotation
  markers are removed by a fixed sequence of regular-expression
  substitutions, and a line break is re-inserted before every speaker label
  (one uppercase letter followed by " :"), giving one speaker turn per line.

  Annotation conventions:
  https://applis.flsh.usherbrooke.ca/cfpq/index.php/site/afficher/verbal

  Args:
    old_file: path of the raw UTF-8 text file to read.
    new_file: path of the cleaned UTF-8 text file to write (overwritten).
  """
  with open(old_file, 'r', encoding='utf-8') as txt_file:
    text = txt_file.read()

  # flatten line breaks so that patterns can match across the original lines
  text = re.sub(r'\n+', ' ', text)

  # remove extraneous content (the order of these substitutions matters)
  text = re.sub(r'\(.*?\)', '', text) # removes all content in parentheses
  text = re.sub(r'\<P\d*,L\d*\>', '', text) # removes all <P#,L#> markers
  # removes overlap openers: "[" plus optional spaces plus ONE digit
  # NOTE(review): a closing "]" and any further digits are left in place - confirm that is intended
  text = re.sub(r'\[\s*\d', '', text)
  text = re.sub(r'(?<=[a-zA-Z}]);(?=[a-zA-Z}])', '', text) # removes semicolons between letters/braces
  # removes colons that directly follow a letter or another colon; the colon
  # of a speaker label ("G :") is preceded by a space and therefore survives
  text = re.sub(r'(?<=[a-zA-ZÀ-ÿ:]):', '', text)
  text = re.sub(r'<([a-zA-Z]+)<', '', text) # removes volume and speed markers
  # removes the <number> markers introduced by the currency sign (U+00A4);
  # must run before the angle brackets are stripped below
  text = re.sub(r'\u00A4\s*<\d*>', '', text)
  # removes slashes, backslashes, curly braces, angle brackets, degree signs, bullets, arrows
  text = re.sub(r'[\\/{}<>\u00B0\u2022\u2191\u2193]', '', text)

  # collapse the runs of whitespace left behind by the deletions
  text = re.sub(r'\s{2,}', ' ', text)
  # start a new line at every speaker label
  text = re.sub(r'([A-ZÀ-Ü] :)', r'\n\1', text)

  text = text.strip() # drop leading/trailing whitespace (incl. the newline added before the first label)

  with open(new_file, 'w', encoding='utf-8') as txt_file:
    txt_file.write(text)

In [None]:
# take raw txt files from the corpus and clean them up
dataset = '/content/drive/My Drive/en_project/txt_files'

txt_dir = '/content/drive/My Drive/en_project/cleaned_txt_files'

# make sure the output folder exists before anything is written into it
os.makedirs(txt_dir, exist_ok=True)

for entry in os.scandir(dataset):
  # skip sub-folders and anything that is not a .txt file
  if not entry.is_file() or not entry.name.lower().endswith('.txt'):
    continue

  # append "_clean" before the final extension only; a regex like r'.txt' has
  # an unescaped dot and no anchor, so it could also rewrite the middle of a name
  stem, _ = os.path.splitext(entry.name)
  clean_path = os.path.join(txt_dir, stem + '_clean.txt')

  clean_data(entry.path, clean_path)

## Finding 'en'
Find instances of 'en' in a cleaned text file and output them to a spreadsheet with the following column headers:
* File name
* Text
* Information status
* Il y a en?
* Precise quantity?
* Anaphor?
* Answer to a question?
* Polarity?
* Locative?
* Comments

In [None]:
# scan a cleaned file for instances of 'en'
# (TODO: only the single test file below is scanned so far, not every cleaned file)
# if found, save the entire instance (one line = one speaker turn) as text
# and record the file name as well
# the remaining annotation columns start out as empty strings

annotation_columns = ['file_name',
                      'text',
                      'info',
                      'exists',
                      'quantity',
                      'anaphor',
                      'answer',
                      'polarity',
                      'locative',
                      'comments']

file_path = '/content/clean_test.txt'

with open(file_path, 'r', encoding='utf-8') as clean_text:
  # \b...\b keeps only the standalone word, so "ben" or "prends" do not match;
  # rstrip() drops the trailing space/newline so spreadsheet cells stay on one line
  en_lines = [line.rstrip() for line in clean_text if re.search(r'\ben\b', line)]

# every column after file_name and text is left blank for manual annotation
blank_annotations = dict.fromkeys(annotation_columns[2:], '')

# build the frame once from a list of records instead of growing it row by row
annotations_df = pd.DataFrame(
    [{'file_name': file_path, 'text': en_line, **blank_annotations} for en_line in en_lines],
    columns=annotation_columns,
)

In [None]:
# preview the first rows to sanity-check the extraction
annotations_df.head()

Unnamed: 0,file_name,text,info,exists,quantity,anaphor,answer,polarity,locative,comments
0,/content/clean_test.txt,G : ben qu’est -ce qui arrive si tu en as deux...,,,,,,,,
1,/content/clean_test.txt,A : c- c’était PLEIN il doit y en avoir qui on...,,,,,,,,
2,/content/clean_test.txt,G : euh ben il y a un peu toutes sortes d’affa...,,,,,,,,
3,/content/clean_test.txt,G : mais là t’as des t’ en avais soiXANTE que ...,,,,,,,,
4,/content/clean_test.txt,M : OUI moi j’ en prends deux euh m’asde même \n,,,,,,,,
