# Project about 'en'
Notebook that converts the PDF transcripts of the corpus to plain text, cleans the text by removing transcription annotations, and collects every instance of 'en' into a spreadsheet for manual annotation.

## Converting PDF to TXT format

In [None]:
# PDF converter
!pip install PyPDF2

In [3]:
# Run this cell when working in Google Colab: it mounts Google Drive at
# /content/drive, the root of every project path used in this notebook.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import PyPDF2
import re
import os
import pandas as pd

In [5]:
def pdf_to_txt(pdf_path, output_txt):
    """Extract the text of every page of a PDF and save it as a UTF-8 text file.

    Pages are joined with a newline so that the last word of one page does
    not fuse with the first word of the next one.

    Args:
        pdf_path: path of the PDF file to read.
        output_txt: path of the text file to write (overwritten if it exists).
    """
    # Open the PDF file in read-binary mode
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        # Extraction stays inside the `with` block so the reader's file handle
        # is still open. `or ''` guards against a page that yields no text.
        page_texts = [page.extract_text() or '' for page in pdf_reader.pages]

    # Write the extracted text to a text file
    with open(output_txt, 'w', encoding='utf-8') as txt_file:
        txt_file.write('\n'.join(page_texts))

In [6]:
# take pdf files from the corpus and convert them to txt files

corpus = '/content/drive/My Drive/en_project/corpus'

txt_dir = '/content/drive/My Drive/en_project/txt_files'

# make sure the destination folder exists before writing into it
os.makedirs(txt_dir, exist_ok=True)

for f in os.scandir(corpus):
  # split on the real extension: a regex such as r'.pdf' has an unescaped,
  # unanchored dot and would also rewrite "pdf" in the middle of a name
  stem, ext = os.path.splitext(f.name)

  # skip sub-folders and anything that is not a PDF
  if not f.is_file() or ext.lower() != '.pdf':
    continue

  # `input` is deliberately not used as a variable name: it would shadow the built-in
  pdf_path = f.path
  txt_path = os.path.join(txt_dir, stem + '.txt')

  pdf_to_txt(pdf_path, txt_path)

## Cleaning up plain text files

In [32]:
# annotation conventions: https://applis.flsh.usherbrooke.ca/cfpq/index.php/site/afficher/verbal

def _strip_annotations(text):
  """Return `text` without transcription annotations, one speaker turn per line."""
  # flatten the page layout: every run of newlines becomes a single space
  text = re.sub(r'\n+', ' ', text)

  # remove extraneous content
  text = re.sub(r'\(.*?\)', '', text) # removes all content in parentheses
  text = re.sub(r'\[\s*\d', '', text) # removes the opening of overlap indicators ("[" followed by a digit)
  text = re.sub(r';', '', text) # removes semicolons
  text = re.sub(r'(?<=[a-zA-ZÀ-ÿ:]):', '', text) # removes colons glued to a letter or to another colon; " :" after a space is kept
  text = re.sub(r'<([a-zA-Z]+)<', '', text) # removes volume and speed markers
  text = re.sub(r'<.*?>', '', text) # removes everything in angle brackets
  text = re.sub(r'[\\/{}<>\u00B0\u2022\u2191\u2193\(\)]', '', text) # removes slashes, braces, angle brackets, parentheses, degree signs, bullets, arrows
  # NOTE(review): a closing "]" is not removed by any pattern here - confirm whether that is intended
  text = re.sub(r'SOUS\s*\-\s*CORPUS\s*\d* : segment \d*\.*\d*', '', text) # removes the specific SOUS-CORPUS markings every page
  text = re.sub(r'\d+', '', text) # remove any numbers that were missed (\d+ does not match the empty string at every position)

  # collapse the gaps left behind by the deleted markers
  text = re.sub(r'\s{2,}', ' ', text)

  # start a new line before every speaker label (uppercase letters followed by " :");
  # `+` requires at least one letter, so a bare " :" inside a turn does not split it
  text = re.sub(r'([A-ZÀ-Ü]+ :)', r'\n\1', text)

  return text.strip() # clean up leading/trailing whitespace


def clean_data(old_file, new_file):
  """Read a raw transcript, strip its annotations and write the cleaned text.

  Args:
    old_file: path of the raw UTF-8 text file to read.
    new_file: path of the cleaned UTF-8 text file to write (overwritten if it exists).
  """
  with open(old_file, 'r', encoding='utf-8') as txt_file:
    text = txt_file.read()

  with open(new_file, 'w', encoding='utf-8') as txt_file:
    txt_file.write(_strip_annotations(text))

In [33]:
# take raw txt files from the corpus and clean them up
dataset = '/content/drive/My Drive/en_project/txt_files'

txt_dir = '/content/drive/My Drive/en_project/cleaned_txt_files'

# make sure the destination folder exists before writing into it
os.makedirs(txt_dir, exist_ok=True)

for f in os.scandir(dataset):
  # split on the real extension: a regex such as r'.txt' has an unescaped,
  # unanchored dot and would also rewrite "txt" in the middle of a name
  stem, ext = os.path.splitext(f.name)

  # skip sub-folders and anything that is not a text file
  if not f.is_file() or ext.lower() != '.txt':
    continue

  # `input` is deliberately not used as a variable name: it would shadow the built-in
  raw_path = f.path
  clean_path = os.path.join(txt_dir, stem + '_clean.txt')

  clean_data(raw_path, clean_path)

## Finding 'en'
Find every line containing 'en' in each cleaned text file and write those lines to a spreadsheet with the following headers:
* File name
* Text
* Information status
* Il y a en?
* Precise quantity?
* Anaphor?
* Answer to a question?
* Polarity?
* Locative?
* Comments

In [34]:
# helper function for finding instances of en
def find_en(file_path, file_name, df):
  """Append one row to `df` for every line of a cleaned transcript that contains 'en'.

  Only 'file_name' and 'text' are filled in; the other annotation columns are
  set to empty strings.

  Args:
    file_path: path of the cleaned UTF-8 text file to scan.
    file_name: base name of that file; a trailing "_clean.txt" is dropped
      before the name is stored in the 'file_name' column.
    df: DataFrame that is modified in place. Each new row is written at label
      len(df), so a default 0..n-1 index is expected; the row keys are assumed
      to match the frame's columns.
  """
  # whole word only, lowercase: "entendu" or "bien" do not match
  en_word = re.compile(r'\ben\b')

  # the same for every row of this file, so computed once;
  # the dot is escaped and the pattern anchored to the end of the name
  source_name = re.sub(r'_clean\.txt$', '', file_name)

  with open(file_path, 'r', encoding='utf-8') as clean_text:
    for line in clean_text:
      if not en_word.search(line):
        continue

      df.loc[len(df)] = {
          'file_name': source_name,
          'text': line.strip(), # drop the trailing space/newline so it does not end up in the cell
          'info': '',
          'exists': '',
          'quantity': '',
          'anaphor': '',
          'answer': '',
          'polarity': '',
          'locative': '',
          'comments': ''
      }

In [35]:
# for each file
# scan through the file for instances of 'en'
# if found, save the entire instance (anything between two speaker patterns) as text
# and extract file name as well
# remaining columns remain empty strings

annotations_df = pd.DataFrame(columns = ['file_name',
                                         'text',
                                         'info',
                                         'exists',
                                         'quantity',
                                         'anaphor',
                                         'answer',
                                         'polarity',
                                         'locative',
                                         'comments'])

cleaned = '/content/drive/My Drive/en_project/cleaned_txt_files'

# sort by name: os.scandir yields entries in arbitrary order, and the row
# order of the spreadsheet should be the same on every run
for f in sorted(os.scandir(cleaned), key=lambda entry: entry.name):
  # skip sub-folders, as the conversion and cleaning loops do
  if not f.is_file():
    continue

  find_en(f.path, f.name, annotations_df)

In [None]:
# preview the first rows to check the extracted instances before exporting
annotations_df.head()

In [40]:
# rename column headers to prepare for writing to excel sheet
# (re-assigned rather than inplace=True: rename returns a new frame with the new headers)
# NOTE(review): the header list in the notebook text reads 'Text' and 'Polarity?',
# while this mapping writes 'Clean text' and 'Polarity' - confirm which spelling is intended
annotations_df = annotations_df.rename(columns = {'file_name': 'File name',
                                                  'text': 'Clean text',
                                                  'info': 'Information status',
                                                  'exists': 'Il y a en?',
                                                  'quantity': 'Precise quantity?',
                                                  'anaphor': 'Anaphor?',
                                                  'answer': 'Answer to a question?',
                                                  'polarity': 'Polarity',
                                                  'locative': 'Locative?',
                                                  'comments': 'Comments'
                                                  })

In [39]:
# export the annotation table to an Excel workbook (single sheet, no index column)
annotations_file = '/content/drive/My Drive/en_project/annotations.xlsx'

annotations_df.to_excel(
    excel_writer=annotations_file,
    index=False,
    sheet_name='Sheet1',
)