In [1]:
# importing the libraries
import nltk
from nltk.corpus import stopwords
import re
import os

# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer
# package and the 'stopwords' corpus.
nltk.download('punkt')
nltk.download('stopwords')
# English stopword list as a set, so the membership test used when filtering
# stopwords is a constant-time lookup.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# unzip the dataset archive (shell escape); per the cell output the files are
# inflated into a CSE508_Winter2023_Dataset/ folder
!unzip '/content/CSE508_Winter2023_Dataset.zip'

Archive:  /content/CSE508_Winter2023_Dataset.zip
  inflating: CSE508_Winter2023_Dataset/cranfield0001  
  inflating: CSE508_Winter2023_Dataset/cranfield0002  
  inflating: CSE508_Winter2023_Dataset/cranfield0003  
  inflating: CSE508_Winter2023_Dataset/cranfield0004  
  inflating: CSE508_Winter2023_Dataset/cranfield0005  
  inflating: CSE508_Winter2023_Dataset/cranfield0006  
  inflating: CSE508_Winter2023_Dataset/cranfield0007  
  inflating: CSE508_Winter2023_Dataset/cranfield0008  
  inflating: CSE508_Winter2023_Dataset/cranfield0009  
  inflating: CSE508_Winter2023_Dataset/cranfield0010  
  inflating: CSE508_Winter2023_Dataset/cranfield0011  
  inflating: CSE508_Winter2023_Dataset/cranfield0012  
  inflating: CSE508_Winter2023_Dataset/cranfield0013  
  inflating: CSE508_Winter2023_Dataset/cranfield0014  
  inflating: CSE508_Winter2023_Dataset/cranfield0015  
  inflating: CSE508_Winter2023_Dataset/cranfield0016  
  inflating: CSE508_Winter2023_Dataset/cranfield0017  
  inflating: CSE

# **(i) Relevant Text Extraction**

In [3]:
# Folder holding the extracted dataset; the extraction and preprocessing
# steps in this notebook read and rewrite the files in this folder in place.
DIRECTORY_PATH = '/content/CSE508_Winter2023_Dataset'

In [4]:

# method to extract the content between the given tag
def find_tag_data(data, tag):
  """
    Return every piece of text enclosed by <tag>...</tag> in `data`.

    Input:
      data - string to search
      tag - tag name (without angle brackets) whose content is extracted
    Output: list of strings, one per non-overlapping match in order of
      appearance; an empty list when the tag does not occur.
  """
  # re.escape keeps a tag name containing regex metacharacters literal.
  tag_name = re.escape(tag)
  reg_str = "<" + tag_name + ">(.*?)</" + tag_name + ">"
  # re.DOTALL lets '.' cross newlines, so a tag body spanning several lines
  # is matched even when the caller has not flattened the text first.
  extracted_text = re.findall(reg_str, data, flags=re.DOTALL)
  return extracted_text


# method to read a file, and extract the content between title and text tag
def read_file(path):
  """
    Read one document and pull out its TITLE and TEXT sections.

    Input:
      path - file path
    Output: (title, text) - the values returned by find_tag_data for the
      'TITLE' and 'TEXT' tags respectively.
  """
  # `with` guarantees the handle is closed even if reading raises.
  with open(path, 'r') as f:
    # newlines are replaced by spaces so each tag body becomes a single line
    file_data = f.read().replace('\n', ' ')

  title = find_tag_data(file_data, 'TITLE')
  text = find_tag_data(file_data, 'TEXT')

  return title, text


# utility method to print first five files from the given directory path
def print_first_five_files(directory):
  """
    Print the content of the first five entries of `directory` (taken in
    sorted name order), each followed by a separator line.
    Input: directory - path of the folder to preview
    Output: None
  """
  separator = '-'*20 + ' NEXT FILE ' + '-'*20
  for name in sorted(os.listdir(directory))[:5]:
    with open(directory + '/' + name, 'r') as handle:
      contents = handle.read()
    print(contents)
    print(separator)


# method to overwrite the file at the given path with the given content
def update_file(path, data):
  """
    Replace the whole content of the file at `path` with `data`.
    Input:
      path - file path, opened in 'w' mode
      data - string to write
    Output: True once the write has completed
  """
  with open(path, 'w') as out:
    out.write(data)
  return True

In [5]:
# method to find relevant title and text from all the files
# and updating each file with the relevant data.
def extract_relevant_data(directory):
  """
    Rewrite every document in `directory` as "<title> <text>", i.e. the
    first TITLE match and the first TEXT match joined by a single space.

    Entries that are not regular files, and files in which neither tag is
    found (for example files already rewritten by an earlier run), are left
    untouched and counted as skipped instead of raising IndexError.

    Input: directory - path of the folder holding the documents
    Output: None; the first five files (before and after) and the counts
      are printed.
  """
  print("Printing the First five files before processing. \n")
  print_first_five_files(directory)

  update_count = 0
  skipped_count = 0
  for file in os.listdir(directory):
    path = os.path.join(directory, file)
    if not os.path.isfile(path):
      skipped_count += 1
      continue

    title, text = read_file(path)
    if not title and not text:
      # nothing to extract: overwriting would only destroy the file content
      skipped_count += 1
      continue

    # a missing tag contributes an empty string rather than an IndexError
    first_title = title[0] if title else ''
    first_text = text[0] if text else ''
    final_text = first_title + ' ' + first_text

    if update_file(path, final_text):
      update_count += 1

  print("\nPrinting the First five files after processing. \n")
  print_first_five_files(directory)

  print("\nFiles Updated: ", update_count)
  if skipped_count:
    print("Files Skipped: ", skipped_count)

In [6]:
# Rewrites every dataset file in place, keeping only its title and text.
extract_relevant_data(DIRECTORY_PATH)

Printing the First five files before processing. 

<DOC>
<DOCNO>
1
</DOCNO>
<TITLE>
experimental investigation of the aerodynamics of a
wing in a slipstream .
</TITLE>
<AUTHOR>
brenckman,m.
</AUTHOR>
<BIBLIO>
j. ae. scs. 25, 1958, 324.
</BIBLIO>
<TEXT>
  an experimental study of a wing in a propeller slipstream was
made in order to determine the spanwise distribution of the lift
increase due to slipstream at different angles of attack of the wing
and at different free stream to slipstream velocity ratios .  the
results were intended in part as an evaluation basis for different
theoretical treatments of this problem .
  the comparative span loading curves, together with supporting
evidence, showed that a substantial part of the lift increment
produced by the slipstream was due to a /destalling/ or boundary-layer-control
effect .  the integrated remaining lift increment,
after subtracting this destalling lift, was found to agree
well with a potential flow theory .
  an empirical evaluati

# **(ii) Preprocessing**

***a) Lowercase the text***

In [7]:
# utility method to lowercase the content of a file with given
# path and update the file with same content in lower case.
def to_lowercase_and_update(path):
  """
    Lowercase a file in place.
    Input: path of the file
    Output: True after the file has been rewritten
  """
  # load the current content
  with open(path, 'r') as source:
    original = source.read()

  lowered = original.lower()

  # overwrite the file with its lowercase version
  with open(path, 'w') as target:
    target.write(lowered)

  return True

***b) Perform tokenization***

In [8]:
# utility method to tokenize the content of a file with given
# path and update the file with the tokens.
def tokenize_and_update(path):
  """
    Tokenize a file in place with nltk.word_tokenize.
    Every token is written back wrapped in single quotes and followed by
    one space, e.g. 'wing' 'in' 'a' .
    Input: path of the file
    Output: True after the file has been rewritten
  """
  with open(path, 'r') as source:
    text = source.read()
    tokens = nltk.word_tokenize(text)

  # build the quoted token stream once, then write it in a single call
  quoted = "".join("'" + token + "'" + " " for token in tokens)
  with open(path, 'w') as target:
    target.write(quoted)

  return True

***c) Remove stopwords***

In [9]:
# utility method to remove stopwords from a file with the given path.
def remove_stopwords_and_update(path):
  """
    Drop stopwords from a file in place.

    All apostrophes are stripped from the content, the result is split on
    whitespace, and every word not present in the module-level `stop_words`
    set is written back wrapped in single quotes and followed by a space.

    NOTE(review): stripping every apostrophe also alters tokens that contain
    one themselves (e.g. "'s" becomes "s") before the stopword lookup.

    Input: path of the file
    Output: True after the file has been rewritten
  """
  # `with` closes the handle even if reading raises
  with open(path, 'r') as f:
    data = f.read().replace("'", "")

  without_stopwords = [word for word in data.split() if word not in stop_words]

  with open(path, 'w') as f:
    for word in without_stopwords:
      f.write("'" + word + "'" + " ")

  return True

***d) Remove punctuation***

In [10]:
# utility method to remove punctuations from a file with the given path.
def remove_punctuations_and_update(path):
  """
    Strip punctuation from a file in place.

    All apostrophes are removed first, then every character that is neither
    a word character (\\w, which includes '_') nor whitespace is deleted.
    The remaining whitespace-separated words are written back wrapped in
    single quotes and followed by a space; tokens that consisted only of
    punctuation therefore disappear.

    Input: path of the file
    Output: True after the file has been rewritten
  """
  # `with` closes the handle even if reading raises
  with open(path, 'r') as f:
    data = f.read().replace("'", "")

  removed_punctuations = re.sub(r'[^\w\s]', '', data)

  with open(path, 'w') as f:
    for word in removed_punctuations.split():
      f.write("'" + word + "'" + " ")

  return True

***e) Remove blank space tokens***

In [11]:
# utility method to remove blank tokens from a file with the given path.
def remove_blank_tokens_and_update(path):
  """
    Remove blank tokens from a file in place.

    All apostrophes are stripped, the content is split on whitespace, and
    the resulting words are written back joined by single spaces (no
    quotes, no leading or trailing whitespace).

    Input: path of the file
    Output: True after the file has been rewritten
  """
  # `with` closes the handle even if reading raises
  with open(path, 'r') as f:
    data = f.read().replace("'", "")

  # split() with no argument discards empty pieces, so blank tokens vanish
  processed_data = " ".join(data.split())

  with open(path, 'w') as f:
    f.write(processed_data)

  return True

In [15]:
# method to process the directory files with a given operation
def process_files(directory, operation):
  """
    Apply one preprocessing operation, in place, to every regular file in
    `directory`, printing the first five files before and after.

    Input:
      directory - path of the folder holding the documents
      operation - one of 'to_lowercase', 'tokenize', 'remove_stopwords',
        'remove_punctuations', 'remove_blank_tokens'
    Output: None; the number of files actually processed is printed.
    Raises: ValueError when `operation` is not one of the names above
      (previously such a call silently did nothing).
  """
  # operation name -> function that rewrites a single file
  operations = {
    'to_lowercase': to_lowercase_and_update,
    'tokenize': tokenize_and_update,
    'remove_stopwords': remove_stopwords_and_update,
    'remove_punctuations': remove_punctuations_and_update,
    'remove_blank_tokens': remove_blank_tokens_and_update,
  }
  if operation not in operations:
    raise ValueError(
      "Unknown operation: " + repr(operation)
      + "; expected one of " + ", ".join(sorted(operations))
    )
  handler = operations[operation]

  print("Printing the First five files before processing. \n")
  print_first_five_files(directory)

  update_count = 0
  for file in os.listdir(directory):
    path = os.path.join(directory, file)

    # sub-directories and other non-files are neither touched nor counted
    if not os.path.isfile(path):
      continue

    handler(path)
    update_count += 1

  print("\nPrinting the First five files after processing. \n")
  print_first_five_files(directory)

  print("\nFiles Updated: ", update_count)

In [16]:
# main
if __name__ == '__main__':
  # run the preprocessing stages in order; each stage rewrites the files
  # that the previous stage produced
  for stage in ('to_lowercase', 'tokenize', 'remove_stopwords',
                'remove_punctuations', 'remove_blank_tokens'):
    process_files(DIRECTORY_PATH, stage)

Printing the First five files before processing. 

 experimental investigation of the aerodynamics of a wing in a slipstream .     an experimental study of a wing in a propeller slipstream was made in order to determine the spanwise distribution of the lift increase due to slipstream at different angles of attack of the wing and at different free stream to slipstream velocity ratios .  the results were intended in part as an evaluation basis for different theoretical treatments of this problem .   the comparative span loading curves, together with supporting evidence, showed that a substantial part of the lift increment produced by the slipstream was due to a /destalling/ or boundary-layer-control effect .  the integrated remaining lift increment, after subtracting this destalling lift, was found to agree well with a potential flow theory .   an empirical evaluation of the destalling effects was made for the specific configuration of the experiment . 
-------------------- NEXT FILE ---