# week 9: removing punctuation and stop words from a corpus
I provide a .zip containing .txt and .docx files.

After extracting the .zip contents, use the `glob` module to collect the file names in a list.

For each file, remove punctuation and stop words.

Produce a single .dat file containing the name of each file in quotes, a colon, then a list of words separated by commas. The words listed for each file should be unique within that file. Do not include URLs or phone numbers. Words should be made lowercase.

Example output:

"File 1.txt" : word1, word2, word3, word7
"name of file.docx" : word8, word2, word1, word10
"another file.doc" : word1, word12, word6

In [129]:
import glob
import os
import zipfile
from collections import Counter
import re
from docx import Document
import string

In [130]:
# Where the assignment archive lives, and the folder its contents are
# unpacked into (both are absolute paths on the author's machine).
zip_path, extract_to = (
    '/Users/simranshah/Documents/Data 601 Assignments/week_10_txt_and_docx.zip',
    '/Users/simranshah/Documents/Data 601 Assignments/extracted_files',
)

In [141]:
# Open the archive read-only and unpack all of its members into the
# extraction folder; the context manager closes the archive afterwards.
with zipfile.ZipFile(file=zip_path, mode='r') as zip_ref:
    zip_ref.extractall(path=extract_to)

In [132]:
# Recursively list every path under the extraction folder whose name
# contains a dot, then keep only the plain-text and Word documents.
file_paths = glob.glob(extract_to + '/**/*.*', recursive=True)
txt_docx_files = list(
    filter(
        lambda path: path.endswith('.txt') or path.endswith('.docx'),
        file_paths,
    )
)

#### Removing punctuation and stop words, and building the list of unique words

In [133]:
pip install --user -U nltk

Note: you may need to restart the kernel to use updated packages.


In [134]:
import nltk

# Resources used later in the notebook: the English stop-word list and the
# Punkt sentence-tokenizer model that word_tokenize depends on.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases (3.9 and later) load the tokenizer from 'punkt_tab'
# rather than the pickled 'punkt' model. Because the cell above upgrades NLTK
# to the latest version, fetch it too so word_tokenize does not raise
# LookupError.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/simranshah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/simranshah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [135]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

https://www.geeksforgeeks.org/removing-stop-words-nltk-python/

In [136]:
# Accumulator mapping each file name to the list of unique words found in it.
results = dict()

In [137]:
# Matches web addresses so they can be removed from the raw text *before*
# tokenizing. Once the tokenizer splits a URL apart and punctuation is
# stripped, its pieces (e.g. "http", "wwwexamplecom") are indistinguishable
# from ordinary words and would leak into the output.
_URL_PATTERN = re.compile(r'(?:https?://|ftp://|www\.)\S+')


def process_file(file_path):
    """Return the unique, lowercased content words of one document.

    The text is read from a ``.txt`` file (decoded as UTF-8) or from the
    paragraphs of a ``.docx`` file, lowercased, stripped of URLs, tokenized
    with NLTK's ``word_tokenize``, and cleaned of punctuation, digits and
    English stop words.

    Args:
        file_path: Path to a ``.txt`` or ``.docx`` file. The extension check
            is case-insensitive.

    Returns:
        A sorted list of the distinct words in the file, so that repeated
        runs produce the same output.

    Raises:
        ValueError: If ``file_path`` has neither a ``.txt`` nor a ``.docx``
            extension.
    """
    lowered_path = file_path.lower()
    if lowered_path.endswith('.txt'):  # for text files
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    elif lowered_path.endswith('.docx'):  # for docx files
        from docx import Document
        doc = Document(file_path)
        text = '\n'.join(paragraph.text for paragraph in doc.paragraphs)
    else:
        # Fail with a clear message instead of an UnboundLocalError on `text`.
        raise ValueError(f'Unsupported file type: {file_path!r}')

    # Lowercase, drop URLs from the raw text, then tokenize.
    text = _URL_PATTERN.sub(' ', text.lower())
    tokens = word_tokenize(text)

    # Strip punctuation and digits from every token. Tokens that consisted
    # only of those characters (including the pieces of a phone number)
    # become empty strings and are rejected by the isalpha() test below.
    table = str.maketrans('', '', string.punctuation + string.digits)
    stripped = (token.translate(table) for token in tokens)

    # Keep purely alphabetic tokens that are not English stop words.
    stop_words = set(stopwords.words('english'))
    words = {word for word in stripped
             if word.isalpha() and word not in stop_words}

    return sorted(words)

https://stackoverflow.com/questions/47481398/os-path-basename-to-outfile

In [138]:
# Key each document's unique-word list by its bare file name (directory
# components removed) so the output shows only the name of the file.
for doc_path in txt_docx_files:
    results[os.path.basename(doc_path)] = process_file(doc_path)

https://stackoverflow.com/questions/69790359/how-to-create-and-print-results-into-a-new-dat-file

In [139]:
# Write the results to a .dat file inside the extraction folder, one line per
# document in the form: "<file name>" : word, word, ...
output_path = extract_to + '/output.dat'
with open(output_path, mode='w', encoding='utf-8') as dat_file:
    dat_file.writelines(
        f'"{doc_name}" : {", ".join(doc_words)}\n'
        for doc_name, doc_words in results.items()
    )

In [140]:
# Report where the generated .dat file was saved.
print(f'Output written to {output_path}')

Output written to /Users/simranshah/Documents/Data 601 Assignments/extracted_files/output.dat
