# DATA 601 Introduction to Data Science (03.7420) SP2020
## week 9: removing punctuation and stop words from a corpus
### Assignment Content

After extracting the .zip contents, use the "glob" module to collect the file names into a list.

For each file, remove punctuation and stop words

Produce a single .dat file containing the name of each file in quotes, a colon, then a list of words separated by commas. The list of words per file should be unique for that file. Do not include URLs or phone numbers. Words should be made lowercase. 

Example output:

"File 1.txt" : word1, word2, word3, word7 "name of file.docx" : word8, word2, word1, word10 "another file.doc" : word1, word12, word6

In [1]:
from zipfile import ZipFile
from nltk.corpus import stopwords
import glob
import string
import docx
import re

# Unpack every member of the provided archive into the current working directory.
with ZipFile('week_10_txt_and_docx.zip', 'r') as archive:
    archive.extractall()

# Glob patterns for the document types of interest: plain text and Word files.
folder_files = ['*.txt', '*.doc*']

# Collect the matching file names, pattern by pattern, into one flat list.
listed_files = [name for pattern in folder_files for name in glob.glob(pattern)]

# Bare expression so the notebook displays the resulting list.
listed_files

['52256-0.txt',
 '53031-0.txt',
 '58108-0.txt',
 'blind_text.txt',
 'dr_yawn.txt',
 'how_rubber_goods_are_made.txt',
 'most_boring_ever.txt',
 'most_boring_part2.txt',
 'pg12814.txt',
 'pg14895.txt',
 'pg43994.txt',
 'random_text.txt',
 'smiley_the_bunny.txt',
 'week_10_document1.docx',
 'week_10_document2.docx']

In [2]:
# Build the stop-word set from the English list in nltk.corpus.stopwords,
# extended with a long all-dash token so that token is filtered out as well.
stop_words = set(stopwords.words('english')) | {
    '------------------------------------------------------------------------',
}

### Find the output in the generated .dat file "Removing_punctuation_corpus.dat"

In [3]:
# Write one line per input file to 'Removing_punctuation_corpus.dat' in the form
#     "file name" : word1, word2, word3
# where the words are lowercase, unique per file, free of punctuation, digits,
# URLs and stop words, and listed in order of first appearance.

# Curly quotes are not part of string.punctuation, so they are listed explicitly.
_CURLY_QUOTES = '“”‘’'
_EDGE_CHARS = string.punctuation + _CURLY_QUOTES

# One translation table removes ASCII punctuation, curly quotes and ASCII digits
# in a single pass. Dropping digits is also what removes phone numbers.
_STRIP_TABLE = str.maketrans('', '', _EDGE_CHARS + string.digits)

# Searched anywhere in a token so URLs wrapped in punctuation, e.g. "(http://x)",
# are caught too. Tokens are lowercased before this pattern is applied.
_URL_PATTERN = re.compile(r'https?://|www\.')


def _read_text(path):
    """Return the text of a .txt or .docx file, or None for any other type."""
    if path.endswith('.txt'):
        # The context manager closes the handle as soon as the text is read.
        with open(path, 'r', encoding='utf8') as handle:
            return handle.read()
    if path.endswith('.docx'):
        document = docx.Document(path)
        return " ".join(paragraph.text for paragraph in document.paragraphs)
    return None


def _unique_clean_words(text, excluded):
    """Return the cleaned, unique words of *text* in order of first appearance.

    A token is dropped when it looks like a URL, when it is in *excluded*
    (before or after punctuation stripping), or when fewer than three
    characters remain after punctuation and digits are removed.
    """
    seen = {}
    for token in text.lower().split():
        if _URL_PATTERN.search(token):
            continue
        # Contractions such as "don't" can only match the stop-word list while
        # the apostrophe is still present, so test once before stripping.
        if token.replace('’', "'").strip(_EDGE_CHARS) in excluded:
            continue
        word = token.translate(_STRIP_TABLE)
        # Deduplicate AFTER cleaning so "word," and "word" count as one entry.
        if len(word) > 2 and word not in excluded:
            seen[word] = None
    return list(seen)


# 'w' rather than 'a+': re-running the cell replaces the file instead of
# appending a second copy of every line.
with open('Removing_punctuation_corpus.dat', 'w', encoding='utf8') as dat_file:
    for file_name in listed_files:
        text = _read_text(file_name)
        if text is None:
            # The '*.doc*' glob can also match legacy .doc files, which cannot
            # be read here; skip them instead of reusing the previous file's text.
            print('Skipping unsupported file: {}'.format(file_name))
            continue
        words = _unique_clean_words(text, stop_words)
        dat_file.write('"{}" : {}\n'.format(file_name, ", ".join(words)))