In [None]:
# Cleaner version 1.1
# Date - 10/3/2023

from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer
import string
import os
import glob



def read_articles(pattern):
    """Read every regular file matching ``pattern`` and return its text.

    Args:
        pattern: A ``glob`` pattern selecting the article files.

    Returns:
        A ``(paths, texts)`` tuple of two equal-length lists. ``paths`` is
        sorted and ``texts[i]`` holds the content of ``paths[i]``.
    """
    # glob gives no ordering guarantee. The save step at the end of the
    # notebook numbers the output files by list position, so sort the paths to
    # keep that numbering identical on every run and platform.
    paths = sorted(path for path in glob.glob(pattern) if os.path.isfile(path))

    texts = []
    for path in paths:
        # errors='replace' swaps undecodable bytes for U+FFFD instead of
        # raising UnicodeDecodeError, so one stray character in an article
        # cannot abort the whole run.
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            texts.append(f.read())
    return paths, texts


# Grabbing each article for each group ex. all the articles in group 1.
# Just have to change the folder name for each group.
# os.path.join keeps the pattern portable: a literal backslash is only a path
# separator on Windows.
files_path, all_files_text = read_articles(os.path.join('articles', '*_*.txt'))

all_files_text

In [None]:
# Get rid of garbage characters: turn newlines and tabs into spaces,
# then set the text to lower case, one article at a time
text = [
    article.replace('\n', ' ').replace('\t', ' ').lower()
    for article in all_files_text
]
text


In [None]:
# Split every article into tokens.
# NOTE: this rebinds `text` from a list of strings to a list of token lists,
# so re-run the previous cell before re-running this one.
text = list(map(word_tokenize, text))

# Attach a part-of-speech tag to every token of every article
tagged_text = list(map(pos_tag, text))

tagged_text

In [None]:
# Setup: the list the lemmatization loop appends to, and the lemmatizer itself
lemmatized_text = list()
lemmatizer = WordNetLemmatizer()


# Update the part of speech tags to be compatible with the lemmatizer
def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to the WordNet POS constant for the lemmatizer.

    Args:
        treebank_tag: Tag string as produced by ``pos_tag`` (e.g. ``'NNS'``,
            ``'VBD'``).

    Returns:
        ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or ``wordnet.ADV``
        chosen by the tag's first letter; ``wordnet.NOUN`` for every other
        tag, including an empty string.
    """
    # Only the first letter matters: J* adjectives, V* verbs, N* nouns,
    # R* adverbs.
    # There is deliberately no 'S' entry: the Treebank tag that starts with
    # 'S' is SYM (symbol), not a satellite adjective, so it must take the
    # noun fallback rather than wordnet.ADJ_SAT.
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return first_letter_to_pos.get(treebank_tag[:1], wordnet.NOUN)
    
# Lemmatize each (token, tag) pair, producing one list of lemmas per article
lemmatized_text.extend(
    [lemmatizer.lemmatize(token, get_wordnet_pos(tag)) for token, tag in tagged_sentence]
    for tagged_sentence in tagged_text
)

lemmatized_text

In [None]:
# create the stop list array (kept as a list, exactly as before, in case other
# cells consume it as one)
stops = list(set(stopwords.words('english'))) + [
    ',', '.', '-', 'however', 'ever', 'also', '?', '#', '@', '(', ')',
    "'s", "n't", '``', "''", "--", "cnn", "scrap", "image",
]

# Set versions of the lookups so each membership test is O(1)
_stop_lookup = frozenset(stops)
_punctuation_chars = frozenset(string.punctuation)
# Any token containing one of these is dropped: the apostrophe removes
# contractions, '.' also covers ".com", and '-', '/', '^' remove compound or
# markup fragments.
_banned_chars = frozenset("'-/.^")


def _keep_token(word):
    """Return True when ``word`` should survive cleaning.

    A token is rejected when it is a stop word, is at most one character long,
    contains a non-printable or non-ASCII character (this is what removes
    emojis), consists only of punctuation, contains a digit, or contains any
    of the characters ``' - / . ^``.
    """
    if len(word) <= 1 or word in _stop_lookup:
        return False
    if not word.isprintable() or any(ord(ch) > 127 for ch in word):
        return False
    # Test every character: `word in string.punctuation` is only a substring
    # test, so it let tokens such as '**' or '==' through.
    if all(ch in _punctuation_chars for ch in word):
        return False
    if any(ch.isdigit() or ch in _banned_chars for ch in word):
        return False
    return True


# Filter every article in a single pass and drop duplicate words.
# dict.fromkeys keeps the first occurrence of each word in its original
# position, so the result is the same on every run. A plain set() would order
# the words differently from run to run because string hashing is randomised.
text_no_stops = [
    list(dict.fromkeys(word for word in sentence if _keep_token(word)))
    for sentence in lemmatized_text
]

text_no_stops

In [None]:
# Topic labels that get appended to each output file name
topics = ['sports', 'food', 'tech', 'science', 'business', 'politics']

for index, words in enumerate(text_no_stops):
    number = index + 1
    # Single-digit article numbers use the template that carries the extra zero
    template = 'cleaned_050{}_' if number < 10 else 'cleaned_05{}_'
    # Articles 5, 10, 15 and 20 are labelled topics[2]; all others topics[3]
    topic = topics[2] if number in (5, 10, 15, 20) else topics[3]
    with open(template.format(number) + topic + '.txt', 'w') as f:
        f.write(' '.join(words))
    

def save_file_professor(text_no_stops, topic_names=None, out_dir=''):
    """Write one cleaned article per topic as ``cleaned_99NN_<topic>.txt``.

    The first article is paired with the first topic name, the second with the
    second, and so on. ``NN`` is the article's 1-based position, zero-padded to
    two digits. Articles beyond the number of topic names are not written.

    Args:
        text_no_stops: Sequence of word lists, one per article.
        topic_names: Topic labels in article order. Defaults to the
            notebook-level ``topics`` list.
        out_dir: Directory to write the files into. The default ``''`` writes
            to the current working directory.
    """
    names = topics if topic_names is None else topic_names
    # zip stops at the shorter input, so extra articles are skipped just as
    # they were when each position had its own hard-coded branch.
    for number, (words, topic) in enumerate(zip(text_no_stops, names), start=1):
        filename = 'cleaned_99{:02d}_{}.txt'.format(number, topic)
        # Explicit UTF-8 so the output does not depend on the platform's
        # default encoding.
        with open(os.path.join(out_dir, filename), 'w', encoding='utf-8') as f:
            f.write(' '.join(words))