In [1]:
import nltk
import string

# used for looping through folders/files
from os import listdir
from os.path import isfile, join

#Calc tfidf and cosine similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# All text entries to compare will appear here
BASE_INPUT_DIR = "/content/inputData"

## Preprocess Data

#### File information

In [3]:
def returnListOfFilePaths(folderPath):
    fileInfo = []
    listOfFileNames = [fileName for fileName in listdir(folderPath) if isfile(join(folderPath, fileName))]
    listOfFilePaths = [join(folderPath, fileName) for fileName in listdir(folderPath) if isfile(join(folderPath, fileName))]
    fileInfo.append(listOfFileNames)
    fileInfo.append(listOfFilePaths)
    return fileInfo

fileNames, filePaths = returnListOfFilePaths(BASE_INPUT_DIR)
print(fileNames, "\n", filePaths)

['f3.txt', 'f1.txt', 'f2.txt'] 
 ['/content/inputData/f3.txt', '/content/inputData/f1.txt', '/content/inputData/f2.txt']


In [4]:
# Get document contents
def create_docContentDict(filePaths):
    rawContentDict = {}
    for filePath in filePaths:
        with open(filePath, "r") as ifile:
            fileContent = ifile.read()
        rawContentDict[filePath] = fileContent
    return rawContentDict
rawContentDict = create_docContentDict(filePaths)
print(rawContentDict)

{'/content/inputData/f3.txt': 'Anglo-French Channel Tunnel operator Eurotunnel on Monday announced a deal giving creditor banks 45.5 percent of the company in return for wiping out one billion pounds ($1.56 billion) of its debt mountain.\nThe long-awaited restructuring brings to an end months of wrangling between Eurotunnel and the 225 banks to which it owes nearly nine billion pounds.\nThe deal, announced simultaneously in Paris and London, brings the company back from the brink of insolvency but leaves shareholders owning only 54.5 percent of the company.\n"The restructuring plan provides Eurotunnel with the medium term financial stability to allow it to consolidate its substantial commercial achievements to date and to develop its operations," Eurotunnel co-chairman Alastair Morton said.\nThe firm was now making a profit before interest, he added.\nAlthough shareholders will see their interests diluted, they were offered the prospect of a brighter future after months of uncertainty 

## Create Custom tokenizer

### Define functions to use within the tokenizer
We'd like to;
- tokenize the input
- remove stop words
- perform stemming
- remove punctuation
- convert input to lowercase

#### Tokenize

In [8]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
def tokenizeContent(contentsRaw):
    tokenized = nltk.tokenize.word_tokenize(contentsRaw)
    return tokenized


#### Remove Stop words

In [6]:
def removeStopWordsFromTokenized(contentsTokenized):
    stop_word_set = set(nltk.corpus.stopwords.words("english"))
    filteredContents = [word for word in contentsTokenized if word not in stop_word_set]
    return filteredContents

#### Stemming

In [11]:
def performPorterStemmingOnContents(contentsTokenized):
    porterStemmer = nltk.stem.PorterStemmer()
    filteredContents = [porterStemmer.stem(word) for word in contentsTokenized]
    return filteredContents

#### Remove Punctuation

In [12]:
def removePunctuationFromTokenized(contentsTokenized):
    excludePuncuation = set(string.punctuation)

    # manually add additional punctuation to remove
    doubleSingleQuote = '\'\''
    doubleDash = '--'
    doubleTick = '``'

    excludePuncuation.add(doubleSingleQuote)
    excludePuncuation.add(doubleDash)
    excludePuncuation.add(doubleTick)

    filteredContents = [word for word in contentsTokenized if word not in excludePuncuation]
    return filteredContents

#### Convert terms to lowercase

In [13]:
def convertItemsToLower(contentsRaw):
    filteredContents = [term.lower() for term in contentsRaw]
    return filteredContents

### Test that functions are working as expected

In [14]:
# get contents of a file for testing
# TODO: may need to make a copy of this here
content_test = rawContentDict[filePaths[0]]

# visually inspect
print(content_test[:300])

Anglo-French Channel Tunnel operator Eurotunnel on Monday announced a deal giving creditor banks 45.5 percent of the company in return for wiping out one billion pounds ($1.56 billion) of its debt mountain.
The long-awaited restructuring brings to an end months of wrangling between Eurotunnel and th


In [15]:
# test tokenization
content_test_tokenized = tokenizeContent(content_test)

# visually inspect
print(content_test_tokenized[:30])

['Anglo-French', 'Channel', 'Tunnel', 'operator', 'Eurotunnel', 'on', 'Monday', 'announced', 'a', 'deal', 'giving', 'creditor', 'banks', '45.5', 'percent', 'of', 'the', 'company', 'in', 'return', 'for', 'wiping', 'out', 'one', 'billion', 'pounds', '(', '$', '1.56', 'billion']


In [17]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [18]:
# test remove stop words
content_test_rmStop = removeStopWordsFromTokenized(content_test_tokenized)

# visually inspect
print(content_test_rmStop[:30])

['Anglo-French', 'Channel', 'Tunnel', 'operator', 'Eurotunnel', 'Monday', 'announced', 'deal', 'giving', 'creditor', 'banks', '45.5', 'percent', 'company', 'return', 'wiping', 'one', 'billion', 'pounds', '(', '$', '1.56', 'billion', ')', 'debt', 'mountain', '.', 'The', 'long-awaited', 'restructuring']


In [19]:
# Test stemming
content_test_stemmed = performPorterStemmingOnContents(content_test_rmStop)

# visually inspect
print(content_test_stemmed[:30])

['anglo-french', 'channel', 'tunnel', 'oper', 'eurotunnel', 'monday', 'announc', 'deal', 'give', 'creditor', 'bank', '45.5', 'percent', 'compani', 'return', 'wipe', 'one', 'billion', 'pound', '(', '$', '1.56', 'billion', ')', 'debt', 'mountain', '.', 'the', 'long-await', 'restructur']


In [20]:
# Test remove punctuation
content_test_cleaned = removePunctuationFromTokenized(content_test_stemmed)

# visually inspect
print(content_test_cleaned[:30])

['anglo-french', 'channel', 'tunnel', 'oper', 'eurotunnel', 'monday', 'announc', 'deal', 'give', 'creditor', 'bank', '45.5', 'percent', 'compani', 'return', 'wipe', 'one', 'billion', 'pound', '1.56', 'billion', 'debt', 'mountain', 'the', 'long-await', 'restructur', 'bring', 'end', 'month', 'wrangl']


In [21]:
# Test convert to lower
content_test_clean_lower = convertItemsToLower(content_test_cleaned)
print(content_test_clean_lower[:30])

['anglo-french', 'channel', 'tunnel', 'oper', 'eurotunnel', 'monday', 'announc', 'deal', 'give', 'creditor', 'bank', '45.5', 'percent', 'compani', 'return', 'wipe', 'one', 'billion', 'pound', '1.56', 'billion', 'debt', 'mountain', 'the', 'long-await', 'restructur', 'bring', 'end', 'month', 'wrangl']


### Wrap into a function to be used by NLTK

In [22]:
# process data without writing inspection file information to file
def processData(rawContents):
    cleaned = tokenizeContent(rawContents)
    cleaned = removeStopWordsFromTokenized(cleaned)
    cleaned = performPorterStemmingOnContents(cleaned)
    cleaned = removePunctuationFromTokenized(cleaned)
    cleaned = convertItemsToLower(cleaned)
    return cleaned

## Create Functions For Output
- TFIDF
- Cosine Similarity
    - this function will both calcuate and output results

In [56]:
# print TFIDF values in 'table' format
def print_TFIDF_for_all(term, values, fileNames):
    print("{:<15}".format(values[i][j]), end="")

    for fileName in fileNames:
        print("{:<15}".format(fileName), end="")
    print()
    for i in range(len(term)):
        print("{:<15}".format(term[i]), end="")
        for j in range(len(fileNames)):
            print("{:<15}".format(values[i][j]), end="")
        print()


In [24]:
# write TFIDF values in 'table' format
def write_TFIDF_for_all(term, values, fileNames):
    filePath = "/content/inputData"  # specify your file path here
    with open(filePath, 'w') as file:
        file.write("{:<15}".format(""))
        for fileName in fileNames:
            file.write("{:<15}".format(fileName))
        file.write("\n")
        for i in range(len(term)):
            file.write("{:<15}".format(term[i]))
            for j in range(len(fileNames)):
                file.write("{:<15}".format(values[i][j]))
            file.write("\n")


In [52]:
# TODO: modify this to build matrix then print from matrix form
from sklearn.metrics.pairwise import cosine_similarity

def calc_and_print_CosineSimilarity_for_all(tfs, fileNames):
    print("\n\n\n========COSINE SIMILARITY====================================================================\n")
    print("{:<15}".format(""), end="")
    for fileName in fileNames:
        print("{:<15}".format(fileName), end="")
    print()
    for i in range(len(fileNames)):
        print("{:<15}".format(fileNames[i]), end="")
        for n in range(len(fileNames)):
            numValue = (cosine_similarity([tfs[i]], [tfs[n]])[0][0])
            print("{:<15}".format(numValue), end="")
        print()
    print("\n\n=============================================================================================\n")
#calc_and_print_CosineSimilarity_for_all(tfs, fileNames)

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

def calc_and_write_CosineSimilarity_for_all(tfs, fileNames):
    filePath = "cosine_similarity.txt"  # specify your file path here
    with open(filePath, 'w') as file:
        file.write("{:<15}".format(""))
        for fileName in fileNames:
            file.write("{:<15}".format(fileName))
        file.write("\n")
        for i in range(len(fileNames)):
            file.write("{:<15}".format(fileNames[i]))
            for n in range(len(fileNames)):
                numValue = (cosine_similarity([tfs[i]], [tfs[n]])[0][0])
                file.write("{:<15}".format(numValue))
            file.write("\n")


## Wrap Everything into `Main()`

In [39]:
pip install -U scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m66.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.3.1


In [54]:
def main(printResults=True):
    baseFolderPath = "/content/inputData"

    fileNames, filePathList = returnListOfFilePaths(baseFolderPath)

    rawContentDict = create_docContentDict(filePathList)

    # calculate tfidf
    tfidf = TfidfVectorizer(tokenizer=processData, stop_words='english')
    tfs = tfidf.fit_transform(rawContentDict.values())
    tfs_Values = tfs.toarray()
    tfs_Term = tfidf.get_feature_names_out()

    if printResults:
        # print results
        print_TFIDF_for_all(tfs_Term, tfs_Values, fileNames)
        calc_and_print_CosineSimilarity_for_all(tfs, fileNames)
    else:
        # write results to file
        write_TFIDF_for_all(tfs_Term, tfs_Values, fileNames)
        calc_and_write_CosineSimilarity_for_all(tfs, fileNames)
#calc_and_print_CosineSimilarity_for_all(tfs, fileNames)

In [36]:
main


<function __main__.main(printResults=True)>

In [55]:
main()

UnboundLocalError: ignored