In [1]:
import nltk
import os
# Fetch the Punkt tokenizer data that nltk.word_tokenize needs (skipped if already up to date).
nltk.download('punkt')


[nltk_data] Downloading package punkt to /home/tecomp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Directory the notebook runs in; the .txt corpus files are looked up here.
# No os.chdir() is needed: this already is the current working directory.
script_directory = os.getcwd()

In [3]:
# Shared accumulator: read_text_file() appends every file's tokens to this list.
all_tokens = []

In [4]:
def read_text_file(file_path):
    """Tokenize one text file and append its tokens to the global ``all_tokens``.

    Parameters
    ----------
    file_path : str or os.PathLike
        Path of the text file to read.

    Returns
    -------
    None
        The tokens are added to the module-level ``all_tokens`` list as a
        side effect; nothing is returned.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    all_tokens.extend(nltk.word_tokenize(text))


In [5]:
# Tokenize every .txt file in the notebook directory into all_tokens.
# os.listdir() returns names in arbitrary order, so sort them to keep the token
# order (and everything derived from it) reproducible across runs and machines.
# The directory is passed explicitly so the listing and the joined path below
# always refer to the same place.
for file in sorted(os.listdir(script_directory)):
    if file.endswith(".txt"):
        file_path = os.path.join(script_directory, file)
        read_text_file(file_path)

In [6]:
# print(all_tokens)

In [7]:
import numpy as np

In [8]:
def clean_list(tokens):
    """Remove every ``','``, ``'.'`` and ``' '`` token from ``tokens`` in place.

    Parameters
    ----------
    tokens : list
        Token list to filter. The same list object is modified, so every
        reference to it sees the cleaned content.

    Returns
    -------
    None
    """
    characters_to_remove = (',', '.', ' ')
    # Rebuild the content through slice assignment rather than calling
    # tokens.remove() while iterating: removing during iteration shifts the
    # remaining items left and makes the loop skip the element that follows
    # each removal, so runs of adjacent punctuation were only partly removed.
    tokens[:] = [token for token in tokens if token not in characters_to_remove]

In [9]:
# Filter the ',', '.' and ' ' tokens out of all_tokens; clean_list works in place and returns nothing.
clean_list(all_tokens)

In [10]:
# print(all_tokens)

In [11]:
# Fetch the tagger model data used by nltk.pos_tag in the next cell.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/tecomp/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [12]:
# Part-of-speech tag the cleaned tokens; pos_tag yields (token, tag) pairs.
pos = nltk.pos_tag(all_tokens)

In [13]:
from nltk.stem import PorterStemmer

In [14]:
# Map each distinct token to its Porter stem.
# NOTE(review): the name `dict` shadows the built-in type for the rest of the
# notebook. It is kept only so existing references keep working; consider
# renaming it (e.g. to `stem_dict`, matching `lem_dict`) everywhere at once.
ps = PorterStemmer()
dict = {w: ps.stem(w) for w in all_tokens}

In [15]:
# print(dict)

In [16]:
from nltk.stem import WordNetLemmatizer
# WordNet corpus data backing WordNetLemmatizer.
nltk.download('wordnet')
# NOTE(review): nltk is already imported in the first cell; this re-import is redundant.
import nltk
# Open Multilingual WordNet data package (presumably required by the WordNet
# corpus reader in this NLTK version; confirm).
nltk.download('omw-1.4')


[nltk_data] Downloading package wordnet to /home/tecomp/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/tecomp/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [17]:
# Single lemmatizer instance, reused for every token in the next cell.
lm = WordNetLemmatizer()

In [18]:
# Token -> WordNet lemma. lemmatize() is called without a part-of-speech
# argument, so the tags stored in `pos` are not used here.
lem_dict = {w: lm.lemmatize(w) for w in all_tokens}

In [19]:
# print(lem_dict)

In [20]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [21]:
# Load the raw text of every .txt file in the notebook directory, one string per file.
# os.listdir() returns names in arbitrary order, so sort them: the row order of
# any matrix built from `documents` is then reproducible.
documents = []
for file in sorted(os.listdir(script_directory)):
    if file.endswith(".txt"):
        file_path = os.path.join(script_directory, file)
        # Explicit encoding: do not depend on the platform's locale default.
        with open(file_path, 'r', encoding='utf-8') as f:
            documents.append(f.read())

In [22]:
# Calculating Term Frequency (TF)
count_vectorizer = CountVectorizer()
tf_matrix = count_vectorizer.fit_transform(documents)

In [23]:
# Calculating TF-IDF: re-weight the raw counts in tf_matrix by inverse document
# frequency (IDF), using TfidfTransformer's default settings.
tfidf_transformer = TfidfTransformer()
tfidf_matrix = tfidf_transformer.fit_transform(tf_matrix)

In [24]:
def calculateTF(token):
    """Return the relative frequency of every distinct item in ``token``.

    Parameters
    ----------
    token : list
        Sequence of tokens, e.g. the output of ``nltk.word_tokenize``.

    Returns
    -------
    dict
        Maps each distinct token to ``occurrences / len(token)``. Keys appear
        in order of first occurrence. An empty input yields an empty dict.
    """
    # Count everything in a single pass instead of calling token.count() once
    # per distinct word, which is quadratic in the number of tokens.
    counts = {}
    for word in token:
        counts[word] = counts.get(word, 0) + 1

    total = len(token)
    return {word: count / total for word, count in counts.items()}

# Relative frequency of every token across all files; the whole dict is shown as the cell output.
calculateTF(all_tokens)


{'From': 0.0057692307692307696,
 'smart': 0.0019230769230769232,
 'fabrics': 0.0038461538461538464,
 'to': 0.028846153846153848,
 'virtual': 0.0038461538461538464,
 'fashion': 0.051923076923076926,
 'shows': 0.0019230769230769232,
 'technology': 0.0038461538461538464,
 'is': 0.01730769230769231,
 'reshaping': 0.0019230769230769232,
 'the': 0.04807692307692308,
 'landscape': 0.0057692307692307696,
 'Wearable': 0.0019230769230769232,
 'tech': 0.0019230769230769232,
 'augmented': 0.0019230769230769232,
 'reality': 0.0019230769230769232,
 'and': 0.05384615384615385,
 'fitting': 0.0019230769230769232,
 'rooms': 0.0019230769230769232,
 'are': 0.009615384615384616,
 'becoming': 0.0038461538461538464,
 'integral': 0.0019230769230769232,
 'modern': 0.0019230769230769232,
 'experience': 0.0019230769230769232,
 'The': 0.015384615384615385,
 'synergy': 0.0019230769230769232,
 'between': 0.0019230769230769232,
 'opens': 0.0019230769230769232,
 'new': 0.0038461538461538464,
 'possibilities': 0.00192

In [27]:
from nltk.tokenize import word_tokenize, sent_tokenize

def calculateTF_IDF(documents):
    """Compute per-sentence term frequencies and per-word document frequencies.

    Despite the plural name, ``documents`` is a single string: it is split
    into sentences and every sentence is treated as one document.

    Parameters
    ----------
    documents : str
        Text to analyse.

    Returns
    -------
    word_idf : dict
        Maps each distinct token to the number of sentences that contain it.
        NOTE: this is the raw document frequency (an integer count), not a
        log-scaled inverse document frequency, and no TF x IDF product is
        computed here. Keys appear in order of first occurrence.
    document_tf : dict
        Maps each sentence index (0-based) to that sentence's
        ``calculateTF`` result.
    """
    sentences = sent_tokenize(documents)
    document_tf = {}
    word_idf = {}

    for i, sentence in enumerate(sentences):
        tokenizedWords = nltk.word_tokenize(sentence)
        document_tf[i] = calculateTF(tokenizedWords)

        # A sentence contributes at most 1 to a word's count, however often
        # the word repeats inside it. Counting here, in the same pass, avoids
        # re-scanning every sentence's token list once per unique word.
        seen_in_sentence = set()
        for word in tokenizedWords:
            if word not in seen_in_sentence:
                seen_in_sentence.add(word)
                word_idf[word] = word_idf.get(word, 0) + 1

    return word_idf, document_tf
        
# Demo on a three-sentence sample: print, for each token, how many sentences contain it.
word_idf, document_tf = calculateTF_IDF("Sustainable fashion is gaining momentum as consumers prioritize eco-friendly choices. From ethically sourced materials to fair labor practices, the industry is adapting to meet the demand for environmentally conscious clothing. Designers are increasingly incorporating recycled fabrics and sustainable production methods, setting a new standard for ethical fashion.")
print(word_idf)

{'Designers': 1, 'environmentally': 1, 'ethically': 1, 'momentum': 1, 'meet': 1, '.': 3, 'eco-friendly': 1, 'to': 1, 'are': 1, 'increasingly': 1, 'gaining': 1, 'consumers': 1, 'for': 2, 'setting': 1, 'production': 1, 'fair': 1, 'clothing': 1, 'fabrics': 1, 'is': 2, 'recycled': 1, 'incorporating': 1, 'sourced': 1, 'sustainable': 1, 'practices': 1, 'materials': 1, 'adapting': 1, 'conscious': 1, 'labor': 1, 'new': 1, 'industry': 1, 'and': 1, 'Sustainable': 1, 'ethical': 1, 'choices': 1, 'the': 1, 'a': 1, 'prioritize': 1, 'demand': 1, 'standard': 1, 'as': 1, 'fashion': 2, 'From': 1, ',': 2, 'methods': 1}
