In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import string

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [2]:
# Load the raw corpus: one entry per line of the input file, with leading and
# trailing whitespace (including the newline) stripped. Blank lines are kept
# as empty strings.
file_path = "data.txt"
lines = []
# Pin the encoding so decoding does not depend on the platform's locale default.
with open(file_path, "r", encoding="utf-8") as file:
    lines.extend(line.strip() for line in file)

In [3]:
# Fetch the NLTK resources used below: WordNet (lemmatizer), the stop-word
# lists, and the punkt tokenizer models.
for resource in ('wordnet', 'stopwords', 'punkt'):
    nltk.download(resource)

# Shared helpers for the preprocessing cells that follow.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
# Treat the whole file as one document: join the lines, tokenize, and trim
# punctuation characters from both ends of every token (tokens made only of
# punctuation collapse to the empty string).
corpus = " ".join(lines)
words = [token.strip(string.punctuation) for token in nltk.word_tokenize(corpus)]

# Extend the stop-word set in place with every single punctuation character,
# the possessive marker, and the empty string left behind by the trimming above.
stop_words.update(string.punctuation, ("'s", ""))

In [9]:
# Remove stop words (case-insensitively) and lemmatize the remaining tokens.
# Each token is lowercased BEFORE lemmatizing: the WordNet lemmatizer looks
# words up case-sensitively, so a capitalised form such as "Cats" would
# otherwise pass through unlemmatized and be counted separately from "cat".
filtered_words = [
    lemmatizer.lemmatize(token)
    for token in (word.lower() for word in words)
    if token not in stop_words
]

In [10]:
# Count how often each filtered token occurs and rank from most to least
# frequent. most_common() with no argument returns every (word, count) pair
# sorted by descending count, ties kept in first-seen order.
freqs = Counter(filtered_words)
sorted_word_freq = freqs.most_common()

# Write the ranking as "word: count" lines. The encoding is pinned so that
# non-ASCII tokens are written identically regardless of the platform locale.
file_path = "frequency.txt"
with open(file_path, "w", encoding="utf-8") as file:
    file.writelines(f"{word}: {freq}\n" for word, freq in sorted_word_freq)

In [7]:
# Rank vocabulary terms by their TF-IDF weight summed over all documents,
# where each entry of `lines` is treated as one document.
vectorizer = TfidfVectorizer(max_df=0.90, max_features=1500, stop_words='english')
matrix = vectorizer.fit_transform(lines)

# Column-wise sum gives one aggregate score per term.
tfidf_sum = np.sum(matrix, axis=0)
terms = vectorizer.get_feature_names_out()

# Flatten to a plain 1-D list of Python floats. Going through asarray/ravel
# works whether the sum came back as a (1, n_terms) matrix or a 1-D array,
# instead of relying on `.tolist()[0]` being a row.
scores = np.asarray(tfidf_sum).ravel().tolist()
term_dict = dict(zip(terms, scores))
sorted_term_dict = sorted(term_dict.items(), key=lambda item: item[1], reverse=True)

# Write the ranking as "term: score" lines. The encoding is pinned so that
# non-ASCII terms are written identically regardless of the platform locale.
file_path = "tf-idf.txt"
with open(file_path, "w", encoding="utf-8") as file:
    file.writelines(f"{term}: {score}\n" for term, score in sorted_term_dict)