In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

In [3]:
text = "Natural Language Processing is the subfield of linguistic, computer science and artificial intelligence concerned with the interaction between human and computer"

In [4]:
import re
filtered_text = re.sub(r'[^\w\s]','',text)
print(filtered_text)

Natural Language Processing is the subfield of linguistic computer science and artificial intelligence concerned with the interaction between human and computer


In [5]:
tokens = word_tokenize(filtered_text)
print(tokens)

['Natural', 'Language', 'Processing', 'is', 'the', 'subfield', 'of', 'linguistic', 'computer', 'science', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interaction', 'between', 'human', 'and', 'computer']


In [6]:
stop_words = set(stopwords.words("english"))
filtered_tokens = [x for x in tokens if x.lower() not in stop_words]
print(filtered_tokens)

['Natural', 'Language', 'Processing', 'subfield', 'linguistic', 'computer', 'science', 'artificial', 'intelligence', 'concerned', 'interaction', 'human', 'computer']


In [7]:
pos_tagged = pos_tag(filtered_tokens)
print(pos_tagged)

[('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('subfield', 'VBD'), ('linguistic', 'JJ'), ('computer', 'NN'), ('science', 'NN'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('concerned', 'VBN'), ('interaction', 'NN'), ('human', 'JJ'), ('computer', 'NN')]


In [8]:
p = PorterStemmer()
stem = [p.stem(x) for x in filtered_tokens]
print(stem)

['natur', 'languag', 'process', 'subfield', 'linguist', 'comput', 'scienc', 'artifici', 'intellig', 'concern', 'interact', 'human', 'comput']


In [10]:
l = WordNetLemmatizer()
lemmatized_tokens = [l.lemmatize(x) for x in filtered_tokens]
print(lemmatized_tokens)

['Natural', 'Language', 'Processing', 'subfield', 'linguistic', 'computer', 'science', 'artificial', 'intelligence', 'concerned', 'interaction', 'human', 'computer']


In [63]:
def pos(x):
    if x.startswith('V'):
        return 'v'
    elif x.startswith('R'):
        return 'r'
    elif x.startswith('J'):
        return 'a'
    else:
        return 'n'

In [64]:
l = WordNetLemmatizer()
lemma = [l.lemmatize(x,pos(w)) for x,w in pos_tagged]
print(lemma)

['Natural', 'Language', 'Processing', 'subfield', 'linguistic', 'computer', 'science', 'artificial', 'intelligence', 'concern', 'interaction', 'human', 'computer']


In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [65]:
data = " ".join(lemma)

tv = TfidfVectorizer()
tfidf_matrix = tv.fit_transform([data])

In [66]:
features = tv.get_feature_names()

tfidf_score = dict(zip(features, tfidf_matrix.toarray()[0]))

for word, score in tfidf_score.items():
    print(f"{word} --->  {score}")

artificial --->  0.2581988897471611
computer --->  0.5163977794943222
concern --->  0.2581988897471611
human --->  0.2581988897471611
intelligence --->  0.2581988897471611
interaction --->  0.2581988897471611
language --->  0.2581988897471611
linguistic --->  0.2581988897471611
natural --->  0.2581988897471611
processing --->  0.2581988897471611
science --->  0.2581988897471611
subfield --->  0.2581988897471611


In [12]:
from collections import Counter
import math

# TF calculation
tf = Counter(lemmatized_tokens)
total_words = len(lemmatized_tokens)
tf = {word: tf[word]/total_words for word in tf}

# IDF calculation
def calculate_idf(docs, term):
    return math.log(len(docs) / (sum(1 for doc in docs if term in doc) + 1))

idf = {word: calculate_idf([lemmatized_tokens], word) for word in set(lemmatized_tokens)}

# Calculate TF-IDF
tfidf = {word: tf[word] * idf[word] for word in tf}

print("TF-IDF Scores:")
for word, score in sorted(tfidf.items(), key=lambda x: x[1], reverse=True):
    print(f"{word}: {score}")


TF-IDF Scores:
Natural: -0.053319013889226566
Language: -0.053319013889226566
Processing: -0.053319013889226566
subfield: -0.053319013889226566
linguistic: -0.053319013889226566
science: -0.053319013889226566
artificial: -0.053319013889226566
intelligence: -0.053319013889226566
concerned: -0.053319013889226566
interaction: -0.053319013889226566
human: -0.053319013889226566
computer: -0.10663802777845313
