In [3]:
import pandas as pd
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Fetch the NLTK data packages this notebook relies on. Each call logs its
# progress to the cell output; the value of the final call is what the cell
# displays as its result.

# Stop-word lists, read below through nltk.corpus.stopwords.
nltk.download('stopwords')
# WordNet data, used below by WordNetLemmatizer.
nltk.download('wordnet')
# Open Multilingual WordNet 1.4 -- presumably required by the WordNet reader
# in some NLTK versions; TODO confirm it is still needed.
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [5]:
# Tiny hand-written corpus: four short sentences in a single "text" column.
data = {"text": [
    "I love NLP! It is amazing.",
    "NLP is used in chatbots and search engines.",
    "I do not like boring lectures.",
    "This NLP session is very interesting!",
]}

# One row per sentence; the default RangeIndex (0..3) is kept.
df = pd.DataFrame(data=data)
print(df)

                                          text
0                   I love NLP! It is amazing.
1  NLP is used in chatbots and search engines.
2               I do not like boring lectures.
3        This NLP session is very interesting!


In [6]:
# English stop words held in a set so the per-token membership test is cheap,
# and a single lemmatizer instance shared by every call to clean_text.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize raw text into a space-joined string of lemmatized tokens.

    Pipeline: lowercase -> replace every character that is not a-z or
    whitespace with a space -> split on whitespace -> drop English stop
    words -> lemmatize each remaining token with WordNetLemmatizer's
    default settings.

    Parameters
    ----------
    text : str
        Raw input text. Non-string values (e.g. None or NaN coming from a
        pandas column with missing entries) are treated as empty documents.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; "" when nothing is left.

    Notes
    -----
    The stop-word list used here removes "not", so negation is dropped
    ("I do not like ..." -> "like ..."). Keep that in mind before feeding
    the result to sentiment-style models.
    """
    # Missing values would otherwise fail on .lower(); map them to an empty doc.
    if not isinstance(text, str):
        return ""

    text = text.lower()                           # lowercase
    # Replace (rather than delete) punctuation/digits with a space so that
    # neighbouring words are not glued together ("a,b" -> "a b", not "ab")
    # and contractions split into fragments ("don't" -> "don t") that the
    # stop-word list can still match, instead of surviving as "dont".
    text = re.sub(r'[^a-z\s]', ' ', text)
    words = text.split()                          # whitespace tokenization
    words = [lemmatizer.lemmatize(word)
             for word in words if word not in stop_words]
    return " ".join(words)

In [7]:
# Run every raw sentence through clean_text and keep the result next to the
# original in a new 'clean_text' column.
df['clean_text'] = df['text'].map(clean_text)
print(df)

                                          text  \
0                   I love NLP! It is amazing.   
1  NLP is used in chatbots and search engines.   
2               I do not like boring lectures.   
3        This NLP session is very interesting!   

                        clean_text  
0                 love nlp amazing  
1  nlp used chatbots search engine  
2              like boring lecture  
3          nlp session interesting  


In [8]:
# Vectorizer with default settings; the vocabulary is learned from the
# cleaned sentences in the same call that produces the weights.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['clean_text'])

# Densify the sparse result and label each column with its vocabulary term
# so the matrix can be read as a documents-by-terms table.
tfidf_df = pd.DataFrame(data=X.toarray(),
                        columns=tfidf.get_feature_names_out())

print(tfidf_df)

    amazing   boring  chatbots   engine  interesting  lecture     like  \
0  0.644503  0.00000   0.00000  0.00000     0.000000  0.00000  0.00000   
1  0.000000  0.00000   0.47633  0.47633     0.000000  0.00000  0.00000   
2  0.000000  0.57735   0.00000  0.00000     0.000000  0.57735  0.57735   
3  0.000000  0.00000   0.00000  0.00000     0.644503  0.00000  0.00000   

       love       nlp   search   session     used  
0  0.644503  0.411378  0.00000  0.000000  0.00000  
1  0.000000  0.304035  0.47633  0.000000  0.47633  
2  0.000000  0.000000  0.00000  0.000000  0.00000  
3  0.000000  0.411378  0.00000  0.644503  0.00000  
