In [2]:
documents = [
    "Hi, I am Shivraj. I am a third year engineering student at VIIT Pune.",
    "Natural Language Processing is an interesting subject.",
    "I am learning NLP and machine learning."
]

labels = ["student_intro", "subject_info", "learning_info"]


In [3]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text

cleaned_docs = [clean_text(doc) for doc in documents]
print("Cleaned Text:")
print(cleaned_docs)


Cleaned Text:
['hi i am shivraj i am a third year engineering student at viit pune', 'natural language processing is an interesting subject', 'i am learning nlp and machine learning']


In [5]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('punkt_tab')   # âœ… REQUIRED
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

lemmatized_docs = []
for doc in cleaned_docs:
    tokens = word_tokenize(doc)
    lemmas = [lemmatizer.lemmatize(word) for word in tokens]
    lemmatized_docs.append(" ".join(lemmas))

print("\nLemmatized Text:")
print(lemmatized_docs)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!



Lemmatized Text:
['hi i am shivraj i am a third year engineering student at viit pune', 'natural language processing is an interesting subject', 'i am learning nlp and machine learning']


In [6]:
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

final_docs = []
for doc in lemmatized_docs:
    tokens = doc.split()
    filtered = [word for word in tokens if word not in stop_words]
    final_docs.append(" ".join(filtered))

print("\nAfter Stop-word Removal:")
print(final_docs)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.



After Stop-word Removal:
['hi shivraj third year engineering student viit pune', 'natural language processing interesting subject', 'learning nlp machine learning']


In [7]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)

print("\nEncoded Labels:")
print(encoded_labels)



Encoded Labels:
[1 2 0]


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(final_docs)

print("\nTF-IDF Vocabulary:")
print(tfidf.get_feature_names_out())

print("\nTF-IDF Matrix:")
print(tfidf_matrix.toarray())



TF-IDF Vocabulary:
['engineering' 'hi' 'interesting' 'language' 'learning' 'machine'
 'natural' 'nlp' 'processing' 'pune' 'shivraj' 'student' 'subject' 'third'
 'viit' 'year']

TF-IDF Matrix:
[[0.35355339 0.35355339 0.         0.         0.         0.
  0.         0.         0.         0.35355339 0.35355339 0.35355339
  0.         0.35355339 0.35355339 0.35355339]
 [0.         0.         0.4472136  0.4472136  0.         0.
  0.4472136  0.         0.4472136  0.         0.         0.
  0.4472136  0.         0.         0.        ]
 [0.         0.         0.         0.         0.81649658 0.40824829
  0.         0.40824829 0.         0.         0.         0.
  0.         0.         0.         0.        ]]


In [9]:
import pandas as pd

# Save cleaned text
df_text = pd.DataFrame({
    "original_text": documents,
    "processed_text": final_docs,
    "label": encoded_labels
})
df_text.to_csv("processed_text.csv", index=False)

# Save TF-IDF matrix
df_tfidf = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf.get_feature_names_out()
)
df_tfidf.to_csv("tfidf_vectors.csv", index=False)

print("\nFiles saved successfully!")



Files saved successfully!
