Assignment 3
Text Cleaning + Stopword Removal + Lemmatization + Label Encoding + TF-IDF + Save Output

In [11]:
# Assignment 3: Text Cleaning, Encoding, TF-IDF
import nltk
import pandas as pd
import re
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer



In [12]:

# Fetch every NLTK resource this notebook relies on so that a fresh
# environment survives Restart Kernel -> Run All:
#   - 'stopwords' feeds the stopword-removal step.
#   - 'wordnet' is the corpus WordNetLemmatizer looks words up in; without it
#     the lemmatization step raises LookupError.
#   - 'omw-1.4' is additionally required by some NLTK releases when WordNet
#     is loaded; downloading it is harmless where it is not needed.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
# Toy corpus: the i-th entry of "text" is paired with the i-th entry of "label".
data = dict(
    text=[
        "I love AI and Machine Learning!",
        "Python is great for data science",
        "NLP makes computers understand language",
    ],
    label=["positive", "positive", "neutral"],
)



In [14]:
# Build the working frame from the column-oriented dict: one column per key.
df = pd.DataFrame.from_dict(data)

In [15]:
# Text Cleaning
def clean_text(text):
    """Normalise a raw string into lowercase, space-separated alphabetic tokens.

    Steps:
      1. Lowercase the text.
      2. Replace every character that is not ``a-z`` or whitespace with a
         space, so punctuation between words ("data-science", "ai/ml") acts
         as a separator instead of gluing the neighbours into one token.
      3. Collapse runs of whitespace and trim both ends.

    Args:
        text: The raw text. Any non-string value (e.g. ``None`` or a float
            ``NaN`` coming from a missing DataFrame cell) is treated as empty.

    Returns:
        The cleaned string; ``""`` when the input is empty, not a string, or
        contains no ASCII letters.
    """
    # Missing cells reach `.apply` as None/NaN, which have no `.lower()`.
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    # Substitute a space (not '') so adjacent words stay separate tokens.
    letters_only = re.sub(r'[^a-z\s]', ' ', lowered)
    # split() with no argument drops leading/trailing and repeated whitespace.
    return ' '.join(letters_only.split())

In [16]:
# Run clean_text over every raw sentence and keep the result beside the original.
df['cleaned_text'] = df['text'].map(clean_text)

In [17]:
# Stopword Removal
# A set gives constant-time membership checks for each token.
stop_words = set(stopwords.words('english'))


def _drop_stopwords(sentence):
    """Return `sentence` without the whitespace-separated tokens found in `stop_words`."""
    kept_tokens = [token for token in sentence.split() if token not in stop_words]
    return ' '.join(kept_tokens)


df['no_stopwords'] = df['cleaned_text'].apply(_drop_stopwords)

In [18]:
# Lemmatization
lemmatizer = WordNetLemmatizer()


def _lemmatize_sentence(sentence):
    """Lemmatize each whitespace-separated token and rejoin them with single spaces.

    Tokens are lemmatized one at a time using the lemmatizer's default
    part-of-speech setting (no POS tag is supplied).
    """
    return ' '.join(map(lemmatizer.lemmatize, sentence.split()))


df['lemmatized'] = df['no_stopwords'].apply(_lemmatize_sentence)


In [19]:
# Label Encoding
# Learn the set of distinct labels first, then map every label to its integer code.
# `le` stays fitted so the codes can be decoded later via le.inverse_transform.
le = LabelEncoder()
le.fit(df['label'])
df['label_encoded'] = le.transform(df['label'])

In [20]:
# TF-IDF Representation
# Default-configured vectorizer; the vocabulary is learned from the lemmatized
# sentences and the same sentences are converted to one feature row each.
tfidf = TfidfVectorizer()
tfidf_features = tfidf.fit_transform(raw_documents=df['lemmatized'])

In [21]:
# Save Output
# Persist every column built so far (raw text, intermediate text stages, encoded
# label); the positional row index is left out of the file.
df.to_csv(path_or_buf="processed_text.csv", index=False)

In [22]:
# Densify the TF-IDF result so the full document-by-term matrix is printed.
dense_tfidf = tfidf_features.toarray()
print("TF-IDF Matrix:\n", dense_tfidf)
print("Saved file: processed_text.csv")

TF-IDF Matrix:
 [[0.5       0.        0.        0.        0.        0.5       0.5
  0.5       0.        0.        0.        0.        0.       ]
 [0.        0.        0.5       0.5       0.        0.        0.
  0.        0.        0.        0.5       0.5       0.       ]
 [0.        0.4472136 0.        0.        0.4472136 0.        0.
  0.        0.4472136 0.4472136 0.        0.        0.4472136]]
Saved file: processed_text.csv
