Practical No: 3

Perform text cleaning, lemmatization (any method), stop-word removal
(any method), and label encoding.
Create text representations using TF-IDF.

In [1]:
import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

Sample Data

In [2]:
# Toy corpus: four short sentences, each paired with a class label.
data = dict(
    text=[
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "Is this the first document?",
    ],
    label=['A', 'B', 'C', 'A'],
)

In [3]:
# One row per document; the dict keys become the column names.
df = pd.DataFrame(data=data)

Text Cleaning, Lemmatization, and Stop Words Removal

In [4]:
# word_tokenize, stopwords and WordNetLemmatizer all read NLTK data packages
# that are not installed together with the library; on a fresh environment
# the cells below would otherwise fail with LookupError. nltk.download()
# skips packages that are already up to date, so re-running this is cheap.
# 'punkt_tab' / 'omw-1.4' are only needed by some NLTK versions; requesting
# them where they are not needed is harmless.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource, quiet=True)

lemmatizer = WordNetLemmatizer()
# Keep the stop words in a set so the per-token membership tests are fast.
stop_words = set(stopwords.words('english'))

In [5]:
def clean_text(text):
    """Normalize one raw document into a space-separated string of lemmas.

    Pipeline: lowercase and tokenize, keep only purely alphanumeric tokens
    (this drops punctuation tokens), remove English stop words, lemmatize,
    then remove stop words once more.

    Stop words are filtered *before* lemmatization because the lemmatizer
    treats tokens as nouns by default and can strip the trailing "s" from
    stop words such as "was" or "has" (yielding "wa" / "ha"); those forms
    are not in the stop-word list and would leak into the vocabulary if the
    filter only ran afterwards. The second filter keeps dropping any token
    whose lemma is itself a stop word.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (may be empty).
    """
    tokens = word_tokenize(text.lower())
    # Drop punctuation/mixed tokens and stop words while they are still in
    # their surface form, i.e. before the lemmatizer can mangle them.
    tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
    lemmas = (lemmatizer.lemmatize(token) for token in tokens)
    # Second pass: a content word may lemmatize to a stop word.
    return ' '.join(lemma for lemma in lemmas if lemma not in stop_words)

# Run the cleaning function over every raw document, element by element.
df['clean_text'] = df['text'].map(clean_text)

Label Encoding

In [6]:
# Learn the label -> integer mapping, then apply it to the same column.
label_encoder = LabelEncoder()
df['encoded_label'] = label_encoder.fit(df['label']).transform(df['label'])

TF-IDF Representation

In [7]:
# Fit the vectorizer on the cleaned documents; the result is a sparse
# document-term matrix.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['clean_text'])

# Densify into a DataFrame with one column per learned vocabulary term.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

Save Outputs

In [8]:
# Write the cleaned text with both label forms, then the TF-IDF table.
# The row index is omitted from both files.
df.to_csv(
    'cleaned_data.csv',
    columns=['clean_text', 'label', 'encoded_label'],
    index=False,
)
tfidf_df.to_csv(path_or_buf='tfidf_representation.csv', index=False)

print("Outputs saved successfully.")

Outputs saved successfully.


In [9]:
# Read the cleaned dataset back from disk to check what was written.
file_path = "cleaned_data.csv"
df1 = pd.read_csv(filepath_or_buffer=file_path)

# Show the leading rows (at most five).
print(df1.head(5))

                 clean_text label  encoded_label
0            first document     A              0
1  document second document     B              1
2                 third one     C              2
3            first document     A              0


In [10]:
# Read the TF-IDF table back from disk to check what was written.
file_path = "tfidf_representation.csv"
df2 = pd.read_csv(filepath_or_buffer=file_path)

# Show the leading rows (at most five).
print(df2.head(5))

   document     first       one    second     third
0  0.629228  0.777221  0.000000  0.000000  0.000000
1  0.787223  0.000000  0.000000  0.616668  0.000000
2  0.000000  0.000000  0.707107  0.000000  0.707107
3  0.629228  0.777221  0.000000  0.000000  0.000000
