### Satyajeet Tukaram Chavan
### Roll no-381025
### PRN no-22310630
### Batch-A1

Perform text cleaning, lemmatization (any method), stop-word removal (any method), and
label encoding. Create representations using TF-IDF. Save the outputs.

In [None]:
import nltk
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

In [None]:
# Fetch every NLTK resource this notebook relies on.
for resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)
# Kept as a bare final expression so the cell still displays its return value.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Sample Dataset
# -------------------------------
# Each entry pairs a document with its category so the two stay aligned.
_labelled_samples = [
    ("Natural Language Processing is amazing!", "tech"),
    ("Machine learning helps computers understand language.", "tech"),
    ("TF-IDF and NLP are important techniques.", "tech"),
    ("Stop words should be removed from text", "preprocess"),
    ("Lemmatization converts words into base form", "preprocess"),
]

# Column-oriented dictionary: one list of documents, one list of labels.
data = {
    "text": [document for document, _ in _labelled_samples],
    "label": [category for _, category in _labelled_samples],
}

In [None]:
# Load the sample dictionary into a DataFrame and show the starting point.
df = pd.DataFrame(data=data)
print("Original Dataset:\n", df)


# -------------------------------
# 1. Text Cleaning
# -------------------------------
def clean_text(text):
    """Normalise a raw string before tokenisation.

    Steps, in order:
      1. lowercase the text;
      2. delete every character that is not ``a``-``z`` or whitespace
         (digits, punctuation and non-ASCII letters are dropped, not
         replaced by a space, so "TF-IDF" becomes "tfidf");
      3. collapse each run of whitespace to a single space and trim the ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    return re.sub(r'\s+', ' ', letters_only).strip()

Original Dataset:
                                                 text       label
0            Natural Language Processing is amazing!        tech
1  Machine learning helps computers understand la...        tech
2           TF-IDF and NLP are important techniques.        tech
3             Stop words should be removed from text  preprocess
4        Lemmatization converts words into base form  preprocess


In [None]:
# Clean every raw document element-wise and store the result in a new column.
df['clean_text'] = df['text'].map(clean_text)
print("\nAfter Text Cleaning:\n", df['clean_text'])


# -------------------------------
# 2. Stopword Removal
# -------------------------------
# NLTK's English stop-word list, held in a set for membership tests.
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return ``text`` with every token found in ``stop_words`` removed.

    The string is split with ``nltk.word_tokenize``; the tokens that survive
    the filter are joined back together with single spaces.
    """
    tokens = nltk.word_tokenize(text)
    kept = filter(lambda token: token not in stop_words, tokens)
    return ' '.join(kept)


# Strip stop words from each cleaned document, element-wise.
df['no_stopwords'] = df['clean_text'].map(remove_stopwords)
print("\nAfter Stopword Removal:\n", df['no_stopwords'])


# --------------------------


After Text Cleaning:
 0               natural language processing is amazing
1    machine learning helps computers understand la...
2               tfidf and nlp are important techniques
3               stop words should be removed from text
4          lemmatization converts words into base form
Name: clean_text, dtype: object

After Stopword Removal:
 0                  natural language processing amazing
1    machine learning helps computers understand la...
2                       tfidf nlp important techniques
3                              stop words removed text
4               lemmatization converts words base form
Name: no_stopwords, dtype: object


In [None]:
# 3. Lemmatization
# -------------------------------
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Return ``text`` with each token replaced by its WordNet lemma.

    The string is split with ``nltk.word_tokenize``. ``lemmatizer.lemmatize``
    receives only the token, so its default part-of-speech setting applies
    to every word. The lemmas are joined with single spaces.
    """
    tokens = nltk.word_tokenize(text)
    return ' '.join(map(lemmatizer.lemmatize, tokens))


# Lemmatize each stop-word-free document, element-wise.
df['lemmatized_text'] = df['no_stopwords'].map(lemmatize_text)
print("\nAfter Lemmatization:\n", df['lemmatized_text'])


After Lemmatization:
 0                  natural language processing amazing
1    machine learning help computer understand lang...
2                        tfidf nlp important technique
3                               stop word removed text
4                 lemmatization convert word base form
Name: lemmatized_text, dtype: object


In [None]:
# 4. Label Encoding
# -------------------------------
label_encoder = LabelEncoder()
# Fit the encoder on the label column and keep the integer codes beside it.
df['encoded_label'] = label_encoder.fit_transform(df['label'])

# Show, row by row, which integer each original label was mapped to.
print("\nLabel Encoding:")
label_code_pairs = zip(df['label'], df['encoded_label'])
for original_label, code in label_code_pairs:
    print(f"{original_label} -> {code}")


Label Encoding:
tech -> 1
tech -> 1
tech -> 1
preprocess -> 0
preprocess -> 0


In [None]:
# 5. TF-IDF Representation
# -------------------------------
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['lemmatized_text'])

# Turn the fitted matrix into a dense array and label each column with the
# vocabulary term it scores, so the features are readable as a table.
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=feature_names)
print("\nTF-IDF Matrix:\n", tfidf_df)


TF-IDF Matrix:
     amazing      base  computer   convert      form      help  important  \
0  0.523358  0.000000  0.000000  0.000000  0.000000  0.000000        0.0   
1  0.000000  0.000000  0.420669  0.000000  0.000000  0.420669        0.0   
2  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000        0.5   
3  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000        0.0   
4  0.000000  0.463693  0.000000  0.463693  0.463693  0.000000        0.0   

   language  learning  lemmatization  ...   natural  nlp  processing  \
0  0.422242  0.000000       0.000000  ...  0.523358  0.0    0.523358   
1  0.339393  0.420669       0.000000  ...  0.000000  0.0    0.000000   
2  0.000000  0.000000       0.000000  ...  0.000000  0.5    0.000000   
3  0.000000  0.000000       0.000000  ...  0.000000  0.0    0.000000   
4  0.000000  0.000000       0.463693  ...  0.000000  0.0    0.000000   

    removed      stop  technique      text  tfidf  understand      word  
0  0.000000  0.0000

In [None]:
# 6. Save Outputs to Files
# -------------------------------
# Save cleaned and processed dataset
# Write both result tables to CSV without the index column.
df.to_csv("processed_text_data.csv", index=False)  # cleaned and processed dataset
tfidf_df.to_csv("tfidf_features.csv", index=False)  # TF-IDF features

# One print call, newline-separated, listing the files that were written.
print(
    "\nFiles saved successfully:",
    "1. processed_text_data.csv",
    "2. tfidf_features.csv",
    sep="\n",
)


Files saved successfully:
1. processed_text_data.csv
2. tfidf_features.csv
