# Text Preprocessing and Feature Extraction using TF-IDF
This notebook performs the following NLP preprocessing steps:

- Text Cleaning
- Stopword Removal
- Lemmatization
- Label Encoding
- TF-IDF Vectorization
- Saving Outputs

You can adapt this notebook to any text dataset (CSV/Excel) by replacing the example DataFrame in Section 2 with your own data.

# 1. Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import string

# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# ML utilities
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Download the NLTK data packages this notebook needs:
#   - 'stopwords': backs stopwords.words('english') in step 4.
#   - 'wordnet' and 'omw-1.4': presumably the data behind WordNetLemmatizer in
#     step 5 (NOTE(review): confirm 'omw-1.4' is still needed for your NLTK version).
# The last call is left as a bare expression, so its return value is what the
# cell displays as output.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

# 2. Load Dataset
The dataset should have:

- A text column
- A label column

In [2]:
# Toy corpus used to demonstrate the pipeline (swap in your own file).
# Each entry pairs one sentence with its label.
_examples = [
    ("I love machine learning!", "positive"),
    ("Natural Language Processing is amazing.", "positive"),
    ("Deep learning models require large datasets.", "neutral"),
    ("AI is transforming the world.", "positive"),
]

# Column-oriented view of the same examples: one list per DataFrame column.
data = {
    "text": [sentence for sentence, _ in _examples],
    "label": [tag for _, tag in _examples],
}

df = pd.DataFrame(data)
df

Unnamed: 0,text,label
0,I love machine learning!,positive
1,Natural Language Processing is amazing.,positive
2,Deep learning models require large datasets.,neutral
3,AI is transforming the world.,positive


# 3. Text Cleaning Function

In [3]:
def clean_text(text):
    """Normalize one raw document into lowercase, punctuation-free text.

    Steps, in order: lowercase, remove runs of digits, strip every character
    in ``string.punctuation``, then collapse whitespace runs to a single
    space and trim both ends.

    Parameters
    ----------
    text : str
        Raw document. Any value that is not a ``str`` (for example ``None``
        or a float ``NaN`` standing in for a missing cell) is treated as an
        empty document instead of raising ``AttributeError``.

    Returns
    -------
    str
        The cleaned text; ``''`` for missing / non-string input.
    """
    if not isinstance(text, str):
        # Missing values have no .lower(); map them to an empty document so
        # Series.apply(clean_text) does not crash on incomplete datasets.
        return ''

    text = text.lower()                              # Lowercase
    text = re.sub(r'\d+', '', text)                  # Remove numbers
    # Delete (not replace) punctuation, so "don't" becomes "dont".
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()         # Remove extra spaces
    return text

# Run the cleaner element-wise over the raw text column, then show the raw
# and cleaned versions side by side.
df['clean_text'] = df['text'].map(clean_text)
df[['text', 'clean_text']]

Unnamed: 0,text,clean_text
0,I love machine learning!,i love machine learning
1,Natural Language Processing is amazing.,natural language processing is amazing
2,Deep learning models require large datasets.,deep learning models require large datasets
3,AI is transforming the world.,ai is transforming the world


# 4. Stopword Removal

In [4]:
# English stopword list from NLTK, stored as a set for membership tests.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` found in ``stop_words``.

    Matching is exact string membership, so the input should already be
    normalized (this notebook passes in the ``clean_text`` column).

    Returns the surviving tokens joined by single spaces.
    """
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return ' '.join(kept_tokens)

# Filter each cleaned document and compare before/after.
df['no_stopwords'] = df['clean_text'].map(remove_stopwords)
df[['clean_text', 'no_stopwords']]

Unnamed: 0,clean_text,no_stopwords
0,i love machine learning,love machine learning
1,natural language processing is amazing,natural language processing amazing
2,deep learning models require large datasets,deep learning models require large datasets
3,ai is transforming the world,ai transforming world


# 5. Lemmatization

In [5]:
# One shared lemmatizer instance, reused for every document.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of ``text`` and rejoin them.

    ``lemmatizer.lemmatize`` is called with only the word, i.e. with its
    default settings.
    NOTE(review): in the recorded output 'models' becomes 'model' while
    'transforming' and 'datasets' are unchanged -- presumably because the
    default part of speech is noun; confirm against the NLTK documentation.
    """
    return ' '.join(map(lemmatizer.lemmatize, text.split()))

# Lemmatize the stopword-free text and compare before/after.
df['lemmatized_text'] = df['no_stopwords'].map(lemmatize_text)
df[['no_stopwords', 'lemmatized_text']]

Unnamed: 0,no_stopwords,lemmatized_text
0,love machine learning,love machine learning
1,natural language processing amazing,natural language processing amazing
2,deep learning models require large datasets,deep learning model require large datasets
3,ai transforming world,ai transforming world


# 6. Label Encoding

In [6]:
# Learn the label classes from the label column, then map each label to its
# integer code. The fitted encoder is kept in `label_encoder`.
label_encoder = LabelEncoder().fit(df['label'])
df['label_encoded'] = label_encoder.transform(df['label'])

# Show each original label next to its integer code.
df[['label', 'label_encoded']]

Unnamed: 0,label,label_encoded
0,positive,1
1,positive,1
2,neutral,0
3,positive,1


# 7. TF-IDF Vectorization

In [7]:
# Target vector: the integer-encoded labels.
y = df['label_encoded']

# Fit the vectorizer on the lemmatized text and encode that same text in one
# call. max_features=1000 limits the vocabulary size. Note that the fit uses
# every row of df, so split off any test data before this step if you need a
# held-out evaluation.
tfidf = TfidfVectorizer(max_features=1000)
X_tfidf = tfidf.fit_transform(df['lemmatized_text'])

# Dense, column-labelled copy of the TF-IDF matrix for inspection and export.
vocabulary_terms = tfidf.get_feature_names_out()
tfidf_df = pd.DataFrame(data=X_tfidf.toarray(), columns=vocabulary_terms)
tfidf_df.head()

Unnamed: 0,ai,amazing,datasets,deep,language,large,learning,love,machine,model,natural,processing,require,transforming,world
0,0.0,0.0,0.0,0.0,0.0,0.0,0.486934,0.617614,0.617614,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.0
2,0.0,0.0,0.421765,0.421765,0.0,0.421765,0.332524,0.0,0.0,0.421765,0.0,0.0,0.421765,0.0,0.0
3,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735


In [8]:
# Output file -> object to write, in write order: the processed dataset,
# the TF-IDF feature table, and the encoded labels.
outputs = {
    "processed_text_data.csv": df,
    "tfidf_features.csv": tfidf_df,
    "encoded_labels.csv": y,
}

# Write each object as CSV without the index column.
for filename, table in outputs.items():
    table.to_csv(filename, index=False)

print("Files saved successfully!")

Files saved successfully!
