### Feature Engineering Best Practices: Handling Text Data
**Question**: Load a dataset with text data (e.g., SMS Spam Collection), perform text
preprocessing, and extract numerical features using TF-IDF.

In [1]:
import csv
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Make sure the NLTK data this script relies on is installed, downloading
# whatever is missing. The stop-word corpus and the tokenizer models are
# separate packages, so each one is probed on its own: having one installed
# says nothing about the other. (PorterStemmer is rule-based and needs no
# downloaded data, so it is not probed.)
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

try:
    # Exercise the tokenizer itself so the probe asks for exactly the
    # resource the installed NLTK version loads.
    nltk.word_tokenize('resource check')
except LookupError:
    # Older NLTK releases load 'punkt'; newer ones load 'punkt_tab'.
    nltk.download('punkt')
    nltk.download('punkt_tab')

# --- Step 1: Load the SMS Spam Collection Dataset ---
# The file is read as tab-separated text with no header row; the two columns
# are named 'label' and 'text' here.
try:
    df = pd.read_csv(
        'SMSSpamCollection',
        sep='\t',
        header=None,
        names=['label', 'text'],
        # The file is not CSV-quoted, so a double quote inside a message is
        # ordinary text. With the default quoting the parser would treat it
        # as the start of a quoted field and could merge several messages
        # into a single row.
        quoting=csv.QUOTE_NONE,
    )
except FileNotFoundError:
    # Nothing below can run without the data. SystemExit with a message
    # writes it to stderr and ends the script with a non-zero status, and
    # unlike exit() it does not depend on the interactive `site` helpers.
    raise SystemExit(
        "Error: SMSSpamCollection not found. Please make sure the file is in the correct directory."
    ) from None
else:
    print("SMS Spam Collection dataset loaded successfully.\n")
    print("First 5 rows of the dataset:")
    print(df.head())

# --- Step 2: Text Preprocessing ---
print("\n--- Text Preprocessing ---")

# Module-level helpers used by preprocess_text: one stemmer instance reused
# for every message, and the English stop words held in a set so each
# membership test is a hash lookup.
stemmer = PorterStemmer()
english_stop_word_list = stopwords.words('english')
stop_words = set(english_stop_word_list)

def preprocess_text(text):
    """Normalize one message into a space-joined string of stemmed tokens.

    The message is lowercased, every character that is not an ASCII letter
    or whitespace is removed, the remainder is tokenized with
    ``nltk.word_tokenize``, tokens found in the module-level ``stop_words``
    set are discarded, and the surviving tokens are stemmed with the
    module-level ``stemmer``.

    Args:
        text: The raw message. A non-string value (for example the NaN that
            pandas stores for a missing cell) is treated as an empty message.

    Returns:
        The processed tokens joined by single spaces; ``""`` when nothing
        survives the filtering or when ``text`` is not a string.
    """
    # Guard first: calling .lower() on NaN/None would raise AttributeError
    # and abort the whole column-wide apply.
    if not isinstance(text, str):
        return ""

    # Lowercase, then strip digits, punctuation and other special characters
    # in a single pass.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    tokens = nltk.word_tokenize(letters_only)

    # Filter before stemming: the stop-word list holds unstemmed words.
    stems = [stemmer.stem(token) for token in tokens if token not in stop_words]
    return " ".join(stems)

# Store the cleaned version of every message next to the raw text, then show
# a few rows side by side as a sanity check.
df['processed_text'] = df['text'].map(preprocess_text)
print("\nFirst 5 rows with processed text:")
text_columns = ['text', 'processed_text']
print(df[text_columns].head())

# --- Step 3: Feature Extraction using TF-IDF ---
print("\n--- Feature Extraction using TF-IDF ---")

# Hold out 20% of the messages for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Fit the vectorizer on the training messages only, then reuse that *same*
# fitted vectorizer to transform the test messages.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Report the size of both feature matrices and show one encoded message.
print("\nShape of TF-IDF features for training data:", X_train_tfidf.shape)
print("Shape of TF-IDF features for testing data:", X_test_tfidf.shape)
print("\nTF-IDF features (sparse matrix - first sample):")
print(X_train_tfidf[0])

# The feature names are the vocabulary terms, one per matrix column.
# Print feature_names[:20] to inspect the first few of them.
feature_names = tfidf_vectorizer.get_feature_names_out()
print("\nNumber of unique features extracted:", len(feature_names))

print("\nText data preprocessed and numerical features extracted using TF-IDF.")

ModuleNotFoundError: No module named 'nltk'