In [1]:
pip install nltk scikit-learn pandas

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\attar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\attar/nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\attar/nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [3]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Sample dataset
data = {
    'text': [
        "Dogs are running in the park!!!",
        "He studies at the university, and he's doing well.",
        "What's the best way to learn Natural Language Processing?",
        "The quick brown fox jumps over the lazy dog."
    ],
    'label': ['animal', 'student', 'education', 'animal']
}

df = pd.DataFrame(data)

# 1. TEXT CLEANING
def clean_text(text):
    text = text.lower()                      # Lowercase
    text = re.sub(r'[^a-z\s]', '', text)     # Remove punctuation/numbers
    return text

df['cleaned'] = df['text'].apply(clean_text)

# 2. TOKENIZATION & STOP WORD REMOVAL
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return " ".join(tokens)

df['processed'] = df['cleaned'].apply(preprocess)

# 3. LABEL ENCODING
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

# 4. TF-IDF VECTORIZATION
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['processed'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# 5. COMBINE AND SAVE
final_df = pd.concat([df[['text', 'processed', 'label', 'label_encoded']], tfidf_df], axis=1)
final_df.to_csv("processed_text_data.csv", index=False)

print("✅ Process completed. File saved as 'processed_text_data.csv'.")
print("\nSample Output:")
print(final_df.head())

✅ Process completed. File saved as 'processed_text_data.csv'.

Sample Output:
                                                text  \
0                    Dogs are running in the park!!!   
1  He studies at the university, and he's doing w...   
2  What's the best way to learn Natural Language ...   
3       The quick brown fox jumps over the lazy dog.   

                                          processed      label  label_encoded  \
0                                  dog running park     animal              0   
1                          study university he well    student              2   
2  whats best way learn natural language processing  education              1   
3                     quick brown fox jump lazy dog     animal              0   

       best     brown       dog       fox   he      jump  ...   natural  \
0  0.000000  0.000000  0.486934  0.000000  0.0  0.000000  ...  0.000000   
1  0.000000  0.000000  0.000000  0.000000  0.5  0.000000  ...  0.000000   
2  0.37796