In [1]:
!pip install nltk scikit-learn pandas



In [2]:
import nltk
# NLTK data packages fetched by this notebook, downloaded in this order.
NLTK_RESOURCES = ("punkt", "stopwords", "wordnet", "omw-1.4", "punkt_tab")

# nltk.download returns a success flag. Keep one flag per resource so a failed
# download can be looked up by name instead of being hidden behind the result
# of the final call.
download_status = {resource: nltk.download(resource) for resource in NLTK_RESOURCES}

# Cell output: True only when every resource was fetched successfully.
all(download_status.values())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Toy corpus: each entry pairs one sentence with its topic label.
samples = [
    ("Natural Language Processing is amazing!", "tech"),
    ("I love studying NLP and Machine Learning.", "tech"),
    ("Text data needs cleaning before modeling.", "education"),
]

# Column-oriented view of the samples, in the layout pandas expects.
data = {
    "text": [sentence for sentence, _ in samples],
    "label": [topic for _, topic in samples],
}

df = pd.DataFrame(data)
df

Unnamed: 0,text,label
0,Natural Language Processing is amazing!,tech
1,I love studying NLP and Machine Learning.,tech
2,Text data needs cleaning before modeling.,education


In [5]:
# Shared helpers for the cleaning step, built once rather than per document.
lemmatizer = WordNetLemmatizer()

# A set (not the raw list) so the per-token membership test is a hash lookup.
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Normalize one raw document into a space-joined string of lemmas.

    Steps: lowercase the text, turn every character that is not ``a-z`` or
    whitespace into a space, tokenize with ``word_tokenize``, drop tokens
    found in ``stop_words``, and lemmatize the rest with ``lemmatizer``.

    Args:
        text: Raw document. Any value that is not a ``str`` (for example
            ``None`` or the NaN of a missing cell) is treated as an empty
            document instead of raising on ``.lower()``.

    Returns:
        str: The surviving lemmas joined by single spaces; ``""`` when no
        token survives or the input was not a string.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # Replace (rather than delete) non-letters so that words joined only by
    # punctuation stay separate tokens: "state-of-the-art" must not collapse
    # into the single token "stateoftheart".
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = word_tokenize(text)
    return " ".join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

# Run the cleaner element-wise over the raw text and store the result in a new
# column beside it; the trailing expression displays the updated frame.
df["clean_text"] = df["text"].map(clean_text)
df

Unnamed: 0,text,label,clean_text
0,Natural Language Processing is amazing!,tech,natural language processing amazing
1,I love studying NLP and Machine Learning.,tech,love studying nlp machine learning
2,Text data needs cleaning before modeling.,education,text data need cleaning modeling


In [6]:
# Fit the encoder on the label column and get the integer code of each row.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df["label"])

# Keep the codes next to the original string labels for easy inspection.
df["label_encoded"] = encoded_labels
df

Unnamed: 0,text,label,clean_text,label_encoded
0,Natural Language Processing is amazing!,tech,natural language processing amazing,1
1,I love studying NLP and Machine Learning.,tech,love studying nlp machine learning,1
2,Text data needs cleaning before modeling.,education,text data need cleaning modeling,0


In [7]:
# Learn the vocabulary and the TF-IDF weights from the cleaned text in one pass.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df["clean_text"])

# Convert the matrix to a dense array and label each column with its
# vocabulary term: one row per document, one column per term.
vocabulary_terms = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=vocabulary_terms)
tfidf_df

Unnamed: 0,amazing,cleaning,data,language,learning,love,machine,modeling,natural,need,nlp,processing,studying,text
0,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0.0
1,0.0,0.0,0.0,0.0,0.447214,0.447214,0.447214,0.0,0.0,0.0,0.447214,0.0,0.447214,0.0
2,0.0,0.447214,0.447214,0.0,0.0,0.0,0.0,0.447214,0.0,0.447214,0.0,0.0,0.0,0.447214


In [8]:
# Persist the frame as CSV; index=False leaves the row index out of the file.
df.to_csv(path_or_buf="cleaned_text_with_labels.csv", index=False)

In [9]:
# Persist the TF-IDF feature table as CSV; index=False leaves the row index
# out of the file.
tfidf_df.to_csv(path_or_buf="tfidf_features.csv", index=False)