In [None]:
## Perform text cleaning, lemmatization (any method), stop word removal (any method), and label encoding. Create representations using TF-IDF. Save the outputs.

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Fetch the NLTK resources this notebook relies on (lemmatizer data,
# stop word lists, tokenizer models), in the same order as before.
for resource in ('wordnet', 'omw-1.4', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Sample data (replace this with your own dataset).
# 'text' holds the raw documents; 'label' holds the class of each document.
data = dict(
    text=[
        "Cats are running in the garden.",
        "The dog is barking loudly!",
        "I love programming in Python.",
        "Python is an amazing language.",
        "Dogs and cats can be great pets.",
    ],
    label=['animal', 'animal', 'coding', 'coding', 'animal'],
)

# Build the working DataFrame: one row per document, one column per key
df = pd.DataFrame.from_dict(data)

# Text cleaning function
def clean_text(text):
    """Normalize one raw text value before tokenization.

    Every character that is not an ASCII letter or whitespace is deleted
    (this includes digits and punctuation), the result is lower-cased, and
    leading/trailing whitespace is stripped.

    Missing values (``None``, NaN, ``pd.NA``) are returned as an empty string
    instead of raising ``TypeError`` inside ``re.sub``; any other non-string
    value is converted with ``str()`` before cleaning.

    Args:
        text: The raw value to clean, normally a ``str``.

    Returns:
        str: The cleaned, lower-cased text (``''`` for missing values).
    """
    if not isinstance(text, str):
        # is_scalar guards pd.isna, which returns an array for list-like input
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ''
        text = str(text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only ASCII letters and whitespace (digits are removed too)
    text = text.lower()  # Convert to lowercase
    text = text.strip()  # Remove leading/trailing whitespace
    return text

# Clean every raw document and keep the result in its own column
df['cleaned_text'] = df['text'].map(clean_text)

# Shared helpers for stop word removal and lemmatization
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Tokenize ``text``, drop stop words, and lemmatize the remaining tokens.

    Tokens come from ``nltk.word_tokenize``. A token is discarded when it is
    in the module-level ``stop_words`` set (checked before lemmatization);
    every kept token is passed through the module-level ``lemmatizer`` with
    no part-of-speech argument.

    Args:
        text: The text to process.

    Returns:
        str: The kept lemmas joined by single spaces.
    """
    kept_lemmas = []
    for token in nltk.word_tokenize(text):
        if token in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_lemmas)

# Run stop word removal and lemmatization over the cleaned text
df['processed_text'] = df['cleaned_text'].map(preprocess_text)

# Encode the string labels as integers
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit(df['label']).transform(df['label'])

# Fit TF-IDF on the processed text and build the document-term matrix
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['processed_text'])

# Dense view of the matrix: one row per document, one column per term
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf.get_feature_names_out(),
)

# Write both tables to CSV without the index column
df.to_csv('processed_texts.csv', index=False)
tfidf_df.to_csv('tfidf_features.csv', index=False)

# Show the results
print("Processed DataFrame:", df, sep="\n")
print("\nTF-IDF Features:", tfidf_df, sep="\n")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Processed DataFrame:
                               text   label                     cleaned_text  \
0   Cats are running in the garden.  animal   cats are running in the garden   
1        The dog is barking loudly!  animal        the dog is barking loudly   
2     I love programming in Python.  coding     i love programming in python   
3    Python is an amazing language.  coding    python is an amazing language   
4  Dogs and cats can be great pets.  animal  dogs and cats can be great pets   

            processed_text  label_encoded  
0       cat running garden              0  
1       dog barking loudly              0  
2  love programming python              1  
3  python amazing language              1  
4        dog cat great pet              0  

TF-IDF Features:
    amazing   barking       cat       dog    garden     great  language  \
0  0.000000  0.000000  0.495524  0.000000  0.614189  0.000000  0.000000   
1  0.000000  0.614189  0.000000  0.495524  0.000000  0.000000  0.0