In [1]:
pip install pandas scikit-learn nltk



In [6]:
import pandas as pd

# Toy sentiment corpus kept as (sentence, sentiment) pairs so every text
# sits right next to its label; positive and negative examples alternate.
labelled_sentences = [
    ("I love machine learning and artificial intelligence", "positive"),
    ("This product is extremely bad and disappointing", "negative"),
    ("Natural language processing is very interesting", "positive"),
    ("I hate spam messages and fake calls", "negative"),
    ("Deep learning models perform well on large data", "positive"),
    ("The service quality was terrible and slow", "negative"),
    ("AI is transforming healthcare and education", "positive"),
    ("This app crashes frequently and is useless", "negative"),
    ("Data science is an exciting career option", "positive"),
    ("The customer support experience was awful", "negative"),
    ("Machine learning improves decision making", "positive"),
    ("The software has too many bugs", "negative"),
    ("Artificial intelligence helps solve complex problems", "positive"),
    ("I am unhappy with the poor performance", "negative"),
    ("Technology advancements are beneficial to society", "positive"),
    ("The interface design is confusing and frustrating", "negative"),
]

# Unzip the pairs back into two parallel columns.
texts, labels = zip(*labelled_sentences)
data = pd.DataFrame({"text": list(texts), "label": list(labels)})

# Persist without the index so the file holds only the two data columns.
data.to_csv("dataset.csv", index=False)
print("✅ dataset.csv created successfully!")


✅ dataset.csv created successfully!


In [7]:
import pandas as pd
# Read the file back from disk and print a plain-text preview of its first five rows.
data = pd.read_csv(filepath_or_buffer="dataset.csv")
preview_rows = data.head(n=5)
print(preview_rows)

                                                text     label
0  I love machine learning and artificial intelli...  positive
1    This product is extremely bad and disappointing  negative
2    Natural language processing is very interesting  positive
3                I hate spam messages and fake calls  negative
4    Deep learning models perform well on large data  positive


In [2]:
# -------------------------------
# TEXT PREPROCESSING + TF-IDF
# -------------------------------

import pandas as pd
import re
import nltk
import pickle

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# -------------------------------
# Download NLTK resources
# -------------------------------
# Fetch each NLTK resource in order; a failure on one stops the remaining downloads.
nltk_resources = ["stopwords", "wordnet", "omw-1.4"]
download_results = [nltk.download(resource_id) for resource_id in nltk_resources]
# End on the last result so the cell still displays the final download's return value.
download_results[-1]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [8]:
# -------------------------------
# Load Dataset
# -------------------------------
# CSV must have columns: "text" and "label"
data = pd.read_csv("dataset.csv")

# Enforce the column contract right at load time, so a wrong file fails here
# with a clear message instead of as a KeyError in a later cell.
missing_columns = {"text", "label"} - set(data.columns)
if missing_columns:
    raise ValueError(
        f"dataset.csv is missing required column(s): {sorted(missing_columns)}"
    )

In [9]:
# -------------------------------
# Text Cleaning Function
# -------------------------------
# Shared helpers read by preprocess_text: one lemmatizer instance and the
# English stop-word list held as a set for membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words("english")}

def preprocess_text(text):
    """Normalize one raw document into a space-joined string of lemmatized tokens.

    Steps: lowercase the text, delete every character that is not ``a-z`` or
    whitespace, split on whitespace, drop tokens present in the module-level
    ``stop_words`` collection, and lemmatize the remaining tokens with the
    module-level ``lemmatizer``.

    Args:
        text: Raw document. Any non-string value (for example ``None`` or a
            float ``NaN`` coming from a missing cell) is treated as an empty
            document instead of raising ``AttributeError``.

    Returns:
        str: The cleaned tokens joined by single spaces; ``""`` when the input
        is not a string or no token survives cleaning.
    """
    # Only str has .lower(); bail out early so missing values do not crash
    # a column-wide apply.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # Note: rejected characters are deleted, not replaced by a space, so
    # "well-known" becomes the single token "wellknown".
    text = re.sub(r"[^a-z\s]", "", text)  # remove numbers & punctuation
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

In [10]:
# -------------------------------
# Apply Preprocessing
# -------------------------------
# Run the cleaner over every raw document, element by element.
raw_documents = data["text"]
data["clean_text"] = raw_documents.map(preprocess_text)

In [11]:
# -------------------------------
# Label Encoding
# -------------------------------
# Fit the encoder on the label column first, then use the fitted encoder
# to produce the encoded column.
label_encoder = LabelEncoder()
label_encoder.fit(data["label"])
data["label_encoded"] = label_encoder.transform(data["label"])

In [12]:
# -------------------------------
# TF-IDF Vectorization
# -------------------------------
# Fit the vectorizer (max_features=5000) on the cleaned text, then transform
# that same corpus into the TF-IDF matrix.
cleaned_corpus = data["clean_text"]
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit(cleaned_corpus).transform(cleaned_corpus)

In [13]:
# Write the full frame (including the columns added above) to disk,
# leaving the index out of the file.
data.to_csv(path_or_buf="cleaned_dataset.csv", index=False)

In [14]:
# Densify the TF-IDF matrix and label each column with its vocabulary term
# before exporting it as CSV (index omitted).
feature_names = tfidf.get_feature_names_out()
dense_weights = X_tfidf.toarray()
tfidf_df = pd.DataFrame(data=dense_weights, columns=feature_names)
tfidf_df.to_csv("tfidf_features.csv", index=False)

In [15]:
# Serialize the fitted label encoder to label_encoder.pkl.
with open("label_encoder.pkl", mode="wb") as encoder_file:
    pickle.dump(label_encoder, file=encoder_file)

In [16]:
# Serialize the fitted TF-IDF vectorizer to tfidf_vectorizer.pkl.
with open("tfidf_vectorizer.pkl", mode="wb") as vectorizer_file:
    pickle.dump(tfidf, file=vectorizer_file)

In [17]:
# Final status line confirming every step above ran.
completion_message = "✅ Text processing and TF-IDF feature extraction completed successfully!"
print(completion_message)

✅ Text processing and TF-IDF feature extraction completed successfully!
