In [12]:
# Toy corpus: two short movie reviews, paired by position with their sentiment.
texts = ["I loved the movie!", "The movie was terrible"]
labels = ["positive", "negative"]

Cleaning + Lemmatization

In [13]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
# Download the three NLTK packages this notebook requests, in a fixed order.
for resource in ("punkt_tab", "stopwords", "wordnet"):
    nltk.download(resource)

# Shared helpers read by clean_text: a lemmatizer instance and the English
# stop-word list held as a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Normalize a raw string into a space-joined sequence of lemmas.

    The text is lowercased, every character other than ``a``-``z`` or
    whitespace is replaced by a space, the result is split with
    ``word_tokenize``, tokens present in the module-level ``stop_words`` set
    are dropped, and the remaining tokens are passed through the module-level
    ``lemmatizer``.

    Args:
        text: The raw input string.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when
        no token survives).
    """
    text = text.lower()
    # Substitute a space instead of deleting the character: deletion fuses
    # words that are separated only by punctuation or digits
    # ("good,bad" -> "goodbad"), and the fused token can no longer match a
    # stop word or a lemma.
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(tokens)

# Run every sample through clean_text and show the normalized corpus.
cleaned_texts = list(map(clean_text, texts))
print(cleaned_texts)

['loved movie', 'movie terrible']


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Label Encoding

In [14]:
from sklearn.preprocessing import LabelEncoder

# Learn the label classes first, then map each string label to its integer id.
le = LabelEncoder()
encoded_labels = le.fit(labels).transform(labels)
print(encoded_labels)

[1 0]


TF-IDF Representation

In [15]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
# Fit the vectorizer on the cleaned corpus, then project that same corpus
# onto the fitted vocabulary.
tfidf = TfidfVectorizer()
x = tfidf.fit(cleaned_texts).transform(cleaned_texts)

# Show the vocabulary followed by the dense document-term matrix.
print(tfidf.get_feature_names_out(), x.toarray(), sep="\n")

['loved' 'movie' 'terrible']
[[0.81480247 0.57973867 0.        ]
 [0.         0.57973867 0.81480247]]


Save Outputs

In [17]:
import pandas as pd

# Assemble the dense TF-IDF matrix into a frame with one column per
# vocabulary term.
df = pd.DataFrame(x.toarray(), columns=tfidf.get_feature_names_out())

# The target is stored under "label". If the corpus itself produced a
# "label" term, the assignment below would silently overwrite that feature
# column, so fail loudly instead of writing a corrupted file.
assert "label" not in df.columns, (
    'vocabulary contains the term "label"; it would be overwritten by the target column'
)
df["label"] = encoded_labels

# Persist features and target together, without the row index.
df.to_csv("processed_text_data.csv", index=False)