In [1]:
import pandas as pd

# Load the USA housing dataset from the local datasets folder into a DataFrame.
df = pd.read_csv('./datasets/USA_Housing.csv')

In [2]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(5000, 7)

In [3]:
# Persist a copy of the frame. The frame was read without an index column, so
# its index is just the auto-generated row counter; index=False keeps that
# counter out of the file (otherwise re-reading it yields an extra
# "Unnamed: 0" column).
df.to_csv('./datasets/Newfile.csv', index=False)

In [4]:
# pip install nltk scikit-learn

import nltk
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import download


In [5]:
# Sample text: two short sentences followed by stray symbols ("!", "#", "$")
# so the punctuation-stripping step has something to remove.
text = """
The quick brown fox jumps over the lazy dog. The dog barked loudly at the fox ! word# to $.
"""

# 1. Text Normalization
def normalize_text(text):
    """Return ``text`` lowercased with all ASCII punctuation characters removed.

    Punctuation is deleted rather than replaced, so surrounding whitespace is
    left exactly as it was.
    """
    # Map every character in string.punctuation to None so translate() drops it.
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    return text.lower().translate(drop_punctuation)

# 2. Tokenization
def tokenize_text(text):
    """Split ``text`` into word-level tokens using NLTK's ``word_tokenize``."""
    return word_tokenize(text)

# 3. Stop Words Removal
def remove_stop_words(words):
    """Return the tokens from ``words`` that are not English stop words.

    Membership is an exact, case-sensitive match against NLTK's English
    stop-word list; the relative order of the kept tokens is preserved.
    """
    english_stop_words = frozenset(stopwords.words('english'))
    kept = []
    for token in words:
        if token in english_stop_words:
            continue
        kept.append(token)
    return kept

# 4. Stemming
def stem_words(words):
    """Reduce every token in ``words`` to its Porter stem, preserving order."""
    porter = PorterStemmer()
    return list(map(porter.stem, words))

# 5. Lemmatization
def lemmatize_words(words):
    """Lemmatize every token in ``words`` with WordNet, preserving order.

    Each token is passed to the lemmatizer without a part-of-speech argument.
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in words:
        lemmas.append(wordnet.lemmatize(token))
    return lemmas

In [6]:
# 6. Vectorization
def vectorize_text(text):
    """Fit a TF-IDF vectorizer and return the weights plus the vocabulary.

    Args:
        text: Either a single document (``str``) or an iterable of document
            strings. A lone string is treated as a one-document corpus.

    Returns:
        A ``(vectors, feature_names)`` tuple: the matrix produced by
        ``TfidfVectorizer.fit_transform`` (one row per document) and the
        array returned by ``get_feature_names_out()``.

    Note:
        With a single document every term gets the same IDF, so the weights
        reduce to L2-normalised term counts. Pass several documents to get
        meaningful inverse-document-frequency weighting.
    """
    # fit_transform expects an iterable of documents, not a raw string, so a
    # bare str is wrapped into a one-element corpus.
    documents = [text] if isinstance(text, str) else list(text)
    vectorizer = TfidfVectorizer()  # Using TF-IDF Vectorizer
    vectors = vectorizer.fit_transform(documents)
    return vectors, vectorizer.get_feature_names_out()

In [7]:
# Pre-processing pipeline: normalize -> tokenize -> drop stop words -> stem.
text = normalize_text(text)
# Swap stem_words for lemmatize_words here if lemmas are preferred over stems.
words = stem_words(remove_stop_words(tokenize_text(text)))


In [8]:
# Vectorization: re-join the processed tokens into one space-separated string
# so the whole text is vectorized as a single document.
vectors, feature_names = vectorize_text(' '.join(words))

In [9]:
# Printing the sparse result lists each stored entry as (row, column)
# followed by its TF-IDF weight.
print(vectors)

  (0, 7)	0.2581988897471611
  (0, 1)	0.2581988897471611
  (0, 3)	0.5163977794943222
  (0, 4)	0.2581988897471611
  (0, 5)	0.2581988897471611
  (0, 2)	0.5163977794943222
  (0, 0)	0.2581988897471611
  (0, 6)	0.2581988897471611
  (0, 8)	0.2581988897471611


In [10]:
# Vocabulary learned by the vectorizer (the value of get_feature_names_out()).
print(feature_names)

['bark' 'brown' 'dog' 'fox' 'jump' 'lazi' 'loudli' 'quick' 'word']


In [11]:
# Summary of the pipeline results.
# NOTE: `words` holds the output of the full pipeline, so the list printed
# under "Tokenized Words" is the stop-word-filtered, stemmed tokens rather
# than the raw tokenizer output.
print("Normalized Text:", text)
print("Tokenized Words:", words)
print("TF-IDF Matrix:\n", vectors.toarray())
print("Feature Names:", feature_names)

Normalized Text: 
the quick brown fox jumps over the lazy dog the dog barked loudly at the fox  word to 

Tokenized Words: ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog', 'dog', 'bark', 'loudli', 'fox', 'word']
TF-IDF Matrix:
 [[0.25819889 0.25819889 0.51639778 0.51639778 0.25819889 0.25819889
  0.25819889 0.25819889 0.25819889]]
Feature Names: ['bark' 'brown' 'dog' 'fox' 'jump' 'lazi' 'loudli' 'quick' 'word']
