<a href="https://colab.research.google.com/github/priyanka-ingale/unstructured-intelligence/blob/main/reviews_tf_idf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TF-IDF vector representation of reviews

In [None]:
import pandas as pd

In [None]:
# Load the dataset.
# header=None makes pandas read the first line of the file as data; the two
# columns are named explicitly instead.
df = pd.read_csv('IA1_1.csv', header=None, names=['id', 'review'])
# Missing reviews are replaced with an empty string *before* the cast to str:
# a bare astype(str) would turn NaN into the literal text "nan", which would
# then be tokenized and counted as a real word downstream. The rows are kept
# (not dropped) so that positions in `reviews` still line up with rows of `df`.
reviews = df['review'].fillna('').astype(str).tolist()

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Fetch the NLTK data used by the preprocessing cells below.
# 'punkt_tab' is the tokenizer data that newer NLTK releases load inside
# word_tokenize; 'punkt' is kept for older releases. The tagger cell further
# down already requests the newer-style '*_eng' resource, so the tokenizer data
# must be available under its newer name as well for a fresh-kernel run.
# NOTE(review): on an NLTK release that does not know 'punkt_tab' the call is
# expected to just report the unknown package and return False - confirm.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Shared NLP helpers for the preprocessing loop.
# English stop-word list, held as a set for fast membership tests.
stop_words = {*stopwords.words('english')}
# WordNet-based lemmatizer, reused for every token.
lemmatizer = WordNetLemmatizer()

In [None]:
# --- Steps 1 to 3: Tokenization, Lemmatization, and Cleaning ---
# Both lists are rebuilt from scratch here, so re-running the cell is safe.
processed_reviews = []        # one space-joined string of cleaned lemmas per review
tokenized_reviews_step1 = []  # raw Step 1 tokens per review, kept for Step 5

for review in reviews:
    # Step 1: Tokenize the lower-cased review text
    tokens = nltk.word_tokenize(review.lower())
    tokenized_reviews_step1.append(tokens)

    # Step 2: Lemmatize (lemmatize() is called without a POS argument)
    lemmatized = [lemmatizer.lemmatize(token) for token in tokens]

    # Step 3: Remove stop-words and non-alphabetic tokens (replaces punctuation check).
    # The stop-word test looks at the original token as well as its lemma:
    # the lemmatizer rewrites some stop-words into forms that are no longer on
    # the stop-word list (e.g. "was" -> "wa", "has" -> "ha"), and checking only
    # the lemma would let those fragments through into the TF-IDF vocabulary.
    cleaned = [
        lemma
        for token, lemma in zip(tokens, lemmatized)
        if token not in stop_words
        and lemma not in stop_words
        and lemma.isalpha()
    ]

    processed_reviews.append(" ".join(cleaned))

In [None]:
# --- Step 4: TF-IDF over the cleaned reviews ---
# Vocabulary: unigrams and bigrams that occur in at least 3 reviews.
vectorizer4 = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=3,
)
# Learn the vocabulary and build the document-term matrix in one pass.
tfidf_matrix4 = vectorizer4.fit_transform(processed_reviews)

In [None]:
# Densify the Step 4 matrix into a DataFrame (one column per vocabulary term)
# and write it to disk without the row index.
df_step4 = pd.DataFrame(
    data=tfidf_matrix4.toarray(),
    columns=vectorizer4.get_feature_names_out(),
)
df_step4.to_csv('tfidf_vectors_step4.csv', index=False)

In [None]:
# Download the POS-tagger model used by nltk.pos_tag in Step 5.
# Two resource names are requested.
# NOTE(review): presumably the '_eng' name is the one newer NLTK releases look
# up and the plain name covers older releases - confirm against the installed
# NLTK version.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [None]:
# --- Step 5: POS-tag TF-IDF Vectors (Min DF=4, no normalization/stop-word removal) ---
# Input is the raw Step 1 token list of each review (no lemmatization, no
# stop-word filtering). Every (word, tag) pair from the tagger is rendered as
# "word_TAG", and the pairs of one review are joined with single spaces.
pos_tagged_reviews = [
    " ".join(f"{word}_{tag}" for word, tag in nltk.pos_tag(review_tokens))
    for review_tokens in tokenized_reviews_step1
]

In [None]:
# Vectorization without normalization or stop-word removal.
# Each review is already a space-separated sequence of word_TAG items, so the
# vectorizer must split on whitespace only and leave the items untouched:
#   * token_pattern=r"\S+" keeps every item whole. The default pattern only
#     matches runs of two or more word characters, which breaks items apart at
#     punctuation ("n't_RB" -> "t_rb", "well-made_JJ" -> "well" + "made_jj",
#     "my_PRP$" -> "my_prp") and silently drops punctuation items.
#   * lowercase=False keeps the tag part exactly as the tagger produced it.
# min_df=4 keeps only items that occur in at least 4 reviews; norm=None leaves
# the TF-IDF rows unscaled.
vectorizer5 = TfidfVectorizer(
    min_df=4,
    norm=None,
    stop_words=None,
    lowercase=False,
    token_pattern=r"\S+",
)
tfidf_matrix5 = vectorizer5.fit_transform(pos_tagged_reviews)

In [None]:
# Densify the Step 5 matrix into a DataFrame (one column per vocabulary item)
# and write it to disk without the row index.
df_step5 = pd.DataFrame(
    data=tfidf_matrix5.toarray(),
    columns=vectorizer5.get_feature_names_out(),
)
df_step5.to_csv('pos_tfidf_vectors_step5.csv', index=False)

In [None]:
# --- Reporting Dimensions ---
# The vector dimension is the number of columns (vocabulary size) of each matrix.
step4_dimension = tfidf_matrix4.shape[1]
step5_dimension = tfidf_matrix5.shape[1]
print(f"Step 4 Vector Dimension: {step4_dimension}")
print(f"Step 5 Vector Dimension: {step5_dimension}")

Step 4 Vector Dimension: 1322
Step 5 Vector Dimension: 936
