# Sentiment predictions using TF-IDF

We will first import the required packages, download the IMDB dataset, compute TF-IDF features, and finally train a logistic regression classifier on them.

In [None]:
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

In [3]:
# Fetch the IMDB dataset
imdb_dataset = load_dataset("imdb")

# From the training split, pull out the review texts and their labels
train_reviews, train_labels = (
    imdb_dataset['train'][column] for column in ('text', 'label')
)

In [4]:
# Configure the TF-IDF vectorizer: English stop words are removed and the
# vocabulary is capped at 5000 features
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)

# Learn the vocabulary and IDF weights from the training reviews, then encode
# those same reviews as TF-IDF vectors
X_train_tfidf = tfidf_vectorizer.fit_transform(train_reviews)

In [None]:
# Convert a small preview of the sparse matrix to a DataFrame (Optional)
# Only the first 5 rows are densified: calling .toarray() on the whole matrix
# would allocate a dense (n_reviews x n_features) array just to show its head.
# Note that tfidf_df therefore holds the preview rows only.
tfidf_df = pd.DataFrame(
    X_train_tfidf[:5].toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

# Display the first few rows of the TF-IDF matrix (rich notebook display)
tfidf_df

Here we can see that each word is represented by a TF-IDF weight (a real number between 0 and 1, because each row is L2-normalized by default) and not by a raw frequency count, as it was in BoW.

In [None]:
# Train a classifier using TF-IDF feature vectors, without leaking validation
# data into the features.
# Split the raw reviews first. With the same random_state and the same number
# of samples, this yields the same train/validation partition as splitting the
# already-vectorized matrix would.
reviews_train, reviews_val, y_train, y_val = train_test_split(
    train_reviews, train_labels, test_size=0.2, random_state=42
)

# Fit a fresh vectorizer (same settings as tfidf_vectorizer) on the training
# reviews only, so the vocabulary and IDF weights never see validation text.
# The already-fitted tfidf_vectorizer is left untouched.
split_vectorizer = TfidfVectorizer(**tfidf_vectorizer.get_params())
X_train = split_vectorizer.fit_transform(reviews_train)
X_val = split_vectorizer.transform(reviews_val)

# Initialize logistic regression model
clf = LogisticRegression(max_iter=1000)

# Train the model
clf.fit(X_train, y_train)

In [None]:
# Score the trained classifier on the validation split
y_pred = clf.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")


As we can see, the model achieves a validation accuracy of about 0.88, which is quite good considering the simplicity of the model. This is a slight improvement over the BoW approach.