In [6]:
import pandas as pd

df = pd.read_csv("data/sample_reviews.csv")
df.head()


Unnamed: 0,label,title,content
0,positive,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,positive,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,positive,Amazing!,This soundtrack is my favorite music of all ti...
3,positive,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,positive,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."


In [7]:
import re

def clean_text(text):
    text = re.sub(r"http\S+", "", text)                    # remove links
    text = re.sub(r"[^a-zA-Z\s]", "", text)                # remove punctuation & special chars
    text = re.sub(r"\s+", " ", text).strip().lower()       # normalize spaces & lowercase
    return text

df["clean_text"] = df["content"].apply(clean_text)
df[["content", "clean_text"]].head()


Unnamed: 0,content,clean_text
0,This sound track was beautiful! It paints the ...,this sound track was beautiful it paints the s...
1,I'm reading a lot of reviews saying that this ...,im reading a lot of reviews saying that this i...
2,This soundtrack is my favorite music of all ti...,this soundtrack is my favorite music of all ti...
3,I truly like this soundtrack and I enjoy video...,i truly like this soundtrack and i enjoy video...
4,"If you've played the game, you know how divine...",if youve played the game you know how divine t...


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X = df["clean_text"]
y = df["label"]

vectorizer = TfidfVectorizer()
X_vec = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.78      0.79      0.78        48
    positive       0.80      0.79      0.80        52

    accuracy                           0.79       100
   macro avg       0.79      0.79      0.79       100
weighted avg       0.79      0.79      0.79       100

