In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer

# Load the dataset
df = pd.read_csv('reciepe_reviews.csv')

# Drop rows with NaN values in the 'text' column
df = df.dropna(subset=['text'])

# Alternatively, you can fill NaN values with an empty string
# df['text'] = df['text'].fillna('')

# Create a binary label: 1 for positive (4 or 5 stars), 0 for negative (1, 2, or 3 stars)
df['label'] = df['stars'].apply(lambda x: 1 if x >= 4 else 0)

# Select features and labels
X = df['text']  # Using the review text as the feature
y = df['label']  # Using the binary label

# Convert text data into numerical format using CountVectorizer
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(X)

# Step 2: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Step 3: Model Training
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)

# Step 4: Make Predictions
y_pred = model.predict(X_test)

# Step 5: Evaluate the Model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')

Accuracy: 0.871012101210121
Confusion Matrix:
[[ 159  371]
 [  98 3008]]


In [None]:
import numpy as np
import pandas as pd

# Load the dataset
df = pd.read_csv('reciepe_reviews.csv')

# Drop rows with NaN values in the 'text' column
df = df.dropna(subset=['text'])

# Create a binary label: 1 for positive (4 or 5 stars), 0 for negative (1, 2, or 3 stars)
df['label'] = df['stars'].apply(lambda x: 1 if x >= 4 else 0)

# Select features and labels
X = df['text']  # Using the review text as the feature
y = df['label']  # Using the binary label

# Convert text data into numerical format using CountVectorizer (manual implementation)
def text_to_vector(texts):
    from collections import Counter
    from sklearn.feature_extraction.text import CountVectorizer
    
    vectorizer = CountVectorizer()
    X_vectorized = vectorizer.fit_transform(texts).toarray()
    return X_vectorized, vectorizer.get_feature_names_out()

X_vectorized, feature_names = text_to_vector(X)

# Train-Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Logistic Regression Implementation
class LogisticRegression:
    def __init__(self, learning_rate=0.01, num_iterations=1000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        num_samples, num_features = X.shape
        self.weights = np.zeros(num_features)
        self.bias = 0

        for _ in range(self.num_iterations):
            linear_model = np.dot(X, self.weights) + self.bias
            y_predicted = self.sigmoid(linear_model)

            # Gradient Descent
            dw = (1 / num_samples) * np.dot(X.T, (y_predicted - y))
            db = (1 / num_samples) * np.sum(y_predicted - y)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = self.sigmoid(linear_model)
        y_predicted_class = [1 if i > 0.5 else 0 for i in y_predicted]
        return np.array(y_predicted_class)

# Train the model
model = LogisticRegression(learning_rate=0.01, num_iterations=1000)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluation Metrics
def accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / len(y_true)

def precision(y_true, y_pred):
    TP = np.sum((y_true == 1) & (y_pred == 1))
    FP = np.sum((y_true == 0) & (y_pred == 1))
    return TP / (TP + FP) if (TP + FP) > 0 else 0

def recall(y_true, y_pred):
    TP = np.sum((y_true == 1) & (y_pred == 1))
    FN = np.sum((y_true == 1) & (y_pred == 0))
    return TP / (TP + FN) if (TP + FN) > 0 else 0

def f1_score(y_true, y_pred):
    prec = precision(y_true, y_pred)
    rec = recall(y_true, y_pred)
    return 2 * (prec * rec) / (prec + rec) if (prec + rec) > 0 else 0

# Calculate metrics
acc = accuracy(y_test, y_pred)
prec = precision(y_test, y_pred)
rec = recall(y_test, y_pred)
f1 = f