# Loading Dataset

In [22]:
import pandas as pd
# Read each review file into a list holding one raw line per entry
# (readlines keeps the trailing newline on every line).
with open('positive-reviews.txt', 'r') as positive_file:
    positive_reviews = positive_file.readlines()

with open('negative-reviews.txt', 'r') as negative_file:
    negative_reviews = negative_file.readlines()


# Data Preprocessing

In [23]:
from sklearn.model_selection import train_test_split

# Use the first 80% of each class for training and hold out the rest for testing.
# Both classes must be split here: the DataFrame construction that follows reads
# negative_train / negative_test as well as the positive halves, and would raise
# NameError on a fresh kernel if only the positive reviews were split.
split_index = int(len(positive_reviews) * 0.8)
positive_train = positive_reviews[:split_index]
positive_test = positive_reviews[split_index:]

# The negative class gets its own cut point so each class keeps an 80/20 ratio
# even when the two files contain a different number of reviews.
negative_split_index = int(len(negative_reviews) * 0.8)
negative_train = negative_reviews[:negative_split_index]
negative_test = negative_reviews[negative_split_index:]


In [24]:
# Assemble labelled frames: positive reviews come first with label 1,
# followed by negative reviews with label 0.
train_data = pd.DataFrame(
    {
        'review': [*positive_train, *negative_train],
        'label': [1 for _ in positive_train] + [0 for _ in negative_train],
    }
)

test_data = pd.DataFrame(
    {
        'review': [*positive_test, *negative_test],
        'label': [1 for _ in positive_test] + [0 for _ in negative_test],
    }
)


# Feature Extraction

In [25]:
# Load the sentiment lexicons as sets (one entry per file line) so that
# membership checks during feature extraction are constant-time.
with open('positive-words.txt', 'r', encoding='latin-1') as positive_lexicon_file:
    positive_words = {line for line in positive_lexicon_file.read().splitlines()}

with open('negative-words.txt', 'r', encoding='latin-1') as negative_lexicon_file:
    negative_words = {line for line in negative_lexicon_file.read().splitlines()}

In [26]:
import numpy as np
import math
import re
import nltk
# Fetch the NLTK stopword corpus (a no-op when it is already present locally).
nltk.download('stopwords')

from nltk.corpus import stopwords

# English stopwords held in a set for fast membership tests.
stopword_set = {stopword for stopword in stopwords.words('english')}

def extract_features(reviews):

    features = []
    for review in reviews:

      tokens = review.split()
      positive_count = sum(1 for word in tokens if word in positive_words)
      negative_count = sum(1 for word in tokens if word in negative_words)
      contains_no = int('no' in tokens)
      pronoun_count = sum(1 for word in tokens if word.lower() in ['I', 'me', 'my', 'you', 'your'])
      contains_exclamation = int('!' in review)
      log_length = math.log(len(tokens) + 1)

      #Aditional feature
      uppercase_count = sum(1 for word in tokens if word.isupper())
      stopword_count = sum(1 for word in tokens if word.lower())

      # Combine all features into a list
      features_vector = [positive_count, negative_count, contains_no, pronoun_count, contains_exclamation, log_length, uppercase_count, stopword_count]

      # Feature append to list
      features.append(features_vector)
    return features

# Build the feature matrices for the training and test splits from the review column.
train_features, test_features = (
    extract_features(frame['review']) for frame in (train_data, test_data)
)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
# Pull the label column out of each frame.
train_labels, test_labels = train_data['label'], test_data['label']


In [28]:
# Display the first rows of the training frame to sanity-check the review/label columns.
train_data.head()

Unnamed: 0,review,label
0,"Easy to use, economical!\n",1
1,Digital is where it's at...down with developin...,1
2,"Good image quality, 3x optical zoom, macro mod...",1
3,Awesome features/easy to use/fun/versatile/low...,1
4,"intuitive, user friendly\n",1


# Model Training

In [29]:
# Convert the feature lists and label columns into numpy arrays for scikit-learn.
X_train, X_test = np.array(train_features), np.array(test_features)
y_train, y_test = np.array(train_data['label']), np.array(test_data['label'])

In [31]:
# Model 1 : Feed-Forward Neural Network
from sklearn.neural_network import MLPClassifier

# Three hidden layers of 12 units each; random_state is fixed so training is repeatable.
model1 = MLPClassifier(
    hidden_layer_sizes=(12, 12, 12),
    alpha=0.1,
    max_iter=1000,
    random_state=42,
)
model1.fit(X_train, y_train)

In [32]:
# Model 2 : Logistic Regression

from sklearn.linear_model import LogisticRegression

# Logistic regression with the library's default settings, fitted on the
# training feature matrix and labels.
model2 = LogisticRegression()
model2.fit(X_train, y_train)



In [33]:
# Model 3 : Support Vector Machine

from sklearn.svm import SVC

# Support vector classifier with the library's default settings, fitted on
# the same training feature matrix and labels as the other two models.
model3 = SVC()
model3.fit(X_train, y_train)


# Evaluate the models


In [19]:
# Score every fitted model on the held-out test set.

from sklearn.metrics import accuracy_score

# Predictions from each model on the same test features.
y_pred1, y_pred2, y_pred3 = (
    fitted_model.predict(X_test) for fitted_model in (model1, model2, model3)
)

# Fraction of test reviews each model labelled correctly.
accuracy1, accuracy2, accuracy3 = (
    accuracy_score(y_test, predictions) for predictions in (y_pred1, y_pred2, y_pred3)
)

print("Accuracy of Model 1:", accuracy1)
print("Accuracy of Model 2:", accuracy2)
print("Accuracy of Model 3:", accuracy3)

Accuracy of Model 1: 0.710299727520436
Accuracy of Model 2: 0.7065940054495913
Accuracy of Model 3: 0.6953678474114442


In [40]:
# Predict Input Review

def predict_review_sentiment(review):
    """Classify one review string with model1 and return "Positive" or "Negative"."""
    # extract_features works on a collection of reviews, so wrap the single
    # review in a list and convert the result to a numpy array.
    feature_matrix = np.array(extract_features([review]))

    # model1 returns one prediction per row; label 1 means positive.
    predicted_label = model1.predict(feature_matrix)[0]
    return "Positive" if predicted_label == 1 else "Negative"

# Two hand-written reviews used as a quick smoke test of the classifier.
input_review = "This product look exactly the same as the advertisement. I love it"
input_review2 = "This product doesn't look like the advertisement. I hate it."

sentiment, sentiment2 = map(predict_review_sentiment, (input_review, input_review2))

print("The sentiment of the review is:", sentiment)
print("The sentiment of the review is:", sentiment2)


The sentiment of the review is: Positive
The sentiment of the review is: Negative
