In [1]:
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score


In [2]:
# NLTK resources fetched for this notebook. Per the captured output, a package
# that is already installed is only reported as up to date.
NLTK_PACKAGES = (
    'punkt',            # tokenizer models for word_tokenize (older NLTK releases)
    'punkt_tab',        # tokenizer tables that newer NLTK releases load for word_tokenize
    'movie_reviews',    # labelled review corpus used for training
    'twitter_samples',  # not referenced in the cells shown here; kept from the original list
    'stopwords',        # English stop-word list
    'wordnet',          # data used by WordNetLemmatizer
)

for package in NLTK_PACKAGES:
    nltk.download(package)

[nltk_data] Downloading package punkt to C:\Users\Roshan
[nltk_data]     kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package movie_reviews to C:\Users\Roshan
[nltk_data]     kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package twitter_samples to C:\Users\Roshan
[nltk_data]     kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Roshan
[nltk_data]     kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Roshan
[nltk_data]     kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load the movie-review corpus as (token_list, category) pairs, one per file.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle with a private, seeded generator: the document order (and therefore
# any later seeded split of `documents`) is then identical on every run, and
# the global `random` state is left untouched.
random.Random(42).shuffle(documents)


In [4]:
# Preprocessing resources, built once at module level and reused per review
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    """Normalize raw text into a single space-separated string of lemmas.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not purely alphanumeric (``str.isalnum``) are discarded, and every
    remaining token is passed through the module-level ``lemmatizer``.
    Stop words are not removed.
    """
    lemmas = []
    for word in word_tokenize(text.lower()):
        if not word.isalnum():
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return " ".join(lemmas)


In [5]:
# Bag-of-words vocabulary. Note: nothing in this cell computes TF-IDF; it only
# counts how often each lower-cased token occurs across the whole corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# Keep the 3000 most frequent tokens. Slicing `all_words.keys()` would instead
# return whichever 3000 distinct tokens were encountered first, regardless of
# how common they are.
word_features = [word for word, _count in all_words.most_common(3000)]

def find_features(document):
    """Return a ``{word: bool}`` mapping over the module-level ``word_features``.

    Each value is True when that vocabulary word occurs in ``document``
    (an iterable of tokens) and False otherwise.
    """
    present = set(document)
    return {word: word in present for word in word_features}

# Pair every review's presence-feature dict with its category label
featuresets = [
    (find_features(review_tokens), label)
    for review_tokens, label in documents
]

In [6]:
# Model inputs: one cleaned text string per review, plus the matching label
X = [preprocess_text(' '.join(tokens)) for tokens, _label in documents]
y = [label for _tokens, label in documents]

# Hold out 20% of the reviews for evaluation, using a fixed split seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [7]:
# Linear-kernel SVM; only instantiated in this cell, not fitted
clf = SVC(kernel='linear')

# TF-IDF features capped at 3000 terms: the vectorizer is fitted on the
# training texts only and then applied unchanged to the test texts
vectorizer = TfidfVectorizer(max_features=3000)
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

In [8]:
# Fit the SVM on the TF-IDF training matrix and its labels
clf.fit(X=X_train_vectors, y=y_train)

In [9]:
# Predict a label for every held-out review
y_pred = clf.predict(X=X_test_vectors)

In [10]:
# Translate the string labels into readable names for the report: 'pos'
# becomes 'positive', and every other value becomes 'negative'
def _readable(labels):
    return ['positive' if label == 'pos' else 'negative' for label in labels]

y_test_sentiment = _readable(y_test)
y_pred_sentiment = _readable(y_pred)

accuracy = accuracy_score(y_test_sentiment, y_pred_sentiment)
print("Accuracy:", accuracy)
print("Classification Report:")
report = classification_report(y_test_sentiment, y_pred_sentiment)
print(report)

Accuracy: 0.8475
Classification Report:
              precision    recall  f1-score   support

    negative       0.85      0.85      0.85       206
    positive       0.84      0.85      0.84       194

    accuracy                           0.85       400
   macro avg       0.85      0.85      0.85       400
weighted avg       0.85      0.85      0.85       400



In [13]:
import pickle

# Persist the classifier and the vectorizer, each to its own pickle file
for artifact, path in (
    (clf, 'movie_review_classifier.pkl'),
    (vectorizer, 'movie_review_vectorizer.pkl'),
):
    with open(path, 'wb') as f:
        pickle.dump(artifact, f)


In [14]:

def _load_pickle(path):
    """Read and return the single pickled object stored at ``path``."""
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # that this notebook itself produced.
    with open(path, 'rb') as f:
        return pickle.load(f)


# Restore the classifier and vectorizer from disk
clf = _load_pickle('movie_review_classifier.pkl')
vectorizer = _load_pickle('movie_review_vectorizer.pkl')

In [15]:
def model(text):
    """Predict the sentiment label for a single piece of review text.

    The function previously declared no parameters while reading ``text``,
    so every call failed; ``text`` is now an explicit argument, matching the
    ``model(text)`` call in ``main``.

    Args:
        text: Raw review text.

    Returns:
        The first (only) element returned by ``clf.predict`` for the
        vectorized text.
    """
    cleaned = preprocess_text(text)
    # transform() takes a collection of documents, hence the one-item list
    text_vector = vectorizer.transform([cleaned])
    return clf.predict(text_vector)[0]



In [16]:
import streamlit as st

In [19]:
def main():
    """Render the Streamlit page and show the predicted sentiment on submit.

    The page has a title, a styled HTML banner, a text box and a Submit
    button. On submit the text is passed to ``model``: a 'pos' result is shown
    with ``st.success`` and any other result with ``st.error``. A blank
    submission produces a warning and is not sent to the classifier.
    """
    st.title("Sentiment Classifier")
    html = """
  <div style="background-color:white; padding:10px;">
  <h1 style="color:Black; text-align:center;">Sentiment Classifier</h1>
  </div>
  """
    # The HTML above is a fixed literal (no user input is interpolated)
    st.markdown(html, unsafe_allow_html=True)
    text = st.text_input("Write a review...")
    if st.button("Submit"):
        if not text.strip():
            st.warning("Please enter a review first.")
            return
        sentiment = model(text)
        message = "{} Sentiment".format(sentiment)
        if sentiment == "pos":
            st.success(message)
        else:
            # Previously this branch also used st.success, so a negative
            # result looked identical to a positive one
            st.error(message)
# Launch the app only when this file is executed directly
if __name__ == "__main__":
    main()
