In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the labelled reviews; later cells read the 'review' and 'sentiment' columns.
temp_df = pd.read_csv("Dataset.csv")

In [None]:
# Keep only the first 10,000 rows to save training time.
# .copy() gives df its own data, so the later in-place edits and .loc
# assignments do not raise SettingWithCopyWarning on a slice of temp_df.

df = temp_df.iloc[:10000].copy()

In [None]:
# Preview the first rows to confirm the review text and sentiment label loaded as expected.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Show one full raw review (row label 1) to see what needs cleaning.
df.loc[1, 'review']

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [None]:
df['sentiment'].value_counts()    # Classes are close to balanced, so no class-imbalance handling is needed

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,5028
negative,4972


In [None]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

Unnamed: 0,0
review,0
sentiment,0


In [None]:
# Count rows that are exact repeats of an earlier row.
df.duplicated().sum()

17

In [None]:
# Rebind to the de-duplicated result instead of using inplace=True, which
# emits SettingWithCopyWarning when df is backed by a slice of another frame.
# The original row labels are kept (no index reset).
df = df.drop_duplicates()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace=True)


In [None]:
# Re-check: no duplicated rows should remain after de-duplication.
df.duplicated().sum()

0

In [None]:
# Remove HTML tags (these usually come along when text is scraped from web pages)

import re
def remove_tags(raw_text):
    """Return raw_text with every '<...>' span deleted.

    The match is non-greedy, so each tag is removed on its own and the text
    between two tags is preserved. Nothing is inserted in place of a tag.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', raw_text)


In [None]:
# Strip HTML tags from every review, element by element.
df.loc[:, 'review'] = df['review'].map(remove_tags)


In [None]:
# Same sample review as before: the <br /> tags should now be gone.
df['review'][1]

'A wonderful little production. The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well done.'

In [None]:
# Lower-case every review with the vectorised string accessor; unlike a
# per-row lambda it also passes missing values through instead of raising.
df.loc[:, 'review'] = df['review'].str.lower()

In [None]:
# Same sample review again: the text should now be entirely lower-case.
df['review'][1]

'a wonderful little production. the filming technique is very unassuming- very old-time-bbc fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. the actors are extremely well chosen- michael sheen not only "has got all the polari" but he has all the voices down pat too! you can truly see the seamless editing guided by the references to williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. a masterful production about one of the great master\'s of comedy and his life. the realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. it plays on our knowledge and our senses, particularly with the scenes concerning orton and halliwell and the sets (particularly of their flat with halliwell\'s murals decorating every surface) are terribly well done.'

In [None]:
# Removing stopwords

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

# Load the stop-word list once. stopwords.words() builds the full list again on
# every call, so calling it per word made this step needlessly slow; a frozenset
# also makes each membership test constant-time.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def remove_stopwords(text):
    """Return text with English stop words removed.

    The text is split on whitespace and every token that exactly matches an
    entry of NLTK's English stop-word list is dropped; the remaining tokens
    are joined with single spaces, so no doubled spaces are left behind.

    Matching is exact and case-sensitive, so the input should already be
    lower-cased, and a stop word with punctuation attached (e.g. "it.") is
    kept as is.
    """
    return " ".join(
        word for word in text.split() if word not in _ENGLISH_STOPWORDS
    )

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Drop stop words from every (already lower-cased) review.
df.loc[:,'review'] = df['review'].apply(remove_stopwords)

In [None]:
# Separate the cleaned review text (features) from the sentiment labels
# (target) ahead of vectorisation and supervised training.
X, y = df['review'], df['sentiment']

In [None]:
# Peek at the cleaned review text that will be vectorised.
X.head()

Unnamed: 0,review
0,one reviewers mentioned watching 1 oz e...
1,wonderful little production. filming techniq...
2,thought wonderful way spend time hot s...
3,basically there's family little boy (jake) ...
4,"petter mattei's ""love time money"" visuall..."


In [None]:
# Peek at the string labels before they are encoded as integers.
y.head()

Unnamed: 0,sentiment
0,positive
1,positive
2,positive
3,negative
4,positive


In [None]:

from sklearn.preprocessing import LabelEncoder

# LabelEncoder numbers the classes in sorted order, so here
# 'negative' becomes 0 and 'positive' becomes 1.
encoder = LabelEncoder()

# y changes from a Series of strings to a NumPy integer array.
y = encoder.fit_transform(y)

In [None]:
# Inspect the encoded labels.
y

array([1, 1, 1, ..., 0, 0, 1])

In [None]:
# Hold out 20% of the reviews for testing; the fixed random_state makes the
# split reproducible across runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Number of training reviews.
X_train.shape

(7986,)

In [None]:
# Must match the number of training reviews above.
y_train.shape

(7986,)

In [None]:
# applying tfidf
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

# Fit the vocabulary and IDF weights on the training reviews only, then reuse
# them for the test reviews so no test information leaks into the features.
# Both matrices are kept sparse: densifying the training matrix (roughly
# 8k rows x 48k terms of float64 here) would need about 3 GB of RAM, and
# LogisticRegression accepts sparse input directly.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
# (number of training reviews, vocabulary size)
X_train_tfidf.shape

(7986, 48282)

In [None]:
# Column count must equal the training vocabulary size.
X_test_tfidf.shape

(1997, 48282)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Logistic regression with an L2 penalty to limit overfitting; C=0.5 makes the
# regularisation stronger than scikit-learn's default of C=1.0.
model = LogisticRegression(
    penalty='l2',
    C=0.5,
    solver='liblinear',
    max_iter=200,
    random_state=42,
)
model.fit(X_train_tfidf, y_train)

# Predict labels for the held-out reviews.
y_pred = model.predict(X_test_tfidf)

# Score the predictions against the true test labels.
class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

In [None]:
# Fraction of test reviews classified correctly, rounded to two decimals.
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.88


In [None]:
# Rows are the true classes and columns the predicted classes, in label order (0, then 1).
print('Confusion Matrix:\n', conf_matrix)

Confusion Matrix:
 [[811 141]
 [100 945]]


In [None]:
# Per-class precision, recall and F1 on the test set.
print('Classification Report:\n', class_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.85      0.87       952
           1       0.87      0.90      0.89      1045

    accuracy                           0.88      1997
   macro avg       0.88      0.88      0.88      1997
weighted avg       0.88      0.88      0.88      1997



In [None]:
review = input("Enter a review:  ")

# Clean the input with the same steps used on the training reviews (strip HTML
# tags, lower-case, drop stop words) so the vectorizer sees text in the same
# form it was fitted on.
cleaned_review = remove_stopwords(remove_tags(review).lower())

# Convert text to TF-IDF features (transform expects an iterable of documents)
X_test_example_tfidf = tfidf.transform([cleaned_review])

# Predict sentiment
predicted_code = model.predict(X_test_example_tfidf)[0]  # Extract single prediction

# Map the numeric class back to its label through the fitted encoder instead of
# hard-coding which integer means which sentiment.
prediction = encoder.inverse_transform([predicted_code])[0].capitalize()

# Print result
print(f"Predicted Sentiment: {prediction}\n")



Enter a review:  I read a book last night and it was bad.
Predicted Sentiment: Negative

