In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download stopwords
nltk.download('punkt')
nltk.download('stopwords')

# Example Dataset
data = {'Review': [
    'I love this product, it is fantastic!',
    'This is the worst service I have ever had.',
    'Average experience, nothing special.',
    'Highly recommend to everyone!',
    'Terrible, would not buy again.'
]}

df = pd.DataFrame(data)

# Tokenization and Stopwords Removal
stop_words = set(stopwords.words('english'))

def clean_text(text):
    tokens = word_tokenize(text.lower())  # Tokenization and lowercasing
    tokens = [word for word in tokens if word.isalpha()]  # Remove punctuation/numbers
    filtered = [word for word in tokens if word not in stop_words]
    return filtered

df['Cleaned_Review'] = df['Review'].apply(clean_text)

print(df)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...


                                       Review  \
0       I love this product, it is fantastic!   
1  This is the worst service I have ever had.   
2        Average experience, nothing special.   
3               Highly recommend to everyone!   
4              Terrible, would not buy again.   

                            Cleaned_Review  
0               [love, product, fantastic]  
1                   [worst, service, ever]  
2  [average, experience, nothing, special]  
3            [highly, recommend, everyone]  
4                   [terrible, would, buy]  


[nltk_data]   Unzipping corpora\stopwords.zip.


In [5]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Step 1: Create Dataset
data = {
    'Review': [
        'I absolutely loved this movie!',
        'The product quality was terrible.',
        'Not good, not bad, just okay.',
        'Fantastic service, very happy!',
        'Worst experience of my life.',
        'I am satisfied with the purchase.',
        'This is disappointing and a waste of money.',
        'Highly recommend it to everyone.',
        'Would not recommend this to anyone.',
        'Best purchase I made this year!',
        'Awful taste, never buying again.',
        'Very pleasant experience.',
        'It was a boring movie.',
        'Excellent quality and speedy delivery.',
        'Not worth the price at all.',
        'Amazing food and quick service.',
        'Bad customer service and rude staff.',
        'Superb performance and great acting.',
        'Terrible plot, I fell asleep.',
        'Delighted with the results, five stars!'
    ],
    'Sentiment': [1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
                  0, 1, 0, 1, 0, 1, 0, 1, 0, 1]  # 1=Positive, 0=Negative
}

df = pd.DataFrame(data)

# Step 2: Vectorization
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['Review'])
y = df['Sentiment']

# Step 3: Model Training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 4: Model Accuracy
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%\n")

# Step 5: Manual Input Prediction
while True:
    user_input = input("Enter a review (or type 'exit' to quit): ")
    if user_input.lower() == 'exit':
        print("Exiting...")
        break
    user_vector = vectorizer.transform([user_input])
    prediction = model.predict(user_vector)[0]
    sentiment = 'Positive 😊' if prediction == 1 else 'Negative 😞'
    print(f"Predicted Sentiment: {sentiment}\n")


Model Accuracy: 25.00%



Enter a review (or type 'exit' to quit):  not satisfied


Predicted Sentiment: Positive 😊



Enter a review (or type 'exit' to quit):  exit


Exiting...


In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Step 1: Dataset
data = {
    'Review': [
        'I absolutely loved this movie!',
        'The product quality was terrible.',
        'Not good, not bad, just okay.',
        'Fantastic service, very happy!',
        'Worst experience of my life.',
        'I am satisfied with the purchase.',
        'This is disappointing and a waste of money.',
        'Highly recommend it to everyone.',
        'Would not recommend this to anyone.',
        'Best purchase I made this year!',
        'Awful taste, never buying again.',
        'Very pleasant experience.',
        'It was a boring movie.',
        'Excellent quality and speedy delivery.',
        'Not worth the price at all.',
        'Amazing food and quick service.',
        'Bad customer service and rude staff.',
        'Superb performance and great acting.',
        'Terrible plot, I fell asleep.',
        'Delighted with the results, five stars!',
        'Horrible product, totally dissatisfied.',
        'Great value for money, very happy!',
        'Extremely poor experience, would not return.',
        'Loved the ambiance and the food!',
        'Disgusting behavior from the staff.',
        'Highly satisfying and pleasant stay.',
        'Complete waste of money and time.',
        'Outstanding quality and service.',
        'Hated every minute of it.',
        'Best in class, truly amazing!',
    ],
    'Sentiment': [
        1,0,0,1,0,1,0,1,0,1,
        0,1,0,1,0,1,0,1,0,1,
        0,1,0,1,0,1,0,1,0,1
    ]
}

df = pd.DataFrame(data)

# Step 2: Vectorization (TF-IDF + Bigrams)
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
X = vectorizer.fit_transform(df['Review'])
y = df['Sentiment']

# Step 3: Train-Test Split with Stratify
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Step 4: Model Training (higher iterations)
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

# Step 5: Accuracy
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized Model Accuracy: {accuracy * 100:.2f}%\n")

# Step 6: Manual Sentiment Prediction
while True:
    user_input = input("Enter a review (or type 'exit' to quit): ")
    if user_input.lower() == 'exit':
        print("Exiting...")
        break
    user_vector = vectorizer.transform([user_input])
    prediction = model.predict(user_vector)[0]
    sentiment = 'Positive 😊' if prediction == 1 else 'Negative 😞'
    print(f"Predicted Sentiment: {sentiment}\n")


Optimized Model Accuracy: 83.33%



Enter a review (or type 'exit' to quit):  not satisfied


Predicted Sentiment: Negative 😞



Enter a review (or type 'exit' to quit):  happy with the purchase


Predicted Sentiment: Positive 😊



Enter a review (or type 'exit' to quit):  exit


Exiting...
