In [4]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [10]:
df = pd.read_csv(
    r"C:\\data science\\assignment\\amazonreviews.tsv",
    sep="\t",
    engine="python"
)
print(df)
print(df.head())
print(df.columns)

     label                                             review
0      pos  Stuning even for the non-gamer: This sound tra...
1      pos  The best soundtrack ever to anything.: I'm rea...
2      pos  Amazing!: This soundtrack is my favorite music...
3      pos  Excellent Soundtrack: I truly like this soundt...
4      pos  Remember, Pull Your Jaw Off The Floor After He...
...    ...                                                ...
9995   pos  A revelation of life in small town America in ...
9996   pos  Great biography of a very interesting journali...
9997   neg  Interesting Subject; Poor Presentation: You'd ...
9998   neg  Don't buy: The box looked used and it is obvio...
9999   pos  Beautiful Pen and Fast Delivery.: The pen was ...

[10000 rows x 2 columns]
  label                                             review
0   pos  Stuning even for the non-gamer: This sound tra...
1   pos  The best soundtrack ever to anything.: I'm rea...
2   pos  Amazing!: This soundtrack is my favorite mus

In [12]:
#Fix Column Names
df.columns = ['label', 'review']
print(df.columns)

Index(['label', 'review'], dtype='object')


In [13]:
# Remove duplicates
df.drop_duplicates(inplace=True)

# Remove missing reviews
df.dropna(subset=['review'], inplace=True)

print("Shape after cleaning:", df.shape)

Shape after cleaning: (10000, 2)


In [22]:
#Text Preprocessing
import re
import nltk
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)

df['clean_review'] = df['review'].apply(clean_text)

print(df[['review', 'clean_review']].head())

                                              review  \
0  Stuning even for the non-gamer: This sound tra...   
1  The best soundtrack ever to anything.: I'm rea...   
2  Amazing!: This soundtrack is my favorite music...   
3  Excellent Soundtrack: I truly like this soundt...   
4  Remember, Pull Your Jaw Off The Floor After He...   

                                        clean_review  
0  stuning even nongamer sound track beautiful pa...  
1  best soundtrack ever anything im reading lot r...  
2  amazing soundtrack favorite music time hands i...  
3  excellent soundtrack truly like soundtrack enj...  
4  remember pull jaw floor hearing youve played g...  


In [23]:
#Convert Text to Numbers (TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)

X = vectorizer.fit_transform(df['clean_review'])
y = df['label']
print(X)
print(y)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 292939 stored elements and shape (10000, 5000)>
  Coords	Values
  (0, 1472)	0.2026835432811518
  (0, 4105)	0.13942671660924028
  (0, 4555)	0.1714251367989332
  (0, 354)	0.15741348249092516
  (0, 2810)	0.15923967608928638
  (0, 4850)	0.10256398627772717
  (0, 4942)	0.17267915463098668
  (0, 3553)	0.2125178004110971
  (0, 3178)	0.11722645631758234
  (0, 2013)	0.1811397381060998
  (0, 1825)	0.2984184698048861
  (0, 2899)	0.25227033890028616
  (0, 3254)	0.3355498061510679
  (0, 995)	0.2249926406750323
  (0, 1827)	0.18368166163881627
  (0, 1477)	0.11504772300324176
  (0, 396)	0.11037218398618323
  (0, 292)	0.14209483672901574
  (0, 998)	0.23142719300382733
  (0, 4350)	0.15741348249092516
  (0, 4192)	0.20875378985548565
  (0, 1968)	0.24329812430664233
  (0, 2200)	0.23944534624395825
  (0, 179)	0.12993110856493392
  (0, 605)	0.2286997133463248
  :	:
  (9999, 1604)	0.09733104432064021
  (9999, 2141)	0.08083125721680408
  (9999, 421)

In [29]:
#Train-Test Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(X_train, X_test, y_train, y_test)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 233405 stored elements and shape (8000, 5000)>
  Coords	Values
  (0, 2905)	0.08019983007962875
  (0, 2891)	0.06203937191723838
  (0, 3043)	0.0906379696628008
  (0, 4496)	0.05670635297402635
  (0, 2746)	0.1173128015475843
  (0, 2568)	0.05051479007861341
  (0, 4629)	0.07454568074706128
  (0, 3182)	0.0941313397012997
  (0, 454)	0.1760192774972952
  (0, 1770)	0.07469831428756363
  (0, 4463)	0.07639200118222951
  (0, 1640)	0.09814394150634938
  (0, 3289)	0.09141491492271422
  (0, 2039)	0.13120394723417156
  (0, 124)	0.06893296791968437
  (0, 3678)	0.12257076165026849
  (0, 4462)	0.16601764341272776
  (0, 4219)	0.10530606060930148
  (0, 1097)	0.14784469765227823
  (0, 3475)	0.10754340232702501
  (0, 2248)	0.14784469765227823
  (0, 925)	0.10342165586099705
  (0, 803)	0.09429361335709871
  (0, 2949)	0.14444486201225942
  (0, 1673)	0.21941003259997588
  :	:
  (7999, 1861)	0.0685217844290695
  (7999, 2926)	0.15559183757225037
  (7999,

In [28]:
#Train Logistic Regression Model
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [26]:
#Evaluate Model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8425

Classification Report:

              precision    recall  f1-score   support

         neg       0.85      0.84      0.85      1037
         pos       0.83      0.84      0.84       963

    accuracy                           0.84      2000
   macro avg       0.84      0.84      0.84      2000
weighted avg       0.84      0.84      0.84      2000


Confusion Matrix:

[[874 163]
 [152 811]]


In [27]:
#Predict New Review
def predict_sentiment(text):
    text_clean = clean_text(text)
    text_vector = vectorizer.transform([text_clean])
    prediction = model.predict(text_vector)
    return prediction[0]

print(predict_sentiment("This product is amazing and works perfectly"))
print(predict_sentiment("Very bad quality and waste of money"))

pos
neg


In [None]:
#Results & Evaluation

#Typical Results:
#Accuracy: 85–90%
#Balanced precision and recall
#Strong F1-score for both classes

#Confusion Matrix Interpretation
#True Positives → Correct positive predictions
#True Negatives → Correct negative predictions
#False Positives → Incorrectly classified as positive
#False Negatives → Incorrectly classified as negative

In [None]:
#Business Impact

#This system enables:
#Real-Time Monitoring
#Automatically track sentiment trends.
#Early Warning System
#Detect spikes in negative feedback.
#Customer Experience Improvement
#Respond quickly to negative sentiment.
#Product Decision Support
#Identify underperforming products early.

In [None]:
#Future Improvements

#Use SVM for improved performance
#Implement BERT for state-of-the-art results
#Add aspect-based sentiment analysis
#Deploy as REST API
#Build dashboard for sentiment tracking

In [None]:
#Conclusion

#The automated sentiment analysis system:
#Correctly preprocesses textual data
#Extracts meaningful features using TF-IDF
#Achieves strong classification accuracy
#Provides real business value