<a href="https://colab.research.google.com/github/promigo-ventures/3MTT_Assiggnment/blob/main/AmazonReviewData_SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas scikit-learn nltk



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import re

# Download nltk resources
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
# Load the training data
def load_data(filepath, labeled=True):
    reviews = []
    sentiments = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            if labeled:
                # Split the label from the text
                label, text = line.split(' ', 1)
                sentiment = 1 if '__label__1' in label else 0  # Assuming __label__1 is positive, __label__0 is negative
                sentiments.append(sentiment)
                reviews.append(text.strip())
            else:
                # For unlabeled data, just read the text
                reviews.append(line.strip())
    return (reviews, sentiments) if labeled else reviews

# Load training and test datasets
train_reviews, train_sentiments = load_data('train.ft.txt')
test_reviews = load_data('test.ft.txt', labeled=False)

# Create DataFrames for ease of handling
train_data = pd.DataFrame({'review': train_reviews, 'sentiment': train_sentiments})
test_data = pd.DataFrame({'review': test_reviews})


In [4]:
# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Remove special characters and lowercase the text
    text = re.sub(r'[^a-zA-Z]', ' ', text).lower()
    # Tokenize, remove stop words, and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing
train_data['cleaned_review'] = train_data['review'].apply(preprocess_text)
test_data['cleaned_review'] = test_data['review'].apply(preprocess_text)


In [5]:
# Initialize TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Fit on the training data and transform both training and test sets
X_train_tfidf = tfidf.fit_transform(train_data['cleaned_review'])
X_test_tfidf = tfidf.transform(test_data['cleaned_review'])
y_train = train_data['sentiment']


In [6]:
# Initialize the classifier
model = MultinomialNB()

# Train the model on the training data
model.fit(X_train_tfidf, y_train)


In [7]:
# Split training data for evaluation
X_train, X_val, y_train, y_val = train_test_split(X_train_tfidf, y_train, test_size=0.2, random_state=42)

# Train on the split data and make predictions on validation data
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

# Print accuracy and classification report
print("Accuracy on Validation Set:", accuracy_score(y_val, y_pred))
print("Classification Report:\n", classification_report(y_val, y_pred))


Accuracy on Validation Set: 0.8509686249736786
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.86      0.85     19155
           1       0.86      0.84      0.85     18837

    accuracy                           0.85     37992
   macro avg       0.85      0.85      0.85     37992
weighted avg       0.85      0.85      0.85     37992



In [8]:
# Predict sentiment for test data
test_data['predicted_sentiment'] = model.predict(X_test_tfidf)

# Display a few predictions
print(test_data[['review', 'predicted_sentiment']].head())


                                              review  predicted_sentiment
0  __label__2 Great CD: My lovely Pat has one of ...                    0
1  __label__2 One of the best game music soundtra...                    0
2  __label__1 Batteries died within a year ...: I...                    1
3  __label__2 works fine, but Maha Energy is bett...                    1
4  __label__2 Great for the non-audiophile: Revie...                    0


In [9]:
# Save predictions to a CSV file
test_data[['review', 'predicted_sentiment']].to_csv('amazon_test_predictions.csv', index=False)
