In [1]:
# Importing necessary libraries
import os
import re

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Locating the train and test data files.
# The data directory can be overridden with the SENTIMENT_DATA_DIR environment
# variable; when it is unset (or empty) the original hard-coded location is used,
# so existing runs keep resolving to exactly the same paths.
DATA_DIR = (
    os.environ.get("SENTIMENT_DATA_DIR")
    or "C:/Users/sebas/repos/Sentiment_Analysis_of_Amazon_Customer_Reviews/data"
).rstrip("/\\")  # tolerate a trailing separator in the override
train_file = f"{DATA_DIR}/train.ft.txt"
test_file = f"{DATA_DIR}/test.ft.txt"

In [3]:
# Reading both datasets fully into memory, one raw line (newline included) per list element
with open(train_file, mode='r', encoding='utf-8') as train_handle:
    train_data = list(train_handle)

with open(test_file, mode='r', encoding='utf-8') as test_handle:
    test_data = list(test_handle)

# Showing the first record of each dataset as a quick sanity check
for caption, lines in (
    ("Sample of train data:", train_data),
    ("Sample of test data:", test_data),
):
    print(caption, lines[0])

Sample of train data: __label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^

Sample of test data: __label__2 Great CD: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I'm in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life's hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thin

In [4]:
def _split_labels_and_texts(lines):
    """Split raw ``"<label> <text>"`` lines into parallel label and text lists.

    Each line is split once at its first space. Whitespace-only lines (for
    example a trailing empty line at the end of the file) are skipped, and a
    line without any space yields an empty text instead of raising IndexError.

    Parameters
    ----------
    lines : iterable of str
        Raw lines as read from the data file.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(labels, texts)`` with surrounding whitespace stripped from both.
    """
    labels, texts = [], []
    for line in lines:
        if not line.strip():
            continue  # nothing to parse on a blank line
        label, _, text = line.partition(' ')
        labels.append(label.strip())
        texts.append(text.strip())
    return labels, texts


# Splitting labels and texts for train data
train_labels, train_texts = _split_labels_and_texts(train_data)

# Splitting labels and texts for test data
test_labels, test_texts = _split_labels_and_texts(test_data)

# Converting to DataFrame
train_df = pd.DataFrame({'label': train_labels, 'text': train_texts})
test_df = pd.DataFrame({'label': test_labels, 'text': test_texts})

# Displaying samples
print(train_df.head())
print(test_df.head())

        label                                               text
0  __label__2  Stuning even for the non-gamer: This sound tra...
1  __label__2  The best soundtrack ever to anything.: I'm rea...
2  __label__2  Amazing!: This soundtrack is my favorite music...
3  __label__2  Excellent Soundtrack: I truly like this soundt...
4  __label__2  Remember, Pull Your Jaw Off The Floor After He...
        label                                               text
0  __label__2  Great CD: My lovely Pat has one of the GREAT v...
1  __label__2  One of the best game music soundtracks - for a...
2  __label__1  Batteries died within a year ...: I bought thi...
3  __label__2  works fine, but Maha Energy is better: Check o...
4  __label__2  Great for the non-audiophile: Reviewed quite a...


In [5]:
# Mapping the string labels to integers (0 = negative, 1 = positive)
label_map = {"__label__1": 0, "__label__2": 1}


def _encode_labels(labels, mapping):
    """Return ``labels`` encoded through ``mapping``, validating the result.

    Parameters
    ----------
    labels : pandas.Series
        Raw label strings, or labels that were already encoded by a previous
        run of this cell.
    mapping : dict
        Raw label -> integer code.

    Returns
    -------
    pandas.Series
        Integer-coded labels.

    Raises
    ------
    ValueError
        If any label has no entry in ``mapping``; a plain ``Series.map`` would
        silently turn such labels into NaN.
    """
    # Re-running the cell must not map already-encoded integers to NaN.
    if labels.isin(list(mapping.values())).all():
        return labels
    encoded = labels.map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(set(labels[unmapped].astype(str)))
        raise ValueError(f"Unmapped label values: {unknown[:10]}")
    return encoded.astype("int64")


train_df['label'] = _encode_labels(train_df['label'], label_map)
test_df['label'] = _encode_labels(test_df['label'], label_map)

# Displaying updated DataFrame
print(train_df.head())
print(test_df.head())


   label                                               text
0      1  Stuning even for the non-gamer: This sound tra...
1      1  The best soundtrack ever to anything.: I'm rea...
2      1  Amazing!: This soundtrack is my favorite music...
3      1  Excellent Soundtrack: I truly like this soundt...
4      1  Remember, Pull Your Jaw Off The Floor After He...
   label                                               text
0      1  Great CD: My lovely Pat has one of the GREAT v...
1      1  One of the best game music soundtracks - for a...
2      0  Batteries died within a year ...: I bought thi...
3      1  works fine, but Maha Energy is better: Check o...
4      1  Great for the non-audiophile: Reviewed quite a...


In [6]:
# Reporting how many reviews fall into each class, for both datasets
for heading, frame in (
    ("Train label distribution:", train_df),
    ("\nTest label distribution:", test_df),
):
    print(heading)
    print(frame['label'].value_counts())


Train label distribution:
label
1    1800000
0    1800000
Name: count, dtype: int64

Test label distribution:
label
1    200000
0    200000
Name: count, dtype: int64


In [7]:
# Recording the character count of every review in a 'text_length' column
for frame in (train_df, test_df):
    frame['text_length'] = frame['text'].apply(len)

# Summarising the length distribution of each dataset
for heading, frame in (
    ("Train text length statistics:", train_df),
    ("\nTest text length statistics:", test_df),
):
    print(heading)
    print(frame['text_length'].describe())


Train text length statistics:
count    3.600000e+06
mean     4.316463e+02
std      2.375526e+02
min      7.200000e+01
25%      2.310000e+02
50%      3.830000e+02
75%      5.950000e+02
max      1.015000e+03
Name: text_length, dtype: float64

Test text length statistics:
count    400000.000000
mean        431.429630
std         237.435383
min          99.000000
25%         231.000000
50%         383.000000
75%         595.000000
max        1015.000000
Name: text_length, dtype: float64


In [8]:
# Patterns used by clean_text, compiled once up front
_NON_ALNUM_OR_SPACE = re.compile(r"[^a-zA-Z0-9\s]")
_WHITESPACE_RUN = re.compile(r"\s+")


def clean_text(text):
    """Normalise a review string for vectorisation.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed, each run of whitespace is collapsed to a single space, and
    leading/trailing whitespace is stripped.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text.
    """
    without_specials = _NON_ALNUM_OR_SPACE.sub("", text)   # removing special characters
    single_spaced = _WHITESPACE_RUN.sub(" ", without_specials)  # removing extra spaces
    return single_spaced.strip()

# Cleaning the review text of both DataFrames (the 'text' column is overwritten)
for frame in (train_df, test_df):
    frame['text'] = frame['text'].map(clean_text)

# Showing the first cleaned training rows
print("Cleaned training data sample:")
print(train_df.head())

Cleaned training data sample:
   label                                               text  text_length
0      1  Stuning even for the nongamer This sound track...          426
1      1  The best soundtrack ever to anything Im readin...          509
2      1  Amazing This soundtrack is my favorite music o...          760
3      1  Excellent Soundtrack I truly like this soundtr...          743
4      1  Remember Pull Your Jaw Off The Floor After Hea...          481


In [9]:
# Targets: the integer-encoded sentiment labels
y_train = train_df['label']
y_test = test_df['label']

# TF-IDF vectorizer limited to 5000 features, with English stop words removed
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

# The vectorizer is fitted on the training text only; the test text is
# transformed with that fitted vectorizer (both results are sparse matrices)
X_train = tfidf.fit_transform(train_df['text'])
X_test = tfidf.transform(test_df['text'])

# Reporting the dimensions of both feature matrices
for caption, matrix in (
    ("TF-IDF train data shape:", X_train),
    ("TF-IDF test data shape:", X_test),
):
    print(caption, matrix.shape)


TF-IDF train data shape: (3600000, 5000)
TF-IDF test data shape: (400000, 5000)


In [10]:
# Logistic regression: fit on the TF-IDF features, then score on train and test

# Building and training the classifier in one step (fit returns the estimator)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predicting on both splits
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Accuracy on each split
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

# Per-class precision / recall / F1 on the test split
test_report = classification_report(y_test, y_pred_test)
print("\nClassification Report (Test Data):")
print(test_report)


Training Accuracy: 0.8871375
Test Accuracy: 0.8865125

Classification Report (Test Data):
              precision    recall  f1-score   support

           0       0.89      0.88      0.89    200000
           1       0.88      0.89      0.89    200000

    accuracy                           0.89    400000
   macro avg       0.89      0.89      0.89    400000
weighted avg       0.89      0.89      0.89    400000



In [11]:
# Example comments for sentiment prediction
example_comments = [
    "This product is amazing! I loved every moment using it.",
    "Terrible experience. The item broke after one use.",
    "Decent quality and quite worth the price.",
    "Absolutely fantastic service, highly recommend!",
    "Too bad quality for this price.",
]

# Applying the same cleaning that was applied to the training text, so the
# vectorizer receives input in the form it was fitted on (clean_text drops
# punctuation, e.g. "don't" becomes "dont")
cleaned_comments = [clean_text(comment) for comment in example_comments]

# Transforming the cleaned comments using the fitted TF-IDF vectorizer
example_features = tfidf.transform(cleaned_comments)

# Making predictions
example_predictions = model.predict(example_features)

# Mapping predictions to sentiment; the original (uncleaned) comment is kept for display
sentiment_map = {1: "Positive", 0: "Negative"}
example_results = [(comment, sentiment_map[pred]) for comment, pred in zip(example_comments, example_predictions)]

# Displaying results
for comment, sentiment in example_results:
    print(f"Comment: {comment}")
    print(f"Predicted Sentiment: {sentiment}")
    print("---")


Comment: This product is amazing! I loved every moment using it.
Predicted Sentiment: Positive
---
Comment: Terrible experience. The item broke after one use.
Predicted Sentiment: Negative
---
Comment: Decent quality and quite worth the price.
Predicted Sentiment: Positive
---
Comment: Absolutely fantastic service, highly recommend!
Predicted Sentiment: Positive
---
Comment: Too bad quality for this price.
Predicted Sentiment: Negative
---
