# Amazon Sentiment Reviews

## Convert Dataset to CSV File and Reduce to 1000 Reviews

In [1]:
import pandas as pd

# Maximum number of reviews to keep in the reduced sample.
MAX_REVIEWS = 1000

# Each input line is expected to look like "__label__<id> <review text>".
rows = []
with open("train.ft.txt", encoding="utf-8") as f:
    for line in f:
        if len(rows) >= MAX_REVIEWS:
            break
        # partition() never raises, unlike unpacking split(" ", 1), so a blank
        # or label-only line no longer aborts the whole conversion.
        label, sep, text = line.partition(" ")
        if not sep:
            continue  # malformed line: no "<label> <text>" separator
        rows.append({
            "label": label.replace("__label__", ""),
            "review_text": text.strip()
        })

# Build the frame once from the collected records and persist it without
# the positional index.
df = pd.DataFrame(rows)
df.to_csv("raw_amazon_reviews_1000.csv", index=False)

df.head()

Unnamed: 0,label,review_text
0,2,Stuning even for the non-gamer: This sound tra...
1,2,The best soundtrack ever to anything.: I'm rea...
2,2,Amazing!: This soundtrack is my favorite music...
3,2,Excellent Soundtrack: I truly like this soundt...
4,2,"Remember, Pull Your Jaw Off The Floor After He..."


## Load Dataset

In [2]:
import pandas as pd

# Reload the 1000-review sample written by the conversion cell.
# NOTE(review): read_csv re-infers column types, so `label` presumably comes
# back numeric rather than as the string that was written, and an empty
# review_text field presumably comes back as NaN — confirm if either matters
# downstream.
df = pd.read_csv("raw_amazon_reviews_1000.csv")
df.head()

Unnamed: 0,label,review_text
0,2,Stuning even for the non-gamer: This sound tra...
1,2,The best soundtrack ever to anything.: I'm rea...
2,2,Amazing!: This soundtrack is my favorite music...
3,2,Excellent Soundtrack: I truly like this soundt...
4,2,"Remember, Pull Your Jaw Off The Floor After He..."


## Import NLP Tools

In [3]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data used below (packages already present are reported as
# up to date rather than downloaded again).
# Tokenizer models for word_tokenize — presumably 'punkt_tab' is what newer
# NLTK releases look for and 'punkt' older ones; confirm against the
# installed version.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stopword lists read by stopwords.words(...)
nltk.download('stopwords')
# WordNet data used by WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Robert\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Robert\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Robert\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Robert\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Define Preprocessing Function

In [4]:
# English stopwords as a set, used for the membership test in preprocess_text.
stop_words = set(stopwords.words('english'))
# Single lemmatizer instance, created once and reused for every token.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Turn one raw review string into a list of cleaned, lemmatized tokens.

    Steps: lowercase, tokenize with ``word_tokenize``, keep only purely
    alphabetic tokens that are not in ``stop_words``, then lemmatize each
    remaining token with ``lemmatizer``.

    Args:
        text: The raw review text. Non-string values (e.g. ``None`` or a
            float ``NaN`` standing in for a missing review) are treated as
            an empty review.

    Returns:
        list[str]: The cleaned tokens in their original order; an empty
        list when there is nothing to process.
    """
    # A missing review is not a str and has no .lower(); return no tokens
    # instead of raising AttributeError in the middle of a column-wide apply.
    if not isinstance(text, str):
        return []

    # 1. Lowercase, 2. tokenize
    tokens = word_tokenize(text.lower())

    # 3. Drop non-alphabetic tokens (punctuation, numbers, mixed tokens) and
    #    stopwords, 4. lemmatize the survivors
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]

## Apply Preprocessing

In [5]:
# Run every review through preprocess_text and keep the resulting token
# lists in a new column next to the original text.
df["clean_tokens"] = df["review_text"].map(preprocess_text)
df.head()

Unnamed: 0,label,review_text,clean_tokens
0,2,Stuning even for the non-gamer: This sound tra...,"[stuning, even, sound, track, beautiful, paint..."
1,2,The best soundtrack ever to anything.: I'm rea...,"[best, soundtrack, ever, anything, reading, lo..."
2,2,Amazing!: This soundtrack is my favorite music...,"[amazing, soundtrack, favorite, music, time, h..."
3,2,Excellent Soundtrack: I truly like this soundt...,"[excellent, soundtrack, truly, like, soundtrac..."
4,2,"Remember, Pull Your Jaw Off The Floor After He...","[remember, pull, jaw, floor, hearing, played, ..."


## Join Tokens Back to Text

In [6]:
# Collapse each token list into one space-separated string.
df["clean_text"] = df["clean_tokens"].map(" ".join)

## Save Cleaned Dataset

In [7]:
# Write the frame, including the clean_tokens and clean_text columns, to CSV
# without the index.
# NOTE(review): clean_tokens holds Python lists, which CSV presumably stores
# as their string form — reading this file back will not yield lists without
# extra parsing; confirm if the tokens are needed later.
df.to_csv("amazon_reviews_1000_cleaned.csv", index=False)