In [None]:
#downsampling

import pandas as pd
from sklearn.utils import resample


In [None]:

# Location of the raw reviews CSV (adjust to wherever the file actually lives)
file_path = "/content/Reviews.csv"
# Read the whole file into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Columns that the cells below do not use; only 'Score' and 'Text' are kept.
columns_to_drop = ['Id',  'UserId', 'ProfileName','ProductId',
                   'HelpfulnessNumerator', 'HelpfulnessDenominator',
                   'Time', 'Summary']

# Drop unnecessary columns (returns a new frame; `df` itself is not modified)
dataset = df.drop(columns=columns_to_drop)

# Check the updated dataset
print(dataset.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
dataset.info()


   Score                                               Text
0      5  I have bought several of the Vitality canned d...
1      1  Product arrived labeled as Jumbo Salted Peanut...
2      4  This is a confection that has been around a fe...
3      2  If you are looking for the secret ingredient i...
4      5  Great taffy at a great price.  There was a wid...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Score   568454 non-null  int64 
 1   Text    568454 non-null  object
dtypes: int64(1), object(1)
memory usage: 8.7+ MB
None


In [None]:
# Derive a three-way Sentiment label from Score; the Score column is left in place
def _score_to_sentiment(score):
    """Map a rating to 'positive' (>= 4), 'negative' (<= 2) or 'neutral' (anything else)."""
    if score >= 4:
        return 'positive'
    if score <= 2:
        return 'negative'
    return 'neutral'


dataset['Sentiment'] = dataset['Score'].apply(_score_to_sentiment)

# Preview the first rows to confirm Score is still present next to Sentiment
print(dataset.head())

   Score                                               Text Sentiment
0      5  I have bought several of the Vitality canned d...  positive
1      1  Product arrived labeled as Jumbo Salted Peanut...  negative
2      4  This is a confection that has been around a fe...  positive
3      2  If you are looking for the secret ingredient i...  negative
4      5  Great taffy at a great price.  There was a wid...  positive


In [None]:
# Report how many reviews fall into each sentiment class before balancing
print("Original class distribution:", dataset['Sentiment'].value_counts(), sep="\n")

Original class distribution:
Sentiment
positive    443777
negative     82037
neutral      42640
Name: count, dtype: int64


In [None]:
# Split the dataset into one frame per sentiment label
sentiment_labels = dataset['Sentiment']
positive = dataset.loc[sentiment_labels.eq('positive')]
negative = dataset.loc[sentiment_labels.eq('negative')]
neutral = dataset.loc[sentiment_labels.eq('neutral')]

In [None]:
# Preview the first rows with the label columns ahead of the review text
preview_columns = ['Score', 'Sentiment', 'Text']
print(dataset[preview_columns].head())


   Score Sentiment                                               Text
0      5  positive  I have bought several of the Vitality canned d...
1      1  negative  Product arrived labeled as Jumbo Salted Peanut...
2      4  positive  This is a confection that has been around a fe...
3      2  negative  If you are looking for the secret ingredient i...
4      5  positive  Great taffy at a great price.  There was a wid...


In [None]:
# Draw the same number of rows from every class, without replacement and with a fixed seed
sampling_kwargs = dict(replace=False, n_samples=15000, random_state=42)
positive_downsampled, negative_downsampled, neutral_downsampled = (
    resample(class_frame, **sampling_kwargs)
    for class_frame in (positive, negative, neutral)
)


In [None]:
# Stack the three downsampled frames into a single frame
downsampled_parts = [positive_downsampled, negative_downsampled, neutral_downsampled]
balanced_df = pd.concat(downsampled_parts)

In [None]:
# Shuffle all rows with a fixed seed, then renumber the index from 0
shuffled_rows = balanced_df.sample(frac=1, random_state=42)
balanced_df = shuffled_rows.reset_index(drop=True)

In [None]:
# Class counts after balancing
balanced_counts = balanced_df['Sentiment'].value_counts()
print(balanced_counts)

Sentiment
neutral     15000
positive    15000
negative    15000
Name: count, dtype: int64


In [None]:
# Preview the balanced, shuffled sample rather than the full `dataset`:
# `balanced_df` is the frame that gets written to CSV, and `dataset` was
# already previewed with these same columns before sampling.
print(balanced_df[['Score', 'Sentiment','Text']].head())


   Score Sentiment                                               Text
0      5  positive  I have bought several of the Vitality canned d...
1      1  negative  Product arrived labeled as Jumbo Salted Peanut...
2      4  positive  This is a confection that has been around a fe...
3      2  negative  If you are looking for the secret ingredient i...
4      5  positive  Great taffy at a great price.  There was a wid...


In [None]:
# Write the balanced sample to CSV, leaving out the index column
balanced_df.to_csv(path_or_buf="/content/balanced_dataset00.csv", index=False)

In [None]:
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (118 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.3/118.3 kB[0m 

In [None]:
#preprocessing
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from wordcloud import STOPWORDS
import contractions
# Fetch the 'punkt_tab' tokenizer data (presumably what word_tokenize needs on
# recent NLTK releases — TODO confirm against the installed version).
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Download the remaining NLTK resources used below: 'stopwords' feeds
# stopwords.words("english") and 'wordnet' backs WordNetLemmatizer.
# NOTE(review): 'punkt' looks like the older tokenizer package alongside
# 'punkt_tab' — confirm whether both are still needed.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Read the balanced sample back from disk (this rebinds `df`)
df = pd.read_csv(filepath_or_buffer="/content/balanced_dataset00.csv")

In [None]:
# Set up the lemmatizer and the stopword set used by the cleaning code
lemmatizer = WordNetLemmatizer()
nltk_stopwords = stopwords.words("english")
stop_words = set(nltk_stopwords).union(STOPWORDS)  # NLTK + WordCloud stopwords combined


In [None]:
# Function to handle negations
def handle_negations(text):
    """Attach each negation word to the word that follows it.

    The text is split on whitespace. Whenever a token is a negation word and
    is not the last token, it is joined to the next token with an underscore
    (``"not good"`` -> ``"not_good"``) and that next token is consumed. A
    negation at the very end of the text is kept unchanged.

    Args:
        text (str): Whitespace-separated text.

    Returns:
        str: The tokens joined by single spaces, with negations merged.
    """
    # "cannot" is included because the cleaning pipeline expands "can't" into
    # the single word "cannot" before this function runs; without it that
    # negation was never attached to the following word.
    negations = {"not", "no", "never", "n't", "cannot"}
    tokens = text.split()
    merged = []

    skip_next = False  # True when the current token was already merged into the previous one
    for idx, token in enumerate(tokens):
        if skip_next:
            skip_next = False
            continue
        if token in negations and idx + 1 < len(tokens):
            merged.append(token + "_" + tokens[idx + 1])
            skip_next = True
        else:
            merged.append(token)

    return " ".join(merged)


In [None]:

# Function for text cleaning + negation handling
def clean_text(text):
    """Normalise one review into a space-joined string of lemmatised tokens.

    Steps, in order: lowercase, strip HTML tags, expand contractions, remove
    URLs, @mentions and #hashtags, turn every remaining non-letter into a
    space, merge negations with the following word (handle_negations),
    tokenize, drop stopwords and lemmatize each remaining token as a verb.

    Args:
        text: Raw review text; it is converted with str() first.

    Returns:
        str: The cleaned tokens joined by single spaces.
    """
    text = str(text).lower()  # Lowercasing
    # NOTE(review): assumes raw reviews can contain HTML markup such as
    # "<br />" — confirm against the data. Tags are replaced by a space so the
    # tag name does not survive as a token and neighbouring words stay apart.
    text = re.sub(r'</?[a-z][^>]*>', ' ', text)
    text = contractions.fix(text)  # Expanding contractions (e.g., "can't" -> "cannot")
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+|\#\w+', '', text)  # Remove mentions (@user) and hashtags
    # Replace (rather than delete) special characters & numbers: deleting them
    # fused words separated only by punctuation ("good,but" -> "goodbut").
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = handle_negations(text)  # Handle negations
    tokens = word_tokenize(text)  # Tokenization
    # Stopword removal, then lemmatization with the verb part of speech
    lemmas = [lemmatizer.lemmatize(word, wordnet.VERB) for word in tokens if word not in stop_words]
    return " ".join(lemmas)

In [None]:
# Run clean_text over every review; the result goes into a new column, so 'Score' is untouched
df['cleaned_text'] = df['Text'].astype(str).map(clean_text)

# Show the first rows to confirm 'Score' is still there
print(df.head())


   Score                                               Text Sentiment  \
0      3  As others have noted, this jerky is chopped an...   neutral   
1      5  My boyfriend and I LOVE this tea. Though it do...  positive   
2      3  I wanted to buy a whey protein that didn't hav...   neutral   
3      5  We love pop chips at our house, they are a gre...  positive   
4      3  Both of my cats like the chicken & brown rice ...   neutral   

                                        cleaned_text  
0  others note jerky chop form greasy taste rich ...  
1  boyfriend love tea though not_necessarily feel...  
2  want buy whey protein not_have artificial swee...  
3  love pop chip house great addition weight watc...  
4  cat chicken brown rice newmans dry food flavor...  


In [None]:
# Save cleaned dataset: only the cleaned text and the two label columns are written
output_columns = ['cleaned_text', 'Sentiment', 'Score']
df[output_columns].to_csv("/content/cleaned_advanced_dataset(45k).csv", index=False)