# Sentiment Analysis - Extract Features
- Create sentiment features
- Build from-scratch methods using traditional NLP (NLTK)
- Also use pre-trained models, and compare every method against a random-guess sentiment baseline
- Output training and testing sets

In [None]:
!pip install -q transformers twython langdetect pycountry nltk

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the review dataset that already has entropies and languages identified
df = pd.read_pickle('/kaggle/input/reviews-analyzer-dataset/df_w_langs_entropies.pkl')
print(df.columns)

# Use the singular 'review', and move the detected code to 'lang_code'
column_renames = {'reviews': 'review', 'language': 'lang_code'}
df = df.rename(columns=column_renames)
print(f"Total entries = {len(df)}")

In [None]:
df.rating.value_counts()

In [None]:
import pycountry
import warnings
from tqdm import tqdm

def get_language_name(code):
    """Map an ISO 639-1 language code to a lowercase language name.

    The sentinel 'unknown' maps to 'Unknown'. Region-qualified codes such as
    'zh-cn' are looked up by their first part. When pycountry has no match
    the (region-stripped) code is returned; values without string methods
    (e.g. None or NaN) are returned unchanged.
    """
    if code == 'unknown':
        return 'Unknown'

    primary = code
    try:
        # Keep only the language part of codes like 'zh-cn'
        primary = code.split('-', 1)[0]
        entry = pycountry.languages.get(alpha_2=primary)
        if not entry:
            return primary
        return entry.name.lower()
    except AttributeError:
        return primary

# Do some basic processing for readability

# Enable progress_apply on pandas objects, then translate each language code
# into its readable name
tqdm.pandas(desc='Detecting Languages')
language_names = df['lang_code'].progress_apply(get_language_name)
df['language'] = language_names

# Distribution of ratings and of languages
print(df['rating'].value_counts())
print(df['language'].value_counts())

#### Remove Gibberish with Gibberish Detector Model
- Load and apply gibberish detector

In [None]:
def label_sentiment(entry):
    """Turn a numeric rating into a sentiment label.

    Ratings above 3 are 'positive', exactly 3 is 'neutral' and below 3 is
    'negative'. A value for which none of the comparisons holds (such as
    NaN) yields None.
    """
    if entry > 3:
        return 'positive'
    if entry == 3:
        return 'neutral'
    return 'negative' if entry < 3 else None

# Function to vectorize with progress bar
def vectorize_with_progress(reviews, desc):
    """Fit the module-level ``vectorizer`` on ``reviews`` and total each feature.

    Args:
        reviews: Iterable of review strings handed to
            ``vectorizer.fit_transform``.
        desc: Label shown on the tqdm progress bar.

    Returns:
        Tuple ``(freq, names)`` where ``freq`` is a 1-D array holding each
        feature's count summed over all reviews and ``names`` is the result
        of ``vectorizer.get_feature_names_out()``.

    Note:
        A global ``vectorizer`` must exist before this is called. Every call
        refits it, so the vocabulary reflects only the latest ``reviews``.
    """
    # Single-step bar: fit_transform is one call with no incremental progress
    with tqdm(total=1, desc=desc) as pbar:
        matrix = vectorizer.fit_transform(reviews)
        # Sum columns on the sparse matrix itself rather than densifying the
        # whole document-term matrix first; asarray/ravel flattens the 1 x n
        # result to the same 1-D array the dense sum produced.
        freq = np.asarray(matrix.sum(axis=0)).ravel()
        pbar.update(1)
    return freq, vectorizer.get_feature_names_out()

In [None]:
# Grab NLTK stuff
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# What languages are available? 
#print(stopwords.fileids())

# Get basic NLTK stuff
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

# Only run once to get lexicon
!unzip /usr/share/nltk_data/sentiment/vader_lexicon.zip

# What were the top languages we had in our course reviews?
nltk_set = set(stopwords.fileids())
course_set = set(df.language)
overlapping_languages = list(nltk_set.intersection(course_set))

# Get stop words for all languages in dict form
stop_words_dict = {lang: set(stopwords.words(lang)) for lang in overlapping_languages}

# Combine stop words from all languages
all_stop_words = set()
for lang in overlapping_languages:
    try:
        lang_stop_words = set(stopwords.words(lang))
        all_stop_words.update(lang_stop_words)
    except ValueError as e:
        print(f"No stop words for {lang}: {e}")

print(f"Total stop words: {len(all_stop_words)} (sample: {list(all_stop_words)[:5]})")

# Load VADER words instead of listing by hand
vader_lexicon_path = '/kaggle/working/vader_lexicon/vader_lexicon.txt'
vader_scores = pd.read_csv(vader_lexicon_path, sep='\t', header=None, 
                           names=['word', 'score', 'std_dev', 'count'])
vader_dict = dict(zip(vader_scores['word'], vader_scores['score']))

In [None]:
# Explore existing bi-grams and tri-grams
from sklearn.feature_extraction.text import CountVectorizer

# Label every review, then select the positive and negative review texts
df['sentiment_label'] = df['rating'].apply(label_sentiment)
is_positive = df['sentiment_label'] == 'positive'
is_negative = df['sentiment_label'] == 'negative'

# The vectorizer needs strings: blank out NaN and coerce everything else
pos_reviews = df.loc[is_positive, 'review'].fillna('').astype(str)
neg_reviews = df.loc[is_negative, 'review'].fillna('').astype(str)

# Keep the 20 most frequent bi-/tri-grams after removing stop words
vectorizer = CountVectorizer(ngram_range=(2, 3), stop_words=list(all_stop_words), max_features=20)

print("Extracting n-grams...")
pos_ngrams_freq, pos_ngrams_names = vectorize_with_progress(pos_reviews, "Processing Positive Reviews")
pos_ngrams_dict = dict(zip(pos_ngrams_names, pos_ngrams_freq))

# Only refit on the negative reviews when there are any
if not neg_reviews.empty:
    neg_ngrams_freq, neg_ngrams_names = vectorize_with_progress(neg_reviews, "Processing Negative Reviews")
    neg_ngrams_dict = dict(zip(neg_ngrams_names, neg_ngrams_freq))
else:
    neg_ngrams_dict = {}
    print("No negative reviews to process.")

print("Top Positive N-grams:", pos_ngrams_dict)
print("Top Negative N-grams:", neg_ngrams_dict)

In [None]:
# Assign polarity scores
def assign_polarity(ngrams_dict, base_score, adjustment=0.1):
    """Give every n-gram a polarity derived from its frequency.

    Each n-gram starts at ``base_score``; when its frequency is above 1,
    ``adjustment`` is added once for every occurrence beyond the first.

    Args:
        ngrams_dict: Mapping of n-gram -> frequency.
        base_score: Polarity assigned to an n-gram seen at most once.
        adjustment: Amount added per additional occurrence.

    Returns:
        New dict mapping each n-gram to its polarity score.
    """
    return {
        ngram: base_score + ((freq - 1) * adjustment if freq > 1 else 0)
        for ngram, freq in ngrams_dict.items()
    }

# Positive n-grams start at +2.0 and negative ones at -2.0; every occurrence
# beyond the first moves the score a further 0.1 away from zero
positive_ngrams = assign_polarity(pos_ngrams_dict, 2.0, 0.1)
negative_ngrams = assign_polarity(neg_ngrams_dict, -2.0, -0.1)

print("Positive N-grams with Polarity:", positive_ngrams)
print("Negative N-grams with Polarity:", negative_ngrams)

In [None]:
# Tokens treated as negations: extract_features_with_vader checks each token
# against this list and, on a match, marks the next scored word as negated.
# Contractions are listed both with and without the apostrophe.
negation_words = ["aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
     "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
     "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
     "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
     "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere",
     "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
     "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
     "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"]

In [None]:
# Update feature extraction with VADER scores
def extract_features_with_vader(text):
    """Compute lexicon and n-gram sentiment features for a single review.

    Reads the module-level ``all_stop_words``, ``negation_words``,
    ``vader_dict``, ``positive_ngrams`` and ``negative_ngrams``.

    Args:
        text: Review text. ``None``/NaN is treated as an empty review and any
            other non-string value is converted with ``str``.

    Returns:
        pd.Series with ``pos_word_count``, ``neg_word_count``,
        ``negated_pos_count``, ``negated_neg_count``, ``pos_ngram_count``,
        ``neg_ngram_count``, ``polarity_score``, ``exclamation_count`` and
        ``uppercase_ratio``.
    """
    # Missing or non-string reviews would otherwise crash on .lower()
    if not isinstance(text, str):
        try:
            is_missing = text is None or bool(pd.isna(text))
        except (TypeError, ValueError):
            is_missing = False
        text = '' if is_missing else str(text)

    # word_tokenize splits contractions ("don't" -> "do", "n't"), so the
    # apostrophe forms in negation_words never appear as a single token;
    # accept the split-off "n't" as a negation marker as well.
    negations = set(negation_words)
    negations.add("n't")
    clause_breaks = {'.', ',', ';', '!'}

    raw_tokens = word_tokenize(text.lower())

    # Stop-word-free tokens feed the n-grams. Membership is tested against
    # the set directly; no per-token list copy and no per-review progress bar.
    content_tokens = [w for w in raw_tokens if w not in all_stop_words]
    # Unigram scoring additionally keeps negation words: several of them
    # ("not", "no", "nor", ...) are stop words and were previously dropped
    # before the negation check could ever see them.
    scoring_tokens = [
        w for w in raw_tokens if w in negations or w not in all_stop_words
    ]

    # Unigram features
    pos_count = 0
    neg_count = 0
    negated_pos_count = 0
    negated_neg_count = 0
    polarity = 0

    # N-gram features
    pos_ngram_count = 0
    neg_ngram_count = 0

    # Generate bigrams and trigrams
    bigrams = [' '.join(content_tokens[i:i+2]) for i in range(len(content_tokens)-1)]
    trigrams = [' '.join(content_tokens[i:i+3]) for i in range(len(content_tokens)-2)]

    # N-gram scoring: a known n-gram adds its precomputed polarity
    for ngram in bigrams + trigrams:
        if ngram in positive_ngrams:
            pos_ngram_count += 1
            polarity += positive_ngrams[ngram]
        elif ngram in negative_ngrams:
            neg_ngram_count += 1
            polarity += negative_ngrams[ngram]

    # Unigram scoring with negation
    negated = False
    for token in scoring_tokens:
        if token in negations:
            negated = True
            continue

        score = vader_dict.get(token, 0)
        if negated and token not in clause_breaks:
            # A negated word contributes half its score with the sign flipped;
            # the negation is consumed by the first word that follows it
            if score > 0:
                negated_pos_count += 1
                polarity += score * -0.5
            elif score < 0:
                negated_neg_count += 1
                polarity += score * -0.5
            negated = False
        else:
            if score > 0:
                pos_count += 1
                polarity += score
            elif score < 0:
                neg_count += 1
                polarity += score

        # Punctuation ends the scope of a pending negation
        if token in clause_breaks:
            negated = False

    # Surface cues are measured on the untouched text
    exclamations = text.count('!')
    uppercase_ratio = sum(c.isupper() for c in text) / len(text) if len(text) > 0 else 0

    return pd.Series({
        'pos_word_count': pos_count,
        'neg_word_count': neg_count,
        'negated_pos_count': negated_pos_count,
        'negated_neg_count': negated_neg_count,
        'pos_ngram_count': pos_ngram_count,
        'neg_ngram_count': neg_ngram_count,
        'polarity_score': polarity,
        'exclamation_count': exclamations,
        'uppercase_ratio': uppercase_ratio
    })


# One progress bar over all reviews while the feature rows are computed
tqdm.pandas(desc='Extracting features...')
#df = df.head().copy()
review_texts = df['review']
features = review_texts.progress_apply(extract_features_with_vader)

In [None]:
features.columns

In [None]:
from sklearn.model_selection import train_test_split
import os
import pickle
import zipfile

# Three-class target from the rating: 2 for ratings >= 4, 1 for exactly 3, 0 otherwise
#df['sentiment'] = df['rating'].apply(lambda x: 1 if x >= 4 else (0 if x <= 2 else None))
features['sentiment'] = df['rating'].apply(lambda stars: 2 if stars >= 4 else int(stars == 3))
#data = data.dropna(subset=['sentiment'])  # Drop neutral (3)

# Feature columns
feature_cols = [
    'pos_word_count', 'neg_word_count',
    'negated_pos_count', 'negated_neg_count',
    'pos_ngram_count', 'neg_ngram_count',
    'polarity_score', 'exclamation_count', 'uppercase_ratio',
]
# The raw review text is carried alongside the engineered features
X = pd.concat([features[feature_cols], df['review']], axis=1)
y = features['sentiment']

# Stratified 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")

# Save as pickle files
output_dir = '/kaggle/working/'  # Kaggle default; adjust for local (e.g., './data/')
os.makedirs(output_dir, exist_ok=True)

files_to_save = {
    'X_train.pkl': X_train,
    'X_test.pkl': X_test,
    'y_train.pkl': y_train,
    'y_test.pkl': y_test
}

print("Saving datasets as pickle files...")
for filename, obj in tqdm(files_to_save.items(), desc="Saving Pickle Files"):
    pickle_path = os.path.join(output_dir, filename)
    with open(pickle_path, 'wb') as f:
        pickle.dump(obj, f)

print(f"Saved pickle files to {output_dir}: {list(files_to_save.keys())}")

# Bundle the four pickles into one compressed archive, stored under their bare file names
zip_filename = os.path.join(output_dir, 'sentiment_data.zip')
print("Zipping files...")
with zipfile.ZipFile(zip_filename, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
    for filename in tqdm(files_to_save.keys(), desc="Adding Files to Zip"):
        pickle_path = os.path.join(output_dir, filename)
        zipf.write(pickle_path, arcname=filename)

print(f"Saved and zipped files to {zip_filename}")

**NOTE:** It looks like we lose roughly 600 reviews in the inner join (still to be confirmed). If the course info is not needed, we could drop it.

In [None]:
'''
from transformers import BartTokenizer, BartForConditionalGeneration
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

def summarize(text):
    inputs = tokenizer(text, max_length=1024, truncation=True, return_tensors='pt')
    summary_ids = model.generate(inputs['input_ids'], max_length=50, min_length=10, length_penalty=2.0, num_beams=4)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

real_reviews['summary'] = real_reviews['review_text'].apply(summarize)
'''

In [None]:
'''
from transformers import pipeline
sentiment_classifier = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')
real_reviews['sentiment'] = real_reviews['review_text'].apply(lambda x: sentiment_classifier(x)[0]['label'])
'''

# Disabled: this check needs `real_reviews` and its 'sentiment' column, which
# only the commented-out classifier code would create. Left live, these
# statements raise NameError when the cell runs.
'''
real_reviews['rating_sentiment'] = real_reviews['rating'].apply(lambda x: 'POSITIVE' if x >= 4 else 'NEGATIVE')
accuracy = (real_reviews['sentiment'] == real_reviews['rating_sentiment']).mean()
print(f"Sentiment Accuracy (using ratings): {accuracy:.2f}")
'''