In [1]:
!pip -q install transformers huggingface_hub langdetect pycountry

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m63.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h  Building wheel for langdetect (setup.py) ... [?25l[?25hdone


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import scipy.stats as stats
import re
import nltk
from nltk.corpus import words
from collections import Counter
import math
import unicodedata
import time
from tqdm import tqdm
import pickle
from langdetect import detect
import warnings
import pycountry
from scipy import stats
from collections import defaultdict
import itertools
import os

In [3]:
# Load the pre-split sentiment data (features already calculated) from pickles.
X_train = pd.read_pickle('/kaggle/input/reviews-analyzer-dataset/sentiment_data/X_train.pkl')
y_train = pd.read_pickle('/kaggle/input/reviews-analyzer-dataset/sentiment_data/y_train.pkl')
X_test = pd.read_pickle('/kaggle/input/reviews-analyzer-dataset/sentiment_data/X_test.pkl')
y_test = pd.read_pickle('/kaggle/input/reviews-analyzer-dataset/sentiment_data/y_test.pkl')

# Number of rows drawn from each split below.
sample_size = 100
# Delete an embeddings file left in the working directory by an earlier run, if present.
file_path = '/kaggle/working/embeddings.npy'
if os.path.exists(file_path):
    os.remove(file_path)
    print(f"File '{file_path}' has been deleted.")
else:
    print(f"File '{file_path}' does not exist.")

# Recombine features and labels column-wise because the data was stored already split,
# then keep a random sample of each split.
# NOTE(review): .sample() is called without random_state, so the selected rows change on every run.
train_df = pd.concat([X_train, y_train], axis=1).sample(n=sample_size)
test_df = pd.concat([X_test, y_test], axis=1).sample(n=sample_size)

# Show which columns the recombined frame carries.
print(train_df.columns)

File '/kaggle/working/embeddings.npy' does not exist.
Index(['pos_word_count', 'neg_word_count', 'negated_pos_count',
       'negated_neg_count', 'pos_ngram_count', 'neg_ngram_count',
       'polarity_score', 'exclamation_count', 'uppercase_ratio', 'review',
       'sentiment'],
      dtype='object')


In [4]:
# FEATURE - Calculate entropy
def calculate_entropy(text):
    """Return the Shannon entropy (bits per character) of ``text``.

    The text is lower-cased first, so upper- and lower-case forms of a letter
    count as the same symbol.

    Args:
        text: Review text. Anything that is not a non-empty ``str`` (``None``,
            ``NaN``, ``pd.NA``, numbers, ``""``) is treated as carrying no
            information.

    Returns:
        The entropy as a float, or ``0`` for missing, non-string or empty input.
    """
    # The type check must come before any truthiness test: evaluating
    # ``not pd.NA`` raises TypeError, so ``if not text`` cannot go first.
    if not isinstance(text, str) or not text:
        return 0
    text = text.lower()
    length = len(text)
    char_counts = Counter(text)
    return -sum((count / length) * math.log2(count / length) for count in char_counts.values())

def create_entropy_feature(df, review_col='review'):
    """Add an ``entropy`` column computed from the text in ``review_col``.

    Args:
        df: DataFrame holding the review text; it is modified in place.
        review_col: Name of the column containing the review text.

    Returns:
        The same ``df`` object, now carrying the ``entropy`` column.
    """
    tqdm.pandas(desc='Calculating entropies: ')
    # Honour the caller's column name instead of a hard-coded 'review'.
    df['entropy'] = df[review_col].progress_apply(calculate_entropy)
    return df

# Suppress UserWarning messages for cleaner output (intended for langdetect).
# NOTE(review): the filter has no module restriction, so UserWarnings from every library are hidden.
warnings.filterwarnings("ignore", category=UserWarning)

def detect_language(text):
    """Return the language reported by ``langdetect.detect`` or ``'unknown'``.

    Args:
        text: Review text. Non-strings (``None``, ``NaN``, numbers) and strings
            with fewer than 3 non-surrounding-whitespace characters are not
            sent to the detector.

    Returns:
        Whatever ``detect(text)`` returns, or the sentinel ``'unknown'`` when
        the input is unusable or detection fails.
    """
    if not isinstance(text, str) or len(text.strip()) < 3:
        return 'unknown'  # For NaN, empty, or very short text
    try:
        return detect(text)
    except Exception:
        # Detection failures map to the sentinel; KeyboardInterrupt and
        # SystemExit are deliberately NOT caught so a long run can be stopped.
        return 'unknown'

# Returns 1 if the language could NOT be detected ('unknown'), 0 if it could
def cannot_detect_language(text):
    """Flag the ``'unknown'`` language sentinel.

    Args:
        text: A language value, as stored in the ``language`` column.

    Returns:
        ``1`` when ``text`` equals ``'unknown'``, otherwise ``0``.
    """
    return 1 if text == 'unknown' else 0

def create_can_detect_feature(df, review_col='review'):
    """Add ``language`` and ``cannot_detect_language`` columns to ``df``.

    ``language`` holds the result of ``detect_language`` for each value in
    ``review_col``; ``cannot_detect_language`` is derived from it with
    ``cannot_detect_language``. ``df`` is modified in place and returned.
    """
    tqdm.pandas(desc="Detecting Language...")
    detected = df[review_col].progress_apply(detect_language)
    df['language'] = detected
    # Derive the flag from the stored column, as a second progress pass.
    undetected_flag = df['language'].progress_apply(cannot_detect_language)
    df['cannot_detect_language'] = undetected_flag
    return df

# 3x FEATURES - Total characters in review, word count of review, avg word length
def word_count(text):
    """Count the whitespace-separated words in ``text``.

    Args:
        text: Review text. Non-string values (``None``, ``NaN``) count as
            having no words.

    Returns:
        Number of words; ``0`` for empty, whitespace-only or non-string input.
    """
    if not isinstance(text, str):
        return 0
    # str.split() with no separator splits on runs of whitespace and drops
    # leading/trailing whitespace, so an empty review yields 0 words instead of
    # the single empty token a regex split on '' would produce.
    return len(text.split())

def char_count(text):
    """Return the number of characters in ``text``.

    Args:
        text: Review text (any sized object is accepted).

    Returns:
        ``len(text)``, or ``0`` when ``text`` has no length (``None``, ``NaN``),
        so missing reviews do not abort the feature pass.
    """
    try:
        return len(text)
    except TypeError:
        return 0

def create_word_and_char_counts_feature(df, review_col='review'):
    """Add ``word_count`` and ``n_chars`` columns computed from ``review_col``.

    ``df`` is modified in place and returned.
    """
    tqdm.pandas(desc='Getting word/char counts...')
    # (output column, per-review function), applied in this order.
    for column, counter in (('word_count', word_count), ('n_chars', char_count)):
        df[column] = df[review_col].progress_apply(counter)
    return df

def get_avg_word_length(text):
    """Return the mean length of the whitespace-separated words in ``text``.

    Args:
        text: Review text. Non-string values (``None``, ``NaN``) are treated
            as containing no words.

    Returns:
        Average word length as a float; ``0.0`` when there are no words.
    """
    if not isinstance(text, str):
        return 0.0
    # str.split() with no separator splits on whitespace runs and never yields
    # empty tokens, so no regex or empty-word filtering is needed.
    tokens = text.split()
    if not tokens:
        return 0.0
    return sum(len(token) for token in tokens) / len(tokens)

def create_avg_word_length_feature(df, review_col='review'):
    """Add an ``avg_word_length`` column computed from ``review_col``.

    ``df`` is modified in place and returned.
    """
    tqdm.pandas(desc='Getting avg word length feature...')
    avg_lengths = df[review_col].progress_apply(get_avg_word_length)
    df['avg_word_length'] = avg_lengths
    return df

# FEATURE - Amount of repetition (length of the longest run of one repeated character)
def get_max_repeated(text):
    """Return the length of the longest run of identical consecutive items.

    For a string this is the longest streak of one repeated character
    (``"aaabb"`` -> 3). Empty input gives ``0``.
    """
    run_lengths = (len(list(run)) for _, run in itertools.groupby(text))
    return max(run_lengths, default=0)
def create_repetition_feature(df, review_col='review'):
    """Add a ``max_repeated`` column computed from ``review_col``.

    ``df`` is modified in place and returned.
    """
    tqdm.pandas(desc='Creating repetition feature...')
    longest_runs = df[review_col].progress_apply(get_max_repeated)
    df['max_repeated'] = longest_runs
    return df

def get_punct_ratio(text):
    """Return the share of characters in ``text`` that are one of ``.,!?``.

    The denominator is clamped to at least 1, so empty text yields 0.
    """
    punctuation_marks = '.,!?'
    hits = 0
    for ch in text:
        if ch in punctuation_marks:
            hits += 1
    return hits / max(1, len(text))
    
def create_punct_ratio_feature(df, review_col='review'):
    """Add a ``punct_ratio`` column computed from ``review_col``.

    ``df`` is modified in place and returned.
    """
    tqdm.pandas(desc='Creating punctuation ratio feature...')
    ratios = df[review_col].progress_apply(get_punct_ratio)
    df['punct_ratio'] = ratios
    return df

# FEATURE - Contains common n-grams
# Step 1 - Build n-gram reference from sample of real reviews
def build_ngram_reference(texts, n=2, top_k=1000, sample_size=10000):
    """Collect the ``top_k`` most frequent character n-grams found in ``texts``.

    Args:
        texts: Sized collection of review texts (list, array or Series). Each
            item is passed through ``str()`` and lower-cased before scanning.
        n: Number of characters per n-gram.
        top_k: How many of the most common n-grams to keep.
        sample_size: Maximum number of texts scanned. Larger inputs are randomly
            subsampled without replacement using NumPy's global random state.

    Returns:
        A set of n-gram strings. N-grams consisting only of whitespace are
        never counted.
    """
    texts = list(texts)

    # Sample texts to avoid over-processing (e.g., 1.19M reviews).
    if len(texts) > sample_size:
        # Draw positions rather than the strings themselves: handing a list of
        # strings to np.random.choice first copies every text into a single
        # fixed-width unicode array sized by the longest review.
        chosen = np.random.choice(len(texts), sample_size, replace=False)
        texts = [texts[i] for i in chosen]

    ngram_counts = Counter()
    for text in tqdm(texts, desc="Building n-gram reference..."):
        text = str(text).lower()
        ngram_counts.update(
            text[i:i + n]
            for i in range(len(text) - n + 1)
            if not text[i:i + n].isspace()
        )

    # Keep only the top k most common n-grams.
    return {ngram for ngram, _ in ngram_counts.most_common(top_k)}

# FEATURE - ngram coherence, fraction of ngrams that appear in list of common ngrams
def get_ngram_coherence(text, n=2, reference=None):
    """Return the fraction of character n-grams of ``text`` found in a reference set.

    Args:
        text: Review text; it is lower-cased before scanning. Non-string values
            (``None``, ``NaN``) score ``0.0``.
        n: Number of characters per n-gram.
        reference: Container of known n-grams to test membership against. When
            omitted, the notebook-level ``ngram_ref`` variable is used, which is
            what this function always did before the parameter existed.

    Returns:
        ``matches / max(1, len(text) - n + 1)`` as a float.
    """
    if not isinstance(text, str):
        return 0.0
    if reference is None:
        # Backward-compatible fallback to the module-level reference set.
        reference = ngram_ref
    text_lower = text.lower()
    total_ngrams = max(1, len(text_lower) - n + 1)
    valid_ngrams = sum(1 for i in range(total_ngrams) if text_lower[i:i + n] in reference)
    return valid_ngrams / total_ngrams


def create_ngram_coherence_feature(df, ngram_ref, review_col='review'):
    """Add an ``ngram_coherence`` column scored against ``ngram_ref``.

    Args:
        df: DataFrame holding the review text; it is modified in place.
        ngram_ref: Reference n-gram container handed to ``get_ngram_coherence``.
            Passing ``None`` makes the scorer fall back to the notebook-level
            ``ngram_ref``.
        review_col: Name of the column containing the review text.

    Returns:
        The same ``df`` object, now carrying the ``ngram_coherence`` column.
    """
    tqdm.pandas(desc='Calcualting ngram coherenece...')
    # Forward the argument explicitly; previously it was accepted but the scorer
    # silently read whatever global happened to be named ngram_ref.
    df['ngram_coherence'] = df[review_col].progress_apply(
        lambda text: get_ngram_coherence(text, reference=ngram_ref)
    )
    return df

In [None]:
from transformers import XLMRobertaTokenizer, XLMRobertaModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained XLM-R model and tokenizer
# NOTE(review): a later cell rebinds the global name `model` to an unpickled classifier, while
# get_embeddings reads the global `model`; re-running embedding code after that cell would use
# the wrong object.
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
model = XLMRobertaModel.from_pretrained('xlm-roberta-base')
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Switch the model to evaluation mode; it is only used for inference here.
model.eval()

# Function to get embeddings in batches
def get_embeddings(texts, batch_size=32):
    """Embed ``texts`` in batches and return one row per text.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``. Each batch
    is tokenized with padding and truncation to 128 tokens, run through the
    model without gradient tracking, and the hidden state at the first token
    position is kept as that text's embedding.

    Args:
        texts: Sliceable sequence of strings.
        batch_size: Number of texts per forward pass.

    Returns:
        A NumPy array formed by stacking the per-batch embeddings vertically.
    """
    chunks = []
    batch_starts = range(0, len(texts), batch_size)
    for start in tqdm(batch_starts, desc="Generating embeddings"):
        encoded = tokenizer(
            texts[start:start + batch_size],
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=128,
        )
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
        # Keep only the first token position of every sequence in the batch.
        chunks.append(hidden_states[:, 0, :].cpu().numpy())
    return np.vstack(chunks)
    
# Compute centroid from substantive reviews
def compute_centroid(df, review_col='review', label_col='is_gibberish', sample_size=10000):
    """Return the mean embedding of the reviews labelled ``0`` in ``label_col``.

    Args:
        df: DataFrame with the review text and labels (training data only).
        review_col: Name of the column containing the review text.
        label_col: Name of the label column; rows equal to ``0`` are used.
        sample_size: Maximum number of reviews embedded. Larger sets are
            randomly subsampled without replacement using NumPy's global
            random state.

    Returns:
        The element-wise mean of the embeddings (``np.mean(..., axis=0)``).

    Raises:
        ValueError: If no non-null review has label ``0``; there is nothing to
            average in that case.
    """
    real_texts = df.loc[df[label_col] == 0, review_col].dropna().tolist()
    if not real_texts:
        raise ValueError(
            f"No non-null reviews with {label_col} == 0; cannot compute a centroid."
        )
    if len(real_texts) > sample_size:
        # Sample positions, not strings: np.random.choice over a list of strings
        # first copies all of them into one fixed-width unicode array sized by
        # the longest review.
        chosen = np.random.choice(len(real_texts), sample_size, replace=False)
        real_texts = [real_texts[i] for i in chosen]
    embeddings = get_embeddings(real_texts)
    return np.mean(embeddings, axis=0)

# Add embedding-based features
def add_embedding_features(df, centroid, review_col='review', embed_path=None):
    """Add ``cosine_to_centroid`` and ``anomaly_score`` columns to ``df``.

    Args:
        df: DataFrame holding the review text; it is modified in place.
        centroid: 1-D reference embedding to compare every review against.
        review_col: Name of the column containing the review text; missing
            values are embedded as the empty string.
        embed_path: Optional ``.npy`` path used as an embeddings cache. An
            existing file is reused only when its row count matches ``df``;
            otherwise embeddings are recomputed and the file is overwritten.

    Returns:
        The same ``df`` object with the two new columns:
        ``cosine_to_centroid`` (cosine similarity to ``centroid``) and
        ``anomaly_score`` (Euclidean distance to ``centroid``).
    """
    embeddings = None
    if embed_path and os.path.exists(embed_path):
        print(f"Loading embeddings from {embed_path}")
        cached = np.load(embed_path)
        if len(cached) == len(df):
            embeddings = cached
        else:
            # A cache written for a different frame would either fail on column
            # assignment or attach embeddings to the wrong reviews.
            print(
                f"Cached embeddings have {len(cached)} rows but the frame has "
                f"{len(df)}; recomputing."
            )

    if embeddings is None:
        texts = df[review_col].fillna('').tolist()
        embeddings = get_embeddings(texts)
        if embed_path:
            np.save(embed_path, embeddings)
            print(f"Saved embeddings to {embed_path}")

    # Cosine similarity to centroid
    cosine_sim = cosine_similarity(embeddings, centroid.reshape(1, -1)).flatten()

    # Anomaly score (Euclidean distance)
    anomaly_score = np.linalg.norm(embeddings - centroid, axis=1)

    df['cosine_to_centroid'] = cosine_sim
    df['anomaly_score'] = anomaly_score
    return df

In [None]:
def create_feature_df(df, review_col='review', ngram_ref=None, centroid=None):
    """Run every feature builder over ``df`` and return the enriched frame.

    The builders are applied in order: entropy, language detection, word and
    character counts, average word length, repetition, punctuation ratio,
    n-gram coherence and embedding-based features.

    Args:
        df: DataFrame holding the review text; each builder receives it in turn.
        review_col: Name of the column containing the review text. It is
            forwarded to every builder.
        ngram_ref: Reference n-grams passed to the n-gram coherence builder.
        centroid: Reference embedding passed to the embedding feature builder.

    Returns:
        The frame returned by the last builder.
    """
    df = create_entropy_feature(df, review_col=review_col)
    df = create_can_detect_feature(df, review_col=review_col)
    df = create_word_and_char_counts_feature(df, review_col=review_col)
    df = create_avg_word_length_feature(df, review_col=review_col)
    df = create_repetition_feature(df, review_col=review_col)
    df = create_punct_ratio_feature(df, review_col=review_col)
    df = create_ngram_coherence_feature(df, ngram_ref, review_col=review_col)
    df = add_embedding_features(df, centroid, review_col=review_col)
    return df

In [None]:
# Load precomputed centroid and ngram_ref from Amazon full training set
# NOTE(review): pickle.load can execute arbitrary code from the file; only load trusted artifacts.
# Reference n-gram collection used by the n-gram coherence feature.
with open('/kaggle/input/reviews-analyzer-dataset/coursera_gibberish/ngram_ref.pkl', 'rb') as f:
    ngram_ref = pickle.load(f)
# Reference embedding used by the embedding-based features.
with open('/kaggle/input/reviews-analyzer-dataset/coursera_gibberish/centroid.pkl', 'rb') as f:
    centroid = pickle.load(f)

# Step 3: Create feature DataFrames
# The feature builders add columns to the frame they receive and return it, so
# train_features / test_features are the same objects as train_df / test_df.
train_features = create_feature_df(train_df, ngram_ref=ngram_ref, centroid=centroid)
test_features = create_feature_df(test_df, ngram_ref=ngram_ref, centroid=centroid)

In [None]:
# Inspect the column names of the training frame after feature creation.
train_features.columns

In [None]:
# Choose the relevant columns
model_features = ['cannot_detect_language', 'entropy', 'word_count', 'avg_word_length',
                  'ngram_coherence', 'anomaly_score', 'punct_ratio', 'max_repeated']
# reindex keeps exactly these columns in this order; a name missing from train_features would
# come back as an all-NaN column instead of raising. All other columns are dropped.
feed_into_model_df = train_features.reindex(columns=model_features)

# Run the model on coursera stuff
model_path = '/kaggle/input/reviews-analyzer-dataset/gibberish_random_forest_model.pkl'
# NOTE(review): pickle.load can execute arbitrary code from the file; only load trusted artifacts.
# NOTE(review): this rebinds the global `model`, which get_embeddings also reads for the
# transformer; running embedding code after this cell would call the classifier instead.
with open(model_path, 'rb') as f:
    model = pickle.load(f)
# Predict on the feature-only frame, before the extra columns below are attached.
feed_into_model_df['is_gibberish'] = model.predict(feed_into_model_df)
# Carry the raw text and polarity score along for later inspection.
feed_into_model_df['review'] = train_features['review']
feed_into_model_df['polarity_score'] = train_features['polarity_score']

In [None]:
# Count how many reviews fall into each predicted is_gibberish value.
feed_into_model_df.is_gibberish.value_counts()

In [None]:
# Add a simple rule for short reviews
def adjust_predictions(X, predictions, threshold=5, polarity_min=0.3,
                       override_label='non-gibberish'):
    """Override predictions for rows with few sentiment words and high polarity.

    A row is overridden when ``pos_word_count + neg_word_count < threshold``
    and ``polarity_score > polarity_min``.

    Args:
        X: DataFrame with ``pos_word_count``, ``neg_word_count`` and
            ``polarity_score`` columns, row-aligned by position with
            ``predictions``.
        predictions: Predicted labels (list, NumPy array or pandas Series). It
            is copied; the input is left untouched.
        threshold: Upper bound (exclusive) on the summed sentiment-word counts.
        polarity_min: Lower bound (exclusive) on the polarity score.
        override_label: Value written into overridden positions. Use a label
            of the same type as ``predictions`` (e.g. ``0`` for integer labels).

    Returns:
        A copy of ``predictions`` with the qualifying positions replaced.
    """
    sentiment_words = X['pos_word_count'].to_numpy() + X['neg_word_count'].to_numpy()
    polarity = X['polarity_score'].to_numpy()
    override = (sentiment_words < threshold) & (polarity > polarity_min)

    adjusted_preds = predictions.copy()
    is_series = isinstance(adjusted_preds, pd.Series)
    for position in np.flatnonzero(override):
        position = int(position)
        if is_series:
            # Plain ``series[i] = ...`` is label-based and would hit the wrong
            # row (or append a new one) when the index is not 0..n-1.
            adjusted_preds.iloc[position] = override_label
        else:
            adjusted_preds[position] = override_label
    return adjusted_preds

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def explore_gibberish_reviews(df):
    """Print summary statistics and plot feature diagnostics for ``df``.

    Produces, in order: ``describe()`` output, one box plot per model feature
    split by ``is_gibberish``, a correlation heatmap of the model features,
    and a scatter plot of sentiment-word count against polarity score.

    Args:
        df: DataFrame containing the eight model feature columns and
            ``is_gibberish``. The final scatter plot additionally needs
            ``pos_word_count``, ``neg_word_count`` and ``polarity_score``;
            when any of these is missing that plot is skipped. When it is
            drawn, a ``total_word_count`` column is added to ``df``.

    Returns:
        None. Output is printed and shown through matplotlib.
    """
    # 1. Basic statistics
    print("Basic Statistics:")
    print(df.describe())

    # 2. Distribution of numerical features by predicted gibberish label
    numerical_cols = ['cannot_detect_language', 'entropy', 'word_count', 'avg_word_length',
                  'ngram_coherence', 'anomaly_score', 'punct_ratio', 'max_repeated']

    fig, axes = plt.subplots(3, 3, figsize=(15, 12))
    axes = axes.ravel()

    for idx, col in enumerate(numerical_cols):
        sns.boxplot(data=df, x='is_gibberish', y=col, ax=axes[idx])
        axes[idx].set_title(f'Distribution of {col}')
        axes[idx].tick_params(axis='x', rotation=45)

    # The grid has more cells than features; hide the leftover empty axes.
    for unused_ax in axes[len(numerical_cols):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

    # 3. Correlation heatmap
    plt.figure(figsize=(10, 8))
    correlation_matrix = df[numerical_cols].corr()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
    plt.title('Correlation Between Features')
    plt.show()

    # 4. Word count analysis
    scatter_inputs = ['pos_word_count', 'neg_word_count', 'polarity_score']
    missing = [col for col in scatter_inputs if col not in df.columns]
    if missing:
        # Frames reduced to the model features do not carry these columns;
        # skip the plot rather than failing after the earlier plots were shown.
        print(f"Skipping word count analysis; missing columns: {missing}")
        return

    df['total_word_count'] = df['pos_word_count'] + df['neg_word_count']
    plt.figure(figsize=(8, 6))
    sns.scatterplot(data=df, x='total_word_count', y='polarity_score',
                   hue='is_gibberish', alpha=0.6)
    plt.title('Word Count vs Polarity Score by Gibberish')
    plt.show()
    

# Run the exploration on the model-input frame.
# NOTE(review): feed_into_model_df is built by reindexing to the model features and then adding
# is_gibberish, review and polarity_score, so it has no pos_word_count / neg_word_count columns,
# which the word-count section of explore_gibberish_reviews reads.
explore_gibberish_reviews(feed_into_model_df)

In [None]:
'''
# 4. Analyze false positives (non-gibberish labeled as gibberish)
    # Assuming 'sentiment' column has 'gibberish' and 'non-gibberish' labels
    false_positives = df[(df['review'].str.contains('great', case=False))]
    
    print(f"\nNumber of false positives with 'great': {len(false_positives)}")
    print("\nSample of false positives:")
    print(false_positives[['cannot_detect_language', 'entropy', 'word_count', 
                           'avg_word_length','ngram_coherence', 'anomaly_score', 
                           'punct_ratio', 'max_repeated', 'review', 
                           'cannot_detect_language']].head())
    
    # 5. Feature distributions for false positives vs all data
    plt.figure(figsize=(12, 4))
    
    plt.subplot(1, 3, 1)
    sns.kdeplot(data=df, x='polarity_score', label='All')
    sns.kdeplot(data=false_positives, x='polarity_score', label='False Positives')
    plt.title('Polarity Score Distribution')
    plt.legend()
    
    plt.subplot(1, 3, 2)
    sns.kdeplot(data=df, x='uppercase_ratio', label='All')
    sns.kdeplot(data=false_positives, x='uppercase_ratio', label='False Positives')
    plt.title('Uppercase Ratio Distribution')
    plt.legend()
    
    plt.subplot(1, 3, 3)
    sns.kdeplot(data=df, x='exclamation_count', label='All')
    sns.kdeplot(data=false_positives, x='exclamation_count', label='False Positives')
    plt.title('Exclamation Count Distribution')
    plt.legend()
    
    plt.tight_layout()
    plt.show()

     # 7. Print some example reviews
    print("\nExample reviews labeled as gibberish with 'GREAT AND EXEMPLARY':")
    exemplar_cases = df[df['review'].str.contains('great and exemplary', 
                        case=False) & (df['sentiment'] == 'gibberish')]
    if not exemplar_cases.empty:
        print(exemplar_cases[['review'] + numerical_cols].head())
    else:
        print("No exact matches found for 'GREAT AND EXEMPLARY'")
    '''