# Gibberish Detector
## "Gibberish" vs. "Substantive"
- **Gibberish:** Random strings (e.g., "asdfjkl"), typos ("jhkd"), or meaningless repetition ("aaa").
- **Substantive:** Coherent sentences with intent (e.g., "Great course, learned a lot" or "这是一个很好的课程").
- **Edge Cases:** Short but valid reviews (e.g., "Good"), multilingual mixes, or sarcasm.

In [1]:
!pip -q install transformers huggingface_hub langdetect pycountry

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25h  Building wheel for langdetect (setup.py) ... [?25l[?25hdone


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import scipy.stats as stats
import re
import nltk
from nltk.corpus import words
from collections import Counter
import math
import unicodedata
import time
from tqdm import tqdm
import pickle
from langdetect import detect
import warnings
import pycountry
from scipy import stats
from collections import defaultdict
import itertools

---------------------BEGIN DATA IMPORT------------

### Importing Data
- The original data used an older character encoding
- We'll save them as utf-8 so we can add the csv's to our project dataset

In [None]:
'''
# Try different encodings
file_path = '/kaggle/input/gibberish-text-classification/Amazon.csv'  # Replace with your file path
try:
    df = pd.read_csv(file_path, encoding='utf-8')  # Default, might fail
except UnicodeDecodeError:
    print("UTF-8 failed, trying other encodings...")
    try:
        df = pd.read_csv(file_path, encoding='windows-1252')
        print("Loaded with Windows-1252")
    except UnicodeDecodeError:
        try:
            df = pd.read_csv(file_path, encoding='iso-8859-1')
            print("Loaded with ISO-8859-1")
        except UnicodeDecodeError:
            try:
                df = pd.read_csv(file_path, encoding='utf-16')
                print("Loaded with UTF-16")
            except UnicodeDecodeError:
                print("All common encodings failed. Check file encoding or corruption.")
'''                

In [None]:
'''
# Load with Windows-1252, save as UTF-8 for adding to our custom kaggle dataset
def convert_encoding_type(file, out):
    # Convert data to more universal encoding type
    df = pd.read_csv(file, encoding='windows-1252')
    df.to_csv(out, encoding='utf-8', index=False)
    print(f"Converted and saved as {out}")
    return

convert_encoding_type('/kaggle/input/gibberish-text-classification/Amazon.csv', 'amazon_reviews.csv')
convert_encoding_type('/kaggle/input/gibberish-text-classification/Gibberish.csv', 'amazon_gibberish.csv')
'''

-----------------------------END DATA CREATION---------------------


## EDA and Feature Extraction

In [3]:
reviews_df = pd.read_csv('/kaggle/input/gibberish-text-classification/Amazon.csv', encoding='windows-1252', header=None, names=['rating', 'review'])
gibb_df = pd.read_csv('/kaggle/input/gibberish-text-classification/Gibberish.csv', encoding='windows-1252')

According to the original data set at https://www.kaggle.com/datasets/bittlingmayer/amazonreviews, __label__2 means 4 and 5 star product reviews and label 1 is 1 and 2 start reviews. Looks like there's roughly equal amounts of both (positive and negative sentiment)

In [None]:
reviews_df.rating.value_counts()

In [None]:
reviews_df.head()

In [4]:
# Create sentiment column with 1 as positive (4/5 star), 0 negative (1,2 star)
reviews_df['sentiment'] = (reviews_df['rating'] == '__label__2')*1
reviews_df['is_gibberish'] = 0

There is a massive class imbalance with over 1 million real reviews and only 3767 gibberish reviews. We'll need to do some resampling.

In [5]:
gibb_df.rename(columns={'Response': 'review', 'Label': 'is_gibberish'}, inplace=True)

In [6]:
# Make new merged dataframe of just the reviews and whether they are gibberish
merged_df = pd.concat([gibb_df, reviews_df[['review', 'is_gibberish']]], ignore_index=True)
merged_df

Unnamed: 0,review,is_gibberish
0,ggg,1
1,hgghghghghghghhg,1
2,ufdhgjndfnvbhfdjvnjkmfgbdfg,1
3,dbdbdbd,1
4,dfgdfgd,1
...,...,...
1052338,Cheap and flimsy: This was bought for an event...,0
1052339,Total waste of money: This was a total waste o...,0
1052340,Whitmor budget garment rack: I purchased the W...,0
1052341,Serves its purpose: I bought this to put in my...,0


In [7]:
len(merged_df) - len(gibb_df) - len(reviews_df)

0

### Functions to Build Features

In [8]:
# FEATURE - Calculate entropy
def calculate_entropy(text):
    """Calculate Shannon entropy of the text to detect randomness."""
    if not text:
        return 0
    if not isinstance(text, str) or pd.isna(text):
        return 0  # Return 0 for NaN or non-string values
    text = str(text).lower()
    length = len(text)
    if length == 0:  # Handle empty strings
        return 0
    char_counts = Counter(text)
    entropy = -sum((count/length) * math.log2(count/length) for count in char_counts.values())
    return entropy

def create_entropy_feature(df, review_col='review'):
    tqdm.pandas(desc='Calculating entropies: ')
    df['entropy'] = df['review'].progress_apply(calculate_entropy)
    return df

In [9]:
# FEATURE - Can detect language
from langdetect import detect
import warnings

# Suppress langdetect warnings for cleaner output
warnings.filterwarnings("ignore", category=UserWarning)

def detect_language(text):

    if not isinstance(text, str) or pd.isna(text) or len(text.strip()) < 3:
        return 'unknown'  # For NaN, empty, or very short text
    try:
        return detect(text)
    except:
        return 'unknown'  # Fallback for any detection errors

# Returns 0 if we can't find the langauge, 1 if we can
def cannot_detect_language(text):
    if text == 'unknown':
        return 1
    else:
        return 0

def create_can_detect_feature(df, review_col='review'):
    tqdm.pandas(desc="Detecting Language...")
    df['language'] = df[review_col].progress_apply(detect_language)
    df['cannot_detect_language'] = df['language'].progress_apply(cannot_detect_language)
    return df

The next feature we'll look at is a flag of type of a few main alphabets. In our EDA we showed that outside of English, the most popular of our Coursera review languages mostly used a Latin or very close to Latin alphabet (somali, afrikaans, and tagalog for example use some variation of latin). We'll make flags for 
1. chinese
2. cyrillic (for russian, slovenian)
3. hangul (for korean)
4. Latin or latin variant

In [10]:
# FEATURE - Type of Alphabet (run time 3 minutes)
def detect_alphabets(text):

    if not isinstance(text, str) or not text:
        return {
            'Chinese': {'present': False, 'count': 0},
            'Cyrillic': {'present': False, 'count': 0},
            'Hangul': {'present': False, 'count': 0},
            'Latin': {'present': False, 'count': 0}
        }
    
    # Define Unicode ranges
    ranges = {
        'Chinese': (0x4E00, 0x9FFF),         # CJK Unified Ideographs
        'Cyrillic': (0x0400, 0x04FF),       # Basic Cyrillic
        'Hangul': (0xAC00, 0xD7AF),         # Hangul Syllables
        'Latin': [(0x0000, 0x007F),         # Basic Latin
                  (0x00A0, 0x00FF),         # Latin-1 Supplement
                  (0x0100, 0x017F)],        # Latin Extended-A
    }
    
    # Count characters per alphabet
    alphabet_counts = defaultdict(int)
    for char in text:
        char_code = ord(char)
        
        # Check Chinese
        if ranges['Chinese'][0] <= char_code <= ranges['Chinese'][1]:
            alphabet_counts['Chinese'] += 1
        
        # Check Cyrillic
        if ranges['Cyrillic'][0] <= char_code <= ranges['Cyrillic'][1]:
            alphabet_counts['Cyrillic'] += 1
        
        # Check Hangul
        if ranges['Hangul'][0] <= char_code <= ranges['Hangul'][1]:
            alphabet_counts['Hangul'] += 1
        
        # Check Latin (multiple ranges)
        for start, end in ranges['Latin']:
            if start <= char_code <= end:
                alphabet_counts['Latin'] += 1
                break  # Stop after first match
    
    # Build result dictionary
    result = {
        'Chinese': {'present': alphabet_counts['Chinese'] > 0, 'count': alphabet_counts['Chinese']},
        'Cyrillic': {'present': alphabet_counts['Cyrillic'] > 0, 'count': alphabet_counts['Cyrillic']},
        'Abakada': {'present': alphabet_counts['Abakada'] > 0, 'count': alphabet_counts['Abakada']},
        'Hangul': {'present': alphabet_counts['Hangul'] > 0, 'count': alphabet_counts['Hangul']},
        'Latin': {'present': alphabet_counts['Latin'] > 0, 'count': alphabet_counts['Latin']}
    }
    
    return result

def create_alphabet_tag_feature(df, review_col='review'):
    # Create alphabets column with dictionary structure above
    tqdm.pandas(desc='Creating alphabet tagging feature...')
    df['alphabets'] = df[review_col].progress_apply(detect_alphabets)

    # Create one-hot tag of different alphabets
    df['has_chinese'] = df['alphabets'].apply(lambda x: x['Chinese']['present'])
    df['has_cyrillic'] = df['alphabets'].apply(lambda x: x['Cyrillic']['present'])
    df['has_abakada'] = df['alphabets'].apply(lambda x: x['Abakada']['present'])
    df['has_hangul'] = df['alphabets'].apply(lambda x: x['Hangul']['present'])
    df['has_latin'] = df['alphabets'].apply(lambda x: x['Latin']['present'])
    return df

In [11]:
# 3x FEATURES - Total characters in review, word count of review, avg word length
def word_count(text):
    words = re.split(f'\s+', text.strip())
    word_count = len(words)
    return word_count

def char_count(text):
    return len(text)

def create_word_and_char_counts_feature(df, review_col='review'):
    tqdm.pandas(desc='Getting word/char counts...')
    df['word_count'] = df[review_col].progress_apply(word_count)
    df['n_chars'] = df[review_col].progress_apply(char_count)
    return df

def get_avg_word_length(text):
    # avg word length
    words = re.split(f'\s+', text.strip())
    word_count = len(words)
    avg_word_length = sum(len(word) for word in words if word) / max(1, word_count) if words else 0
    return avg_word_length

def create_avg_word_length_feature(df, review_col='review'):
    tqdm.pandas(desc='Getting avg word length feature...')
    df['avg_word_length'] = df[review_col].progress_apply(get_avg_word_length)
    return df

In [12]:
# FEATURE - Amount of Reptition
def get_max_repeated(text):
    max_repeats = max([sum(1 for _ in g) for _, g in itertools.groupby(text)] or [0])
    return max_repeats
def create_repetition_feature(df, review_col='review'):
    tqdm.pandas(desc='Creating repetition feature...')
    df['max_repeated'] = df[review_col].progress_apply(get_max_repeated)
    return df

In [13]:
# FEATURE - Punctuation Ratio
def get_punct_ratio(text):
    char_length = len(text)
    punct_count = sum(1 for c in text if c in '.,!?')
    punct_ratio = punct_count / max(1, char_length)
    return punct_ratio
def create_punct_ratio_feature(df, review_col='review'):
    tqdm.pandas(desc='Creating punctuation ratio feature...')
    df['punct_ratio'] = df[review_col].progress_apply(get_punct_ratio)
    return df

In [14]:
# FEATURE - Contains common n-grams
# Step 1 - Build n-gram reference from sample of real reviews
def build_ngram_reference(texts, n=2, top_k=1000, sample_size=10000):
    """
    Build a set of common n-grams from a list of texts, assuming most are meaningful.
    WARNING - Must only be done on training set
    
    Parameters:
    - texts: List of text strings
    - n: N-gram size
    - top_k: Number of top n-grams to keep
    - sample_size: Number of texts to sample (to speed up)
    
    Returns:
    - Set of common n-grams
    """
    # Sample texts to avoid over-processing (e.g., 1.19M reviews)
    if len(texts) > sample_size:
        texts = np.random.choice(texts, sample_size, replace=False)
    
    # Generate n-grams
    ngrams = Counter()
    for text in tqdm(texts, desc="Building n-gram reference..."):
        text = str(text).lower()
        for i in range(len(text) - n + 1):
            ngram = text[i:i+n]
            if not ngram.isspace():
                ngrams[ngram] += 1
    
    # Return top k most common n-grams
    return set([ngram for ngram, _ in ngrams.most_common(top_k)])

In [15]:
# FEATURE - ngram coherence, fraction of ngrams that appear in list of common ngrams
def get_ngram_coherence(text, n=2):
    text_lower = text.lower()
    total_ngrams = max(1, len(text_lower) - n + 1)
    valid_ngrams = sum(1 for i in range(total_ngrams) if text_lower[i:i+n] in ngram_ref)
    ngram_coherence = valid_ngrams / total_ngrams
    return ngram_coherence

def create_ngram_coherence_feature(df, ngram_ref, review_col='review'):
    tqdm.pandas(desc='Calcualting ngram coherenece...')
    df['ngram_coherence'] = df[review_col].progress_apply(get_ngram_coherence)
    return df

In [20]:
from transformers import XLMRobertaTokenizer, XLMRobertaModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained XLM-R model and tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
model = XLMRobertaModel.from_pretrained('xlm-roberta-base')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

# Function to get embeddings in batches
def get_embeddings(texts, batch_size=32):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
        batch_texts = texts[i:i + batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        # Use [CLS] token embedding (first token)
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(batch_embeddings)
    return np.vstack(embeddings)
    
# Compute centroid from substantive reviews
def compute_centroid(df, review_col='review', label_col='is_gibberish', sample_size=10000):
    # Use real reviews (training data only)
    real_texts = df[df[label_col] == 0][review_col].dropna().tolist()
    if len(real_texts) > sample_size:
        real_texts = np.random.choice(real_texts, sample_size, replace=False).tolist()
    embeddings = get_embeddings(real_texts)
    return np.mean(embeddings, axis=0)

# Add embedding-based features
def add_embedding_features(df, centroid, review_col='review', embed_path=None):
    if embed_path and os.path.exists(embed_path):
        print(f"Loading embeddings from {embed_path}")
        embeddings = np.load(embed_path)
    else:
        texts = df[review_col].fillna('').tolist()
        embeddings = get_embeddings(texts)
        if embed_path:
            np.save(embed_path, embeddings)
            print(f"Saved embeddings to {embed_path}")
    
    # Cosine similarity to centroid
    cosine_sim = cosine_similarity(embeddings, centroid.reshape(1, -1)).flatten()
    
    # Anomaly score (Euclidean distance)
    anomaly_score = np.linalg.norm(embeddings - centroid, axis=1)
    
    df['cosine_to_centroid'] = cosine_sim
    df['anomaly_score'] = anomaly_score
    return df

In [21]:
def create_feature_df(df, review_col='review', ngram_ref=None, centroid=None):
    df = create_entropy_feature(df, review_col='review')
    df = create_can_detect_feature(df, review_col='review')
    df = create_alphabet_tag_feature(df, review_col='review')
    df = create_word_and_char_counts_feature(df, review_col='review')
    df = create_avg_word_length_feature(df, review_col='review')
    df = create_repetition_feature(df, review_col='review')
    df = create_punct_ratio_feature(df, review_col='review')
    df = create_ngram_coherence_feature(df, ngram_ref, review_col='review')
    df = add_embedding_features(df, centroid)
    #train_embed_path = 'train_embeddings.npy'
    #test_embed_path = 'test_embeddings.npy'
    return df

## Creating Training/Testing

In [22]:
# Create small sample to test the pipeline
experimental_df = pd.concat([merged_df.iloc[0:25, :], merged_df.iloc[-25:, :]], axis=0)

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

#X = test_df.drop(columns=['gibberish'])
#y = test_df['gibberish']

# Step 1: Stratified split
train_df, test_df = train_test_split(experimental_df, test_size=0.2, stratify=experimental_df['is_gibberish'], random_state=42)
print(f"Train: {len(train_df)} rows, {train_df['is_gibberish'].sum()} gibberish")
print(f"Test: {len(test_df)} rows, {test_df['is_gibberish'].sum()} gibberish")

# Step 2: Build n-gram reference from training data
ngram_ref = build_ngram_reference(train_df['review'].dropna().tolist())
centroid = compute_centroid(train_df)

# Step 3: Create feature DataFrames
train_features = create_feature_df(train_df, ngram_ref=ngram_ref, centroid=centroid)
test_features = create_feature_df(test_df, ngram_ref=ngram_ref, centroid=centroid)

# Save for later loading
train_features.to_pickle('gibberish_train.pkl')
test_features.to_pickle('gibberish_test.pkl')

# Step 4: Prepare X and y
X_train = train_features.drop(columns=['review', 'is_gibberish'])
y_train = train_features['is_gibberish']
X_test = test_features.drop(columns=['review', 'is_gibberish'])
y_test = test_features['is_gibberish']

# Diagnostics
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {len(y_train)}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {len(y_test)}")

# Save to reload later

Train: 40 rows, 20 gibberish
Test: 10 rows, 5 gibberish


Building n-gram reference...: 100%|██████████| 40/40 [00:00<00:00, 10966.22it/s]
Generating embeddings: 100%|██████████| 1/1 [00:04<00:00,  4.03s/it]
Calculating entropies: 100%|██████████| 40/40 [00:00<00:00, 28605.65it/s]
Detecting Language...: 100%|██████████| 40/40 [00:00<00:00, 63.23it/s]
Detecting Language...: 100%|██████████| 40/40 [00:00<00:00, 83760.44it/s]
Creating alphabet tagging feature...: 100%|██████████| 40/40 [00:00<00:00, 8175.63it/s]
Getting word/char counts...: 100%|██████████| 40/40 [00:00<00:00, 30355.01it/s]
Getting word/char counts...: 100%|██████████| 40/40 [00:00<00:00, 33968.85it/s]
Getting avg word length feature...: 100%|██████████| 40/40 [00:00<00:00, 28373.44it/s]
Creating repetition feature...: 100%|██████████| 40/40 [00:00<00:00, 8989.56it/s]
Creating punctuation ratio feature...: 100%|██████████| 40/40 [00:00<00:00, 45136.44it/s]
Calcualting ngram coherenece...: 100%|██████████| 40/40 [00:00<00:00, 16658.94it/s]
Generating embeddings: 100%|██████████| 

X_train shape: (40, 17)
y_train shape: 40
X_test shape: (10, 17)
y_test shape: 10





In [24]:
train_df.head()

Unnamed: 0,review,is_gibberish,entropy,language,cannot_detect_language,alphabets,has_chinese,has_cyrillic,has_abakada,has_hangul,has_latin,word_count,n_chars,avg_word_length,max_repeated,punct_ratio,ngram_coherence,cosine_to_centroid,anomaly_score
1052318,Very Helpful: This game guide helps. Alot! Gre...,0,4.136835,en,0,"{'Chinese': {'present': False, 'count': 0}, 'C...",False,False,False,False,True,64,339,4.3125,2,0.017699,1.0,0.999379,0.661312
1052326,Very dissapointed with colors.: I chose to by ...,0,4.159115,en,0,"{'Chinese': {'present': False, 'count': 0}, 'C...",False,False,False,False,True,47,251,4.361702,3,0.027888,1.0,0.998125,1.154295
14,fe er ger ger gre ger ger,1,2.152629,da,0,"{'Chinese': {'present': False, 'count': 0}, 'C...",False,False,False,False,True,7,25,2.714286,1,0.0,1.0,0.996776,1.505363
4,dfgdfgd,1,1.556657,cy,0,"{'Chinese': {'present': False, 'count': 0}, 'C...",False,False,False,False,True,1,7,7.0,1,0.0,1.0,0.9957,1.738235
1052333,"smart, insightful, funny: A fabulous fast read...",0,4.282886,en,0,"{'Chinese': {'present': False, 'count': 0}, 'C...",False,False,False,False,True,57,367,5.45614,2,0.021798,1.0,0.99828,1.106473


In [25]:
test_df.head()

Unnamed: 0,review,is_gibberish,entropy,language,cannot_detect_language,alphabets,has_chinese,has_cyrillic,has_abakada,has_hangul,has_latin,word_count,n_chars,avg_word_length,max_repeated,punct_ratio,ngram_coherence,cosine_to_centroid,anomaly_score
1052328,Shocking results ... not the good kind mind yo...,0,4.284819,en,0,"{'Chinese': {'present': False, 'count': 0}, 'C...",False,False,False,False,True,121,669,4.53719,3,0.03139,0.98503,0.999349,0.678035
1052324,Do not buy this one: I end up bought another f...,0,4.289984,en,0,"{'Chinese': {'present': False, 'count': 0}, 'C...",False,False,False,False,True,44,239,4.454545,3,0.008368,0.962185,0.999036,0.823802
1052332,I keep going back to this gem: The Pawnshop Ch...,0,4.165865,en,0,"{'Chinese': {'present': False, 'count': 0}, 'C...",False,False,False,False,True,65,381,4.876923,2,0.013123,0.984211,0.99958,0.543829
1052337,Good book: We like having a strange assortment...,0,4.030339,en,0,"{'Chinese': {'present': False, 'count': 0}, 'C...",False,False,False,False,True,23,132,4.782609,2,0.015152,1.0,0.999458,0.618045
9,ddddddddddddddddddddddd,1,-0.0,cy,0,"{'Chinese': {'present': False, 'count': 0}, 'C...",False,False,False,False,True,1,23,23.0,23,0.0,0.0,0.996597,1.546895


In [None]:
'''
import pandas as pd
import numpy as np
from langdetect import detect
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import re

# Feature extraction function
def extract_features(text):
    if not isinstance(text, str) or pd.isna(text):
        text = ""
    
    # Entropy
    length = len(text)
    if length == 0:
        entropy = 0
    else:
        char_counts = Counter(text.lower())
        entropy = -sum((count/length) * np.log2(count/length) for count in char_counts.values())
    
    # Language
    lang = detect(text) if length >= 3 else 'unknown'
    is_unknown = 1 if lang == 'unknown' else 0
    
    # Word count and common words (example for English)
    words = re.split(r'\s+', text.strip())
    word_count = len(words)
    common_words = {'good', 'great', 'bad', 'course', 'learn'}  # Expand per language
    common_word_ratio = sum(1 for w in words if w.lower() in common_words) / max(1, word_count)
    
    # Character length
    char_length = len(text)
    
    # Repetition
    max_repeats = max([sum(1 for _ in g) for _, g in itertools.groupby(text)] or [0])
    
    # Punctuation ratio
    punct_count = sum(1 for c in text if c in '.,!?')
    punct_ratio = punct_count / max(1, char_length)
    
    return [entropy, is_unknown, word_count, common_word_ratio, char_length, max_repeats, punct_ratio]

# Load labeled product review dataset
product_df = pd.read_csv('product_reviews_labeled.csv')  # Assume columns: 'review', '

# Extract features
X = np.array([extract_features(review) for review in product_df['review']])
y = product_df['is_gibberish'].values  # 1 = gibberish, 0 = not gibberish

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Apply to course reviews (merged_df)
course_features = np.array([extract_features(review) for review in merged_df['reviews']])
course_predictions = clf.predict(course_features)
merged_df['is_gibberish'] = course_predictions

# Save results
merged_df.to_parquet('course_reviews_classified.parquet')
print("Course reviews classified and saved.")
''''