# Quick Statistical Analysis Demo

This notebook is a quick demonstration of the comprehensive statistical analysis workflow, run on a smaller random sample of the dataset so that it executes faster.

**⚡ Quick Demo Features:**
- Sample of 2000 records for speed
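- All 4 analysis components (sentiment label comparison, feature extraction comparison, algorithm comparison, data split comparison)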
- Key visualizations
- Summary recommendations

---

## 📚 Setup and Data Loading

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency
import warnings

# NOTE(review): this silences *every* warning for the rest of the session,
# including scikit-learn convergence warnings. Comment it out when debugging.
warnings.filterwarnings('ignore')

# The bundled seaborn style sheets were renamed to 'seaborn-v0_8*' in
# matplotlib 3.6; older releases only ship 'seaborn'. Use whichever one is
# installed so this cell does not raise on either side of that rename, and
# fall back to matplotlib's default style if neither is present.
_style_candidates = ('seaborn-v0_8', 'seaborn')
_chosen_style = next(
    (name for name in _style_candidates if name in plt.style.available),
    'default',
)
plt.style.use(_chosen_style)
sns.set_palette("husl")

print("✅ Libraries loaded successfully!")

In [None]:
# Load the review dataset and draw the reproducible demo sample that every
# later cell works on (df_sample, sentiment_columns, texts, le, y).
print("📂 Loading dataset...")
df = pd.read_csv('google_play_reviews_DigitalBank_sentiment_analysis.csv')

# Use sample for quick demo.
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so cap the request at the dataset size.
sample_size = min(2000, len(df))
df_sample = df.sample(n=sample_size, random_state=42)

print(f"📊 Original dataset: {df.shape[0]} records")
print(f"🎯 Demo sample: {df_sample.shape[0]} records")

# Define sentiment columns (one label column per labelling method)
sentiment_columns = ['sentiment_score_based', 'sentiment_textblob', 'sentiment_vader', 'sentiment_ensemble']
# Missing texts become empty strings so the vectorizer receives str only
texts = df_sample['stemmed_text'].fillna('').astype(str)

# Encode labels: the score-based labels are the classification target
le = LabelEncoder()
y = le.fit_transform(df_sample['sentiment_score_based'])

print(f"✅ Data prepared: {len(texts)} text samples, {len(le.classes_)} sentiment classes")
print(f"📋 Sentiment classes: {le.classes_}")

## 1️⃣ Quick Sentiment Label Comparison

In [None]:
print("="*60)
print("1. SENTIMENT LABEL COMPARISON")
print("="*60)

# Distribution analysis: label counts and percentage share per method
print("\n📊 Sentiment Distribution:")
for col in sentiment_columns:
    distribution = df_sample[col].value_counts()
    # Same as value_counts(normalize=True), without counting a second time
    percentage = distribution / distribution.sum() * 100
    print(f"\n{col.replace('sentiment_', '').replace('_', ' ').title()}:")
    for label in distribution.index:
        print(f"  {label}: {distribution[label]} ({percentage[label]:.1f}%)")

# Agreement analysis: fraction of rows on which two methods give the same label
print("\n🤝 Agreement Analysis:")
# Start from a float matrix of ones (the diagonal is 1.0 by definition) and
# fill each off-diagonal pair once; agreement is symmetric.
agreement_matrix = pd.DataFrame(1.0, index=sentiment_columns, columns=sentiment_columns)
for position, col1 in enumerate(sentiment_columns):
    for col2 in sentiment_columns[position + 1:]:
        agreement = (df_sample[col1] == df_sample[col2]).mean()
        agreement_matrix.loc[col1, col2] = agreement
        agreement_matrix.loc[col2, col1] = agreement

print("Agreement Matrix:")
print(agreement_matrix.round(3))

# Statistical significance
print("\n🔬 Statistical Tests:")
contingency_table = pd.crosstab(df_sample['sentiment_score_based'], df_sample['sentiment_ensemble'])
chi2, p_value, dof, expected = chi2_contingency(contingency_table)
print(f"Chi-square test (Score-based vs Ensemble): χ² = {chi2:.4f}, p = {p_value:.4f}")
# chi2_contingency tests the null hypothesis that the two labelings are
# independent. A small p-value therefore means the labels are associated;
# it is not evidence that the two methods differ.
print(f"Significant association (labels not independent): {'Yes' if p_value < 0.05 else 'No'}")

## 2️⃣ Quick Feature Extraction Comparison

In [None]:
print("\n" + "=" * 60)
print("2. FEATURE EXTRACTION COMPARISON")
print("=" * 60)

# --- Feature set 1: TF-IDF over unigrams and bigrams, capped at 1000 terms,
# converted to a dense array.
print("🔧 Extracting TF-IDF features...")
tfidf_vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
tfidf_features = tfidf_vectorizer.fit_transform(texts).toarray()

# --- Feature set 2: one column holding the whitespace-token count of each
# text (the demo's lightweight stand-in for Word2Vec).
print("🔧 Creating simple word count features...")
word_count_features = np.array([len(document.split()) for document in texts]).reshape(-1, 1)

feature_data = {
    'TF-IDF': tfidf_features,
    'Word_Count': word_count_features,
}

print(f"✅ TF-IDF shape: {tfidf_features.shape}")
print(f"✅ Word Count shape: {word_count_features.shape}")

# Candidate classifiers, keyed by the name used in the result tables
algorithms = {
    'SVM_Linear': SVC(kernel='linear', random_state=42),
    'SVM_RBF': SVC(kernel='rbf', random_state=42),
    'Logistic_Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random_Forest': RandomForestClassifier(n_estimators=50, random_state=42),
}

print("\n📊 Feature Performance Comparison:")
feature_results = []

for feature_name, X in feature_data.items():
    # The word-count feature is evaluated with Logistic Regression only;
    # every other feature set is run through the full classifier list.
    if feature_name != 'Word_Count':
        test_algorithms = algorithms
    else:
        test_algorithms = {'Logistic_Regression': algorithms['Logistic_Regression']}

    # Same stratified 70/30 split (and seed) for every feature set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    for model_name, estimator in test_algorithms.items():
        try:
            estimator.fit(X_train, y_train)
            predictions = estimator.predict(X_test)
            score = accuracy_score(y_test, predictions)
        except Exception as err:
            # Best effort: report the failure and continue with the next model
            print(f"⚠️ Error with {model_name} + {feature_name}: {err}")
        else:
            feature_results.append({
                'Feature_Type': feature_name,
                'Algorithm': model_name,
                'Accuracy': score,
            })
            print(f"{feature_name} + {model_name}: {score:.4f}")

feature_results_df = pd.DataFrame(feature_results)
print("\n✅ Feature extraction comparison completed!")

## 3️⃣ Quick Algorithm Comparison

In [None]:
print("\n" + "="*60)
print("3. ML ALGORITHM COMPARISON")
print("="*60)

# Use TF-IDF features for algorithm comparison
X = tfidf_features
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print("🤖 Testing algorithms with TF-IDF features:")
algorithm_results = []

for algo_name, algorithm in algorithms.items():
    try:
        algorithm.fit(X_train, y_train)
        y_pred = algorithm.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        algorithm_results.append({
            'Algorithm': algo_name,
            'Accuracy': accuracy
        })

        print(f"{algo_name}: {accuracy:.4f}")

    except Exception as e:
        # Best effort: report the failure and keep evaluating the others
        print(f"⚠️ Error with {algo_name}: {e}")

# Naming the columns keeps them present even when every fit failed, so the
# later `len(algorithm_results_df) > 0` guards and column lookups stay valid.
algorithm_results_df = pd.DataFrame(algorithm_results, columns=['Algorithm', 'Accuracy'])

if len(algorithm_results_df) > 0:
    best_algorithm = algorithm_results_df.loc[algorithm_results_df['Accuracy'].idxmax()]
    print(f"\n🏆 Best Algorithm: {best_algorithm['Algorithm']} ({best_algorithm['Accuracy']:.4f})")
else:
    # idxmax() on an empty result set would raise; report it instead.
    best_algorithm = None
    print("\n⚠️ No algorithm completed successfully; no best algorithm to report.")

## 4️⃣ Quick Data Split Comparison

In [None]:
print("\n" + "="*60)
print("4. DATA SPLIT COMPARISON")
print("="*60)

# Bind the TF-IDF matrix explicitly rather than relying on whatever `X` an
# earlier cell left behind (after the feature loop in section 2 it is the
# word-count matrix).
X = tfidf_features

# Fractions of the sample used for training in each scenario
split_scenarios = [0.25, 0.30, 0.35, 0.65, 0.70, 0.75]
# NOTE: the model is fixed to the RBF-kernel SVM so every split is evaluated
# with the same classifier; it is NOT selected from the section 3 results.
best_algo = SVC(kernel='rbf', random_state=42)

print("📊 Testing different training/testing splits:")
split_results = []

for train_size in split_scenarios:
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=train_size, random_state=42, stratify=y
        )

        best_algo.fit(X_train, y_train)
        y_pred = best_algo.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Round away binary floating-point noise (1 - 0.7 is not exactly 0.3)
        test_size = round(1 - train_size, 2)

        split_results.append({
            'Train_Size': train_size,
            'Test_Size': test_size,
            'Train_Samples': len(X_train),
            'Test_Samples': len(X_test),
            'Accuracy': accuracy
        })

        print(f"Train: {train_size*100:.0f}% ({len(X_train)} samples) | "
              f"Test: {test_size*100:.0f}% ({len(X_test)} samples) | "
              f"Accuracy: {accuracy:.4f}")

    except Exception as e:
        # Best effort: report the failing scenario and continue
        print(f"⚠️ Error with split {train_size}: {e}")

# Naming the columns keeps them present even when every scenario failed.
split_results_df = pd.DataFrame(
    split_results,
    columns=['Train_Size', 'Test_Size', 'Train_Samples', 'Test_Samples', 'Accuracy'],
)

if len(split_results_df) > 0:
    best_split = split_results_df.loc[split_results_df['Accuracy'].idxmax()]
    print(f"\n🏆 Best Split: {best_split['Train_Size']*100:.0f}% training ({best_split['Accuracy']:.4f})")
else:
    # idxmax() on an empty result set would raise; report it instead.
    best_split = None
    print("\n⚠️ No split scenario completed successfully; no best split to report.")

## 📊 Quick Visualizations

In [None]:
# Build a 2x2 summary figure from the results computed in the cells above
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Quick Demo - Statistical Analysis Results', fontsize=14, fontweight='bold')
ax1, ax2, ax3, ax4 = axes[0, 0], axes[0, 1], axes[1, 0], axes[1, 1]

# Panel 1 (top-left): label counts for the first two labelling methods only,
# to keep the x axis readable. One bar per (method, label) pair.
sentiment_counts = []
labels = []
for col in sentiment_columns[:2]:
    method_tag = col.replace('sentiment_', '')
    counts = df_sample[col].value_counts()
    for label, count in zip(counts.index, counts.values):
        sentiment_counts.append(count)
        labels.append(f"{method_tag}\n{label}")

bar_positions = range(len(sentiment_counts))
ax1.bar(bar_positions, sentiment_counts)
ax1.set_title('Sentiment Distribution (Sample)', fontweight='bold')
ax1.set_xticks(range(len(labels)))
ax1.set_xticklabels(labels, rotation=45, ha='right')

# Panel 2 (top-right): accuracy per algorithm; left blank if nothing ran
if len(algorithm_results_df) > 0:
    accuracy_by_algorithm = algorithm_results_df.set_index('Algorithm')['Accuracy']
    accuracy_by_algorithm.plot(kind='bar', ax=ax2)
    ax2.set_title('Algorithm Performance', fontweight='bold')
    ax2.set_ylabel('Accuracy')
    ax2.tick_params(axis='x', rotation=45)

# Panel 3 (bottom-left): accuracy as a function of the training fraction
if len(split_results_df) > 0:
    ax3.plot(split_results_df['Train_Size'], split_results_df['Accuracy'], 'o-')
    ax3.set_title('Performance vs Training Size', fontweight='bold')
    ax3.set_xlabel('Training Size')
    ax3.set_ylabel('Accuracy')
    ax3.grid(True, alpha=0.3)

# Panel 4 (bottom-right): pairwise agreement heatmap with shortened labels
method_names = [col.replace('sentiment_', '').replace('_', ' ') for col in sentiment_columns]
short_name = dict(zip(sentiment_columns, method_names))
agreement_renamed = agreement_matrix.rename(index=short_name, columns=short_name)

sns.heatmap(agreement_renamed, annot=True, cmap='Blues', ax=ax4, fmt='.2f')
ax4.set_title('Method Agreement', fontweight='bold')

# Save a high-resolution copy next to the notebook, then render inline
plt.tight_layout()
plt.savefig('quick_demo_results.png', dpi=300, bbox_inches='tight')
plt.show()

print("✅ Quick demo visualizations created!")

## 🎯 Quick Summary and Recommendations

In [None]:
print("\n" + "="*60)
print("QUICK DEMO SUMMARY")
print("="*60)

# Every recommendation below is derived from the result tables produced in
# this run, so the summary cannot contradict the numbers printed above.
recommendations = []

# Best feature type / classifier combination from section 2
if len(feature_results_df) > 0:
    top_feature = feature_results_df.loc[feature_results_df['Accuracy'].idxmax()]
    recommendations.append(
        f"Use {top_feature['Feature_Type']} for feature extraction "
        f"(best run: {top_feature['Algorithm']}, accuracy {top_feature['Accuracy']:.4f})"
    )

# Best algorithm from section 3. Stored under its own name so the estimator
# bound to `best_algo` in section 4 is not overwritten by a pandas row.
if len(algorithm_results_df) > 0:
    top_algorithm = algorithm_results_df.loc[algorithm_results_df['Accuracy'].idxmax()]
    print(f"🏆 Best Algorithm: {top_algorithm['Algorithm']}")
    print(f"📈 Best Accuracy: {top_algorithm['Accuracy']:.4f}")
    recommendations.append(
        f"{top_algorithm['Algorithm']} shows the best performance on TF-IDF features "
        f"(accuracy {top_algorithm['Accuracy']:.4f})"
    )

# Best train/test split from section 4
if len(split_results_df) > 0:
    best_split = split_results_df.loc[split_results_df['Accuracy'].idxmax()]
    print(f"🏆 Best Split: {best_split['Train_Size']*100:.0f}% training")
    print(f"📈 Best Split Accuracy: {best_split['Accuracy']:.4f}")
    recommendations.append(
        f"A {best_split['Train_Size']*100:.0f}% training split gave the highest accuracy "
        f"({best_split['Accuracy']:.4f})"
    )

# Agreement insights: inspect each unordered pair of methods once, i.e. the
# strict upper triangle of the symmetric agreement matrix.
row_idx, col_idx = np.triu_indices_from(agreement_matrix.values, k=1)
upper_triangle = agreement_matrix.values[row_idx, col_idx]
if upper_triangle.size > 0:
    print(f"🤝 Highest Agreement: {upper_triangle.max():.4f}")
    print(f"🔻 Lowest Agreement: {upper_triangle.min():.4f}")

    def _pretty(column_name):
        """Turn a 'sentiment_xyz_abc' column name into 'Xyz Abc'."""
        return column_name.replace('sentiment_', '').replace('_', ' ').title()

    top_pair = int(upper_triangle.argmax())
    first_method = _pretty(agreement_matrix.index[row_idx[top_pair]])
    second_method = _pretty(agreement_matrix.columns[col_idx[top_pair]])
    recommendations.append(
        f"{first_method} and {second_method} methods have the highest agreement "
        f"({upper_triangle.max():.4f})"
    )

print("\n💡 RECOMMENDATIONS:")
if recommendations:
    for recommendation in recommendations:
        print(f"   🔹 {recommendation}")
    # Each figure comes from a single split of the demo sample
    print("   🔹 Treat small differences as indicative only; confirm with the full analysis")
else:
    print("   🔹 No results were produced; re-run the analysis cells above")

print("\n" + "="*60)
print("QUICK DEMO COMPLETED!")
print("="*60)
print("For full analysis, run: comprehensive_statistical_analysis.ipynb")
print("Or use the Python script: python comprehensive_analysis_statistics.py")