# FamilySearch Android App - Sibling Feature Analysis

Analysis of Google Play Store reviews to gauge customer demand for viewing ancestors' siblings in the pedigree view.

**Data**: 36 CSV files (Jan 2023 - Dec 2025) from `data/feedback/android/`

In [None]:
import pandas as pd
import glob
import numpy as np
from pathlib import Path

# Notebook display options: show every column (no column cap) and allow up to
# 100 characters per cell so review text is less aggressively truncated.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 100

## 1. Load and Explore Data

In [None]:
# Locate the review export files. The glob is relative to the notebook's
# working directory, so an empty result usually means the notebook was started
# from a different folder than expected.
csv_files = sorted(glob.glob('../data/feedback/android/*.csv'))
print(f"Found {len(csv_files)} CSV files")

if csv_files:
    # The period label is the last '_'-separated token of each file name.
    # NOTE(review): this assumes the tokens sort chronologically as plain
    # strings (e.g. YYYYMM) - confirm against the actual file names.
    first_period, last_period = (
        Path(csv_path).stem.split('_')[-1]
        for csv_path in (csv_files[0], csv_files[-1])
    )
    print(f"Date range: {first_period} to {last_period}")
else:
    # Indexing csv_files[0] on an empty list would raise IndexError; report
    # the problem explicitly instead.
    print("Date range: n/a (no files matched '../data/feedback/android/*.csv')")

In [None]:
# Peek at the first export to learn the column layout before loading everything.
# The file is decoded as UTF-16; no other encodings are attempted here.
sample_df = pd.read_csv(csv_files[0], encoding='utf-16')
print(
    f"Shape: {sample_df.shape}",
    f"\nColumns:\n{list(sample_df.columns)}",
    sep="\n",
)
# Last expression of the cell, so the notebook renders the first three rows.
sample_df.head(3)

In [None]:
# Load every export into one dataframe. Loading is best-effort per file: a
# file that cannot be read is reported and skipped rather than aborting the run.
dfs = []
failed_files = []
for file in csv_files:
    try:
        df = pd.read_csv(file, encoding='utf-16')
    except Exception as e:
        print(f"Error loading {file}: {e}")
        failed_files.append(file)
    else:
        dfs.append(df)

if failed_files:
    # Make partial loads visible so the totals below are not misread.
    print(f"\nSkipped {len(failed_files)} of {len(csv_files)} files due to load errors")

if not dfs:
    # pd.concat([]) would fail with the opaque "No objects to concatenate";
    # raise the same exception type with an actionable message instead.
    raise ValueError(
        "No review files could be loaded; check csv_files and any load errors printed above"
    )

# Combine all dataframes into one, renumbering the index from 0.
all_reviews = pd.concat(dfs, ignore_index=True)
print(f"\nTotal reviews loaded: {len(all_reviews):,}")
# NOTE(review): min/max operate on the raw column values; if the timestamps are
# stored as text this is a lexicographic comparison, which is only chronological
# for ISO-style timestamps - confirm the column format.
print(f"Date range: {all_reviews['Review Submit Date and Time'].min()} to {all_reviews['Review Submit Date and Time'].max()}")

In [None]:
# Quick overview: how many reviews carry text, and which reviewer languages
# are most common (top 10 by count).
review_text_col = all_reviews['Review Text']
n_with_text = review_text_col.notna().sum()
n_without_text = review_text_col.isna().sum()
language_counts = all_reviews['Reviewer Language'].value_counts()

print("Review Text Statistics:")
print(f"Total reviews: {len(all_reviews):,}")
print(f"Reviews with text: {n_with_text:,}")
print(f"Reviews without text: {n_without_text:,}")
print("\nLanguage distribution:")
print(language_counts.head(10))

## 2. Filter for Reviews with Text Content

In [None]:
# Keep only reviews whose text is present and not just whitespace.
# A copy is taken so later edits do not touch all_reviews.
text_col = all_reviews['Review Text']
has_text_mask = text_col.notna() & text_col.str.strip().ne('')
reviews_with_text = all_reviews.loc[has_text_mask].copy()

n_text_reviews = len(reviews_with_text)
print(f"Reviews with text content: {n_text_reviews:,}")
print(f"Percentage with text: {n_text_reviews/len(all_reviews)*100:.1f}%")

## 3. Search for Sibling-Related Mentions

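Search review text for mentions of siblings (sibling, brother, sister) and related collateral relatives (aunt, uncle, nephew, niece, cousin). This is a broad keyword match: it does not by itself confirm that a mention concerns ancestors or the pedigree view, so matched reviews still need manual review.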

In [None]:
# Search terms for sibling-related mentions (English only), in singular and
# plural form. Aunts/uncles, nieces/nephews and cousins are included alongside
# the direct sibling words.
sibling_keywords = [
    'sibling', 'siblings',
    'brother', 'brothers', 'sister', 'sisters',
    'aunt', 'aunts', 'uncle', 'uncles',
    'nephew', 'nephews', 'niece', 'nieces',
    'cousin', 'cousins'
]

# Build a whole-word regex from the keywords.
# - The \b word boundaries stop keywords from matching inside unrelated words
#   ('uncle' in "unclear", 'aunt' in "daunting", 'brother' in "brotherhood"),
#   which a plain 'a|b|c' alternation would count as hits.
# - Optional great/grand/step/half prefixes (with or without a hyphen) keep
#   single-word forms such as "granduncle" or "stepsister" matching.
# - Groups are non-capturing so pandas' str.contains does not warn about
#   match groups.
# The pattern itself is case-sensitive; case-insensitivity must be requested
# by the caller (e.g. case=False in str.contains).
pattern = (
    r'\b(?:(?:great|grand|step|half)-?)*(?:'
    + '|'.join(sibling_keywords)
    + r')\b'
)
print(f"Search pattern: {pattern}")

In [None]:
# Select the text reviews whose body matches the keyword pattern.
# Matching ignores case, and missing text is treated as "no match".
mentions_mask = reviews_with_text['Review Text'].str.contains(pattern, case=False, na=False)
sibling_mentions = reviews_with_text.loc[mentions_mask].copy()

n_mentions = len(sibling_mentions)
print(f"\nReviews mentioning siblings/related family: {n_mentions:,}")
print(f"Percentage of all reviews: {n_mentions/len(all_reviews)*100:.2f}%")
print(f"Percentage of reviews with text: {n_mentions/len(reviews_with_text)*100:.2f}%")

In [None]:
# Print the first ten matching reviews (date, rating, language, text) for a
# quick manual read-through, separated by horizontal rules.
print("Sample reviews mentioning siblings/family:")
print("=" * 80)
for _, review in sibling_mentions.head(10).iterrows():
    print(
        f"\nDate: {review['Review Submit Date and Time']}",
        f"Rating: {review['Star Rating']} stars",
        f"Language: {review['Reviewer Language']}",
        f"Review: {review['Review Text']}",
        "-" * 80,
        sep="\n",
    )

## 4. Export Results

In [None]:
# Write the matched reviews to a UTF-8 CSV (without the dataframe index) so
# they can be analysed further outside this notebook.
output_file = '../data/sibling_mentions.csv'
sibling_mentions.to_csv(
    path_or_buf=output_file,
    index=False,
    encoding='utf-8',
)
print(f"Saved {sibling_mentions.shape[0]} reviews to {output_file}")