# 1000 Genomes Chromosome 22 Variant Enrichment Analysis

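This notebook downloads the chromosome 22 VCF from the 1000 Genomes Project
(phase 3), reads the first 10,000 variants, and runs a Chi-square test on each
one to evaluate whether its alternate-allele frequency differs significantly
between continental super-populations. For every significant variant it records
which populations carry the alternate allele at an elevated level relative to
the others, and reports the per-population totals.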

In [None]:
# Install necessary packages
!pip install cyvcf2 pandas scipy tqdm --quiet

In [None]:
from cyvcf2 import VCF
import pandas as pd
from scipy.stats import chi2_contingency
from tqdm import tqdm
import os, gzip, shutil

# Download VCF for chromosome 22
vcf_url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
vcf_file = "ALL.chr22.phase3_shapeit2.vcf.gz"

if not os.path.exists(vcf_file):
    !wget -O {vcf_file} {vcf_url}

In [None]:
# Load population metadata
sample_info_url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130606_sample_info/20130606_sample_info.txt"
sample_info_file = "20130606_sample_info.txt"

if not os.path.exists(sample_info_file):
    !wget -O {sample_info_file} {sample_info_url}

sample_df = pd.read_csv(sample_info_file, sep='\t')
pop_map = sample_df.set_index('Sample')['Super Population'].to_dict()

In [None]:
# Initialize VCF parser
vcf = VCF(vcf_file)
samples = vcf.samples

# Map samples to super populations. Samples without a usable label (absent
# from pop_map, or mapped to an empty/NaN value) are left out of pop_set and
# are therefore skipped when alleles are tallied. NaN must be filtered
# explicitly: it is truthy and would make sorted() fail on mixed float/str.
sample_pops = [pop_map.get(s, None) for s in samples]
pop_set = sorted(set([p for p in sample_pops if p and pd.notna(p)]))
if len(pop_set) < 2:
    raise ValueError(
        f"Need at least two populations to compare, found {pop_set}; "
        "check that the VCF sample IDs match the sample metadata."
    )


def _count_alleles(genotypes, sample_pops, pop_set):
    """Tally ALT and REF allele observations per population for one variant.

    Parameters
    ----------
    genotypes : sequence of per-sample genotype entries. Each entry lists the
        allele indices followed by a trailing phased flag; 0 is the reference
        allele, a positive index is an alternate allele, and a negative index
        marks a missing call.
    sample_pops : population label for each sample, aligned with `genotypes`.
    pop_set : population labels to tally; samples with any other label are
        ignored.

    Returns
    -------
    dict mapping each population in `pop_set` to ``[alt_count, ref_count]``.
    """
    counts = {p: [0, 0] for p in pop_set}  # [ALT, REF]
    for geno, pop in zip(genotypes, sample_pops):
        if pop not in counts:
            continue
        # The last element of each entry is the phased flag, not an allele.
        for allele in geno[:-1]:
            if allele < 0:
                # Missing call: contributes no observation at all (it must not
                # be counted as a reference allele).
                continue
            # Count alleles, not allele indices: every non-reference allele is
            # one ALT observation, including at multi-allelic sites.
            counts[pop][0 if allele > 0 else 1] += 1
    return counts


def _enriched_populations(counts, pop_set):
    """Return the populations whose ALT allele frequency exceeds the pooled
    ALT allele frequency of all other populations combined.

    Frequencies (not raw counts) are compared so that populations with more
    sampled chromosomes are not favoured. Populations with no called alleles
    are never reported.
    """
    total_alt = sum(counts[p][0] for p in pop_set)
    total_n = sum(counts[p][0] + counts[p][1] for p in pop_set)
    enriched = []
    for p in pop_set:
        alt, ref = counts[p]
        n = alt + ref
        rest_n = total_n - n
        if n == 0 or rest_n == 0:
            continue
        if alt / n > (total_alt - alt) / rest_n:
            enriched.append(p)
    return enriched


# Count enriched variants
from collections import defaultdict
sig_counts = defaultdict(int)

# Per-variant significance threshold.
# NOTE(review): no multiple-testing correction is applied across the N tests.
ALPHA = 0.01

N = 10000
for i, variant in enumerate(tqdm(vcf, total=N)):
    if i >= N:
        break

    genos = variant.genotypes  # per sample: allele indices followed by a phased flag
    if not genos:
        continue

    # Build the 2 x n_populations contingency table: rows = [ALT, REF]
    pop_alleles = _count_alleles(genos, sample_pops, pop_set)
    table = [
        [pop_alleles[p][0] for p in pop_set],
        [pop_alleles[p][1] for p in pop_set],
    ]

    # The chi-square test is undefined when a row or column sums to zero
    # (monomorphic site, or a population with no called genotypes): skip.
    if sum(table[0]) == 0 or sum(table[1]) == 0:
        continue
    if any(alt + ref == 0 for alt, ref in zip(*table)):
        continue

    try:
        chi2, pval, dof, ex = chi2_contingency(table)
    except ValueError:
        # Raised by chi2_contingency for tables it cannot test (e.g. a zero
        # expected count). Any other exception is a real bug and propagates.
        continue

    if pval < ALPHA:
        for p in _enriched_populations(pop_alleles, pop_set):
            sig_counts[p] += 1

In [None]:
# Per-population tally of significant variants, largest count first
summary_df = (
    pd.DataFrame.from_dict(
        sig_counts,
        orient='index',
        columns=['Significant Variants'],
    )
    .sort_values(by='Significant Variants', ascending=False)
)
summary_df