# SN_HW3

The `Networks` folder must be located in this notebook's working directory (the data is read from `./Networks/Part_B`).

## Imports & Setup

In [None]:
import os
import json
import zipfile
from pathlib import Path
from typing import Dict, Tuple, List, Set, Optional
import random
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Optional global figure resolution (currently disabled).
# plt.rcParams["figure.dpi"] = 100

# Optional fixed seeds for `random` and NumPy (currently disabled).
# RANDOM_SEED = 42
# random.seed(RANDOM_SEED)
# np.random.seed(RANDOM_SEED)


# Absolute path of the current working directory; every other path is derived from it.
BASE_DIR = Path(".").resolve()

# Root folder for all generated files; created up front if it does not exist yet.
OUTPUT_DIR = BASE_DIR / "outputs"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Folder that holds the input network data (not created here, only referenced).
NETWORKS_DIR = BASE_DIR / "Networks"


## Q2

### Locate

In [None]:
# Q2 results are written to their own sub-folder of the shared output directory.
OUTPUT_DIR_Q2 = OUTPUT_DIR / 'Q2'
OUTPUT_DIR_Q2.mkdir(parents=True, exist_ok=True)

# Snapshot CSV files for this question are read from Networks/Part_B.
PART_B_DIR = NETWORKS_DIR / 'Part_B'

# Echo the resolved locations and up to ten CSV file names (sorted) as a quick sanity check.
print('OUTPUT_DIR_Q2:', OUTPUT_DIR_Q2)
print('NETWORKS_DIR:', NETWORKS_DIR)
print('PART_B_DIR:', PART_B_DIR)
print('Part_B files (sample):', sorted([p.name for p in PART_B_DIR.glob('*.csv')])[:10])


### Helper Functions

In [None]:
def _normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with stripped, lower-cased column labels.

    Every label is converted with ``str`` first, so non-string headers are
    handled as well. The frame passed in is not modified.
    """
    normalized = df.copy()
    normalized.columns = list(
        map(lambda label: str(label).strip().lower(), normalized.columns)
    )
    return normalized

def _detect_id_column(df: pd.DataFrame) -> str:
    """Return the name of the node-identifier column of ``df``.

    The known identifier names are tried in a fixed order of preference and
    the first one present among the columns wins.

    Raises:
        ValueError: if none of the known identifier names is a column.
    """
    known_id_names = ('id', 'student_id', 'student', 'node', 'uid')
    found = next((name for name in known_id_names if name in df.columns), None)
    if found is None:
        raise ValueError(f'Could not find an ID column. Available columns: {df.columns.tolist()}')
    return found

def _detect_edge_columns(df: pd.DataFrame) -> Tuple[str, str]:
    """Return the two column names that hold the endpoints of each edge.

    A list of well-known name pairs is tried first, in order. If none of them
    is fully present, the first two columns of the frame are used.

    Raises:
        ValueError: if the frame has fewer than two columns.
    """
    names = df.columns.tolist()
    known_pairs = (
        ('student_1', 'student_2'),
        ('u', 'v'),
        ('i', 'j'),
        ('node_1', 'node_2'),
        ('node_i', 'node_j'),
    )
    for pair in known_pairs:
        if all(endpoint in names for endpoint in pair):
            return pair
    if len(names) < 2:
        raise ValueError(f'Connection file has <2 columns: {names}')
    # Positional fallback: treat the first two columns as the endpoints.
    return names[0], names[1]

def load_snapshot(day_token: str) -> Tuple[nx.Graph, pd.DataFrame]:
    """Load one snapshot (graph plus node properties) from ``PART_B_DIR``.

    Reads ``connections_<day_token>.csv`` and ``properties_<day_token>.csv``,
    normalizes their column names, and builds an undirected graph whose nodes
    carry the property row of the matching student as attributes.

    Args:
        day_token: The part of the file names that identifies the snapshot.

    Returns:
        A ``(G, df_props)`` pair. ``df_props`` is indexed by the detected
        identifier column and sorted by that index. ``G`` contains every
        identifier from ``df_props`` plus every endpoint that appears in the
        connections file.

    Raises:
        FileNotFoundError: if either of the two CSV files is missing.
        ValueError: propagated from the column-detection helpers when the
            identifier or endpoint columns cannot be determined.
    """
    conn_path = PART_B_DIR / f'connections_{day_token}.csv'
    prop_path = PART_B_DIR / f'properties_{day_token}.csv'

    if not conn_path.exists() or not prop_path.exists():
        raise FileNotFoundError(f'Missing files for {day_token}: {conn_path.name}, {prop_path.name}')

    df_props = _normalize_columns(pd.read_csv(prop_path))
    id_col = _detect_id_column(df_props)
    df_props = df_props.set_index(id_col).sort_index()

    df_conn = _normalize_columns(pd.read_csv(conn_path))
    u_col, v_col = _detect_edge_columns(df_conn)
    # A row with a missing endpoint does not describe an edge; keeping it
    # would insert a NaN node into the graph.
    df_conn = df_conn.dropna(subset=[u_col, v_col])

    G = nx.Graph()
    G.add_nodes_from(df_props.index.tolist())

    for node_id, row in df_props.iterrows():
        G.nodes[node_id].update(row.to_dict())

    # add_edges_from creates any endpoint that is not a node yet, so students
    # listed only in the connections file are still included (without
    # attributes); no separate add_node pass is required.
    G.add_edges_from(zip(df_conn[u_col].tolist(), df_conn[v_col].tolist()))

    return G, df_props

def discover_day_tokens() -> List[str]:
    """Return the snapshot tokens available in ``PART_B_DIR``, in time order.

    A token is whatever follows ``connections_`` in a ``connections_*.csv``
    file name. It is only reported when the matching
    ``properties_<token>.csv`` file exists as well.

    Ordering: tokens are sorted by their first purely numeric part (parts are
    separated by ``_`` or ``-``); tokens without such a part go last. Ties are
    broken by all digit runs found anywhere in the token and finally by the
    token text, so the result is deterministic and e.g. ``day2`` precedes
    ``day10``.
    """
    prefix = 'connections_'
    tokens = set()
    for p in PART_B_DIR.glob(f'{prefix}*.csv'):
        # Remove only the leading prefix; str.replace would also delete any
        # later occurrence of the same text inside the token.
        token = p.stem[len(prefix):]
        # require matching properties file
        if (PART_B_DIR / f'properties_{token}.csv').exists():
            tokens.add(token)

    def sort_key(tok: str):
        nums = [int(x) for x in tok.replace('-', '_').split('_') if x.isdigit()]
        primary = nums[0] if nums else 10**9
        # Digit runs embedded in text (e.g. "day12") serve as a natural-order
        # tie-breaker; without one, equal keys would keep the arbitrary
        # iteration order of the set.
        digit_runs = ''.join(ch if ch.isdecimal() else ' ' for ch in tok).split()
        return (primary, [int(run) for run in digit_runs], tok)

    return sorted(tokens, key=sort_key)

def safe_binary_series(s: pd.Series) -> pd.Series:
    """Cast a 0/1 (or boolean) series to ``int`` when that is possible.

    Args:
        s: The series to inspect.

    Returns:
        ``s`` converted with ``astype(int)`` when it has no missing values and
        every value is one of 0, 1, True or False; otherwise ``s`` unchanged.
    """
    # A plain integer dtype cannot represent missing values, so astype(int)
    # would raise as soon as one NaN is present. Leave such series untouched.
    if s.isna().any():
        return s
    if s.isin([0, 1, 0.0, 1.0, True, False]).all():
        return s.astype(int)
    return s

def save_df(df: pd.DataFrame, path: Path) -> None:
    """Write ``df`` to ``path`` as CSV without the index column.

    Missing parent folders of ``path`` are created first. A confirmation line
    containing the destination is printed afterwards.
    """
    folder = path.parent
    folder.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False)
    print(f"***file saved*** => {path}")



### Load All Time Steps

In [None]:
day_tokens = discover_day_tokens()
if not day_tokens:
    raise FileNotFoundError('No snapshots found in Part_B (expected connections_*.csv and properties_*.csv).')

# Per snapshot token: {'G': graph, 'df': properties frame}.
data: Dict[str, Dict[str, object]] = {}
for tok in day_tokens:
    G, df = load_snapshot(tok)
    # Flag-like columns are cast to int where their values allow it.
    for col in ('smokes', 'club', 'plays_football', 'watches_movies'):
        if col in df.columns:
            df[col] = safe_binary_series(df[col])
    data[tok] = {'G': G, 'df': df}
    print(f'Loaded {tok}: |V|={G.number_of_nodes()}, |E|={G.number_of_edges()}, props_cols={len(df.columns)}')

print('Snapshots:', day_tokens)


### Part 1 — Feature Distribution & Evolution

In [None]:
# ---- 1) Smoking evolution + study comparison ----
# One dict of statistics per snapshot, filled by the loop below.
stats_rows = []

# Column names differ between data sets, so the main columns are looked up by candidate name.
def pick_column(df: pd.DataFrame, candidates: List[str]) -> Optional[str]:
    """Return the first entry of ``candidates`` that is a column of ``df``, or ``None``."""
    return next((name for name in candidates if name in df.columns), None)

# Collect, per snapshot, the smoker count/share and the mean study value of
# smokers versus non-smokers.
for tok in day_tokens:
    df = data[tok]['df']
    smokes_col = pick_column(df, ['smokes', 'smoker', 'smoking'])
    studies_col = pick_column(df, ['studies', 'study', 'study_level', 'hours_study', 'study_hours'])

    if smokes_col is None:
        raise ValueError(f"Column for smoking not found in properties_{tok}.csv")

    smokes = safe_binary_series(df[smokes_col])
    n_students = int(len(df))
    num_smokers = int(smokes.sum())
    # An empty properties table would otherwise raise ZeroDivisionError; the
    # share is undefined in that case, so report NaN.
    pct_smokers = float(num_smokers / n_students * 100.0) if n_students else np.nan

    # The study comparison is optional: it stays NaN when no study column exists.
    avg_studies_smokers = np.nan
    avg_studies_non_smokers = np.nan
    if studies_col is not None:
        avg_studies_smokers = float(df.loc[smokes == 1, studies_col].mean())
        avg_studies_non_smokers = float(df.loc[smokes == 0, studies_col].mean())

    stats_rows.append({
        'snapshot': tok,
        'n_students': n_students,
        'num_smokers': num_smokers,
        'pct_smokers': pct_smokers,
        'avg_studies_smokers': avg_studies_smokers,
        'avg_studies_non_smokers': avg_studies_non_smokers,
    })

df_stats = pd.DataFrame(stats_rows)
save_df(df_stats,OUTPUT_DIR_Q2 / 'smoker_evolution_stats.csv')

# Line chart of the number of smokers per snapshot, written to the Q2 output folder.
plt.figure(figsize=(8, 4))
plt.plot(df_stats['snapshot'], df_stats['num_smokers'], marker='o')
plt.title('Number of Smokers Over Time')
plt.xlabel('Snapshot')
plt.ylabel('Count')
plt.grid(True)
plt.tight_layout()
plt.savefig(OUTPUT_DIR_Q2 / 'plot_smoker_count.png', dpi=200)
print(f"***file saved*** => {(OUTPUT_DIR_Q2 / 'plot_smoker_count.png')}")
# Interactive display is disabled; the figure is only saved and then closed.
# plt.show()
plt.close()

# The study comparison is drawn only when at least one snapshot produced a
# non-NaN smoker average (the averages stay NaN when no study column was found).
if not df_stats['avg_studies_smokers'].isna().all():
    plt.figure(figsize=(8, 4))
    plt.plot(df_stats['snapshot'], df_stats['avg_studies_smokers'], marker='o', label='Smokers')
    plt.plot(df_stats['snapshot'], df_stats['avg_studies_non_smokers'], marker='s', label='Non-Smokers')
    plt.title('Average Study Level: Smokers vs Non-Smokers')
    plt.xlabel('Snapshot')
    plt.ylabel('Study (mean)')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(OUTPUT_DIR_Q2 / 'plot_study_habits.png', dpi=200)
    print(f"***file saved*** => {(OUTPUT_DIR_Q2 / 'plot_study_habits.png')}")
    # Interactive display is disabled; the figure is only saved and then closed.
    # plt.show()
    plt.close()
else:
    print('Study column not found; skipping study comparison plot.')


### Part 2 — Closure Mechanisms (Triadic, Membership, Focal)


In [None]:
def choose_membership_column(df: pd.DataFrame) -> Optional[str]:
    """Return the column used as group membership, or ``None`` if there is none.

    Candidate names are checked in order of preference (class-like names
    first); the first one that is a column of ``df`` is returned.
    """
    preferred = ('class_number', 'class', 'classid', 'grade', 'room', 'section')
    available = [name for name in preferred if name in df.columns]
    return available[0] if available else None

def choose_interest_columns(df: pd.DataFrame) -> List[str]:
    """Return the columns of ``df`` that are treated as shared-interest flags.

    A column qualifies when it is not one of the excluded smoking / gender /
    class / study names and all of its non-missing values are 0, 1, True or
    False. ``plays_football``, ``watches_movies`` and ``club`` are appended
    whenever they exist, even if they did not pass the binary check.
    """
    not_interests = {
        'smokes', 'smoker', 'smoking', 'gender', 'sex',
        'class_number', 'class', 'studies', 'study', 'study_level',
    }
    binary_values = [0, 1, 0.0, 1.0, True, False]

    picked = [
        name
        for name in df.columns
        if name not in not_interests and df[name].dropna().isin(binary_values).all()
    ]
    # Well-known interest columns are always included when present.
    for name in ('plays_football', 'watches_movies', 'club'):
        if name in df.columns and name not in picked:
            picked.append(name)
    return picked

def edge_is_triadic(G_prev: nx.Graph, u, v) -> bool:
    """Tell whether ``u`` and ``v`` had at least one common neighbour in ``G_prev``.

    Returns ``False`` when either endpoint is not a node of ``G_prev``.
    """
    if not (u in G_prev and v in G_prev):
        return False
    shared = set(G_prev.neighbors(u)).intersection(G_prev.neighbors(v))
    return bool(shared)

def edge_is_membership(df_prev: pd.DataFrame, u, v, membership_col: str) -> bool:
    """Tell whether ``u`` and ``v`` have the same value in ``membership_col``.

    Returns ``False`` when no membership column is given or when either
    endpoint has no row in ``df_prev``.
    """
    if membership_col is None:
        return False
    known_ids = df_prev.index
    if not (u in known_ids and v in known_ids):
        return False
    group_u = df_prev.loc[u, membership_col]
    group_v = df_prev.loc[v, membership_col]
    return group_u == group_v

def edge_is_focal(df_prev: pd.DataFrame, u, v, interest_cols: List[str]) -> bool:
    """Tell whether ``u`` and ``v`` both have value 1 in any interest column.

    Returns ``False`` when ``interest_cols`` is empty or when either endpoint
    has no row in ``df_prev``. A column whose values cannot be read or
    converted to ``int`` for this pair is skipped.
    """
    if not interest_cols:
        return False
    known_ids = df_prev.index
    if not (u in known_ids and v in known_ids):
        return False
    for col in interest_cols:
        try:
            both_interested = (
                int(df_prev.loc[u, col]) == 1 and int(df_prev.loc[v, col]) == 1
            )
        except Exception:
            # Best effort: an unreadable value only rules out this column.
            continue
        if both_interested:
            return True
    return False

# One summary row per transition between consecutive snapshots.
closure_rows = []

for t_prev, t_curr in zip(day_tokens, day_tokens[1:]):
    prev_snapshot = data[t_prev]
    G_prev, df_prev = prev_snapshot['G'], prev_snapshot['df']
    G_curr = data[t_curr]['G']

    # Undirected edges are stored with sorted endpoints so that (u, v) and
    # (v, u) compare equal across snapshots.
    prev_edges = {tuple(sorted(edge)) for edge in G_prev.edges()}
    curr_edges = {tuple(sorted(edge)) for edge in G_curr.edges()}

    # Edges that exist now but did not exist in the previous snapshot.
    new_edges = list(curr_edges - prev_edges)

    # Columns
    smokes_col = next(
        (name for name in ('smokes', 'smoker', 'smoking') if name in df_prev.columns),
        None,
    )
    if smokes_col is None:
        raise ValueError(f"Smoking column not found at {t_prev}")

    membership_col = choose_membership_column(df_prev)
    interest_cols = choose_interest_columns(df_prev)

    triadic_count = membership_count = focal_count = 0
    smoker_involved_count = smoker_triadic = smoker_focal = 0

    for u, v in new_edges:
        # All three mechanisms are evaluated on the PREVIOUS snapshot.
        is_triadic = edge_is_triadic(G_prev, u, v)
        is_membership = False
        if membership_col:
            is_membership = edge_is_membership(df_prev, u, v, membership_col)
        is_focal = edge_is_focal(df_prev, u, v, interest_cols)

        if is_triadic:
            triadic_count += 1
        if is_membership:
            membership_count += 1
        if is_focal:
            focal_count += 1

        # Endpoints without a property row in the previous snapshot count as
        # non-smokers.
        endpoint_flags = [
            int(df_prev.loc[node, smokes_col]) if node in df_prev.index else 0
            for node in (u, v)
        ]
        if 1 in endpoint_flags:
            smoker_involved_count += 1
            if is_triadic:
                smoker_triadic += 1
            if is_focal:
                smoker_focal += 1

    closure_rows.append({
        'transition': f'{t_prev} -> {t_curr}',
        'new_edges': len(new_edges),
        'triadic_closure_cases': triadic_count,
        'membership_closure_cases': membership_count,
        'focal_closure_cases': focal_count,
        'membership_col_used': membership_col or '',
        'interest_cols_used': ','.join(interest_cols),
        'smoker_involved_edges': smoker_involved_count,
        'smoker_involved_triadic': smoker_triadic,
        'smoker_involved_focal': smoker_focal,
    })

df_closure = pd.DataFrame(closure_rows)
save_df(df_closure,OUTPUT_DIR_Q2 / 'closure_mechanisms.csv')


### Part 3 — New Smoker Analysis

In [None]:
def get_smokes_col(df: pd.DataFrame) -> str:
    """Return the name of the smoking column of ``df``.

    The names ``smokes``, ``smoker`` and ``smoking`` are tried in that order.

    Raises:
        ValueError: if none of them is a column of ``df``.
    """
    present = [name for name in ('smokes', 'smoker', 'smoking') if name in df.columns]
    if not present:
        raise ValueError('Smoking column not found.')
    return present[0]

def _friend_smoking_summary(G_prev: nx.Graph, df_prev: pd.DataFrame, sm_col_prev: str, uid) -> Dict[str, object]:
    """Count the friends and smoker friends of ``uid`` in the previous snapshot.

    Friends without a row in ``df_prev`` are counted as friends but never as
    smokers. The percentage is NaN when ``uid`` has no friends (or is not a
    node of ``G_prev``).
    """
    friends = list(G_prev.neighbors(uid)) if uid in G_prev else []
    num_friends = len(friends)
    smoker_friends = sum(
        1
        for f in friends
        if f in df_prev.index and int(df_prev.loc[f, sm_col_prev]) == 1
    )
    pct_smoker_friends = (smoker_friends / num_friends * 100.0) if num_friends > 0 else np.nan
    return {
        'num_friends_prev': num_friends,
        'num_smoker_friends_prev': int(smoker_friends),
        'pct_smoker_friends_prev': float(pct_smoker_friends),
    }


def summarize_new_smokers(t_prev: str, t_curr: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Describe students who started smoking between two snapshots.

    Only students present in both snapshots are considered. A "new smoker"
    has smoking value 0 at ``t_prev`` and 1 at ``t_curr``; the baseline group
    has 0 at both.

    Args:
        t_prev: Token of the earlier snapshot (key of the global ``data``).
        t_curr: Token of the later snapshot (key of the global ``data``).

    Returns:
        ``(df_new, df_base)``. Both frames hold one row per student with the
        transition label, the student id, and the friend / smoker-friend
        counts and percentage taken from the EARLIER snapshot. ``df_new``
        additionally carries the student's earlier values of the study, club,
        gender, interest and class columns that exist. A frame is empty (no
        columns) when its group is empty.

    Raises:
        ValueError: if either snapshot has no smoking column.
    """
    df_prev = data[t_prev]['df']
    df_curr = data[t_curr]['df']
    G_prev = data[t_prev]['G']

    sm_col_prev = get_smokes_col(df_prev)
    sm_col_curr = get_smokes_col(df_curr)

    common_ids = df_prev.index.intersection(df_curr.index)
    prev_sm = safe_binary_series(df_prev.loc[common_ids, sm_col_prev])
    curr_sm = safe_binary_series(df_curr.loc[common_ids, sm_col_curr])

    new_smoker_ids = common_ids[(prev_sm == 0) & (curr_sm == 1)].tolist()
    stayed_nonsmoker_ids = common_ids[(prev_sm == 0) & (curr_sm == 0)].tolist()

    candidate_cols = [c for c in ['studies','study','study_level','club','gender','plays_football','watches_movies','class_number','class'] if c in df_prev.columns]

    transition = f'{t_prev} -> {t_curr}'

    rows = []
    for uid in new_smoker_ids:
        rec = {
            'transition': transition,
            'student_id': uid,
            **_friend_smoking_summary(G_prev, df_prev, sm_col_prev, uid),
        }
        # uid comes from common_ids, so it always has a row in df_prev.
        for c in candidate_cols:
            rec[c] = df_prev.loc[uid, c]
        rows.append(rec)

    df_new = pd.DataFrame(rows)

    base_rows = [
        {
            'transition': transition,
            'student_id': uid,
            **_friend_smoking_summary(G_prev, df_prev, sm_col_prev, uid),
        }
        for uid in stayed_nonsmoker_ids
    ]

    df_base = pd.DataFrame(base_rows)
    return df_new, df_base

# Per-transition result frames, concatenated after the loop.
all_new, all_base = [], []
for t_prev, t_curr in zip(day_tokens, day_tokens[1:]):
    df_new, df_base = summarize_new_smokers(t_prev, t_curr)
    all_new.append(df_new)
    all_base.append(df_base)

# With fewer than two snapshots there is nothing to concatenate.
if all_new:
    df_new_smokers = pd.concat(all_new, ignore_index=True)
else:
    df_new_smokers = pd.DataFrame()

if all_base:
    df_baseline = pd.concat(all_base, ignore_index=True)
else:
    df_baseline = pd.DataFrame()

save_df(df_new_smokers,OUTPUT_DIR_Q2 / 'new_smokers_analysis.csv')
save_df(df_baseline,OUTPUT_DIR_Q2 / 'baseline_nonsmokers_sample.csv')

print('New smokers rows:', len(df_new_smokers))

### Part 4 — Centrality (Final Snapshot)

In [None]:
final_tok = day_tokens[-1]
G_final = data[final_tok]['G']
df_final = data[final_tok]['df']

deg_cent = nx.degree_centrality(G_final)
# Five highest-scoring nodes; the sort is stable, so nodes with equal scores
# keep the order in which degree_centrality reported them.
top_ids = sorted(deg_cent, key=deg_cent.get, reverse=True)[:5]
top5 = [(uid, deg_cent[uid]) for uid in top_ids]

# Attributes exported next to the scores (only those that exist).
wanted_attrs = ['smokes','studies','study','study_level','club','gender','class_number','class']
attr_cols = [c for c in wanted_attrs if c in df_final.columns]

rows = []
for uid, score in top5:
    rec = {'student_id': uid, 'degree_centrality': float(score), 'degree': int(G_final.degree(uid))}
    # Nodes without a property row are exported with the scores only.
    if uid in df_final.index:
        rec.update({c: df_final.loc[uid, c] for c in attr_cols})
    rows.append(rec)

df_top5 = pd.DataFrame(rows)
save_df(df_top5,OUTPUT_DIR_Q2 / f'top_5_centrality_{final_tok}.csv')