<a href="https://colab.research.google.com/github/sathwikd7/B14_pds_10d7/blob/main/Welcome_To_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os, sys, warnings, random
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, roc_auc_score, roc_curve, classification_report)
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Global plotting defaults: seaborn's white-grid theme and an 8x6 inch canvas.
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (8, 6)})

# -------------------------
# 2) Upload dataset (CSV)
# -------------------------
from google.colab import files
print("Please upload your dataset CSV file (the first uploaded file will be used).")
uploaded = files.upload()
if not uploaded:
    raise SystemExit("No file uploaded. Re-run and upload a CSV file.")

# Use the first uploaded file
fn = next(iter(uploaded))
print(f"Using uploaded file: {fn}")

# Read with pandas (try common separators).
# A semicolon-delimited file usually does NOT raise under the default comma
# separator: it parses "successfully" into a single column whose header still
# contains the ';' characters. Detect that case explicitly, in addition to
# retrying when the comma parse fails outright.
try:
    df = pd.read_csv(fn)
except pd.errors.ParserError:
    df = None
if df is None or (df.shape[1] == 1 and ';' in str(df.columns[0])):
    df = pd.read_csv(fn, sep=';')

print("\n--- Data preview ---")
display(df.head())
print("\nData shape:", df.shape)

# -------------------------
# 3) Auto-detect target column
# -------------------------
# Candidate target names, in priority order; matching is case-insensitive.
possible_targets = ['At_Risk','at_risk','at risk','target','label','y','y_label','is_at_risk','risk']
# str() guards against non-string column labels (e.g. integer headers);
# strip() tolerates headers with stray surrounding whitespace.
cols_lower = [str(c).strip().lower() for c in df.columns]
target_col = None
for t in possible_targets:
    if t.lower() in cols_lower:
        # index() returns the first column whose normalised name matches.
        target_col = df.columns[cols_lower.index(t.lower())]
        break
if target_col is None:
    # No known name matched ('y' is already covered by the loop above),
    # so fall back to the last column.
    target_col = df.columns[-1]

print(f"Selected target column: '{target_col}' (if this is wrong, rename target column in your CSV and re-upload).")

# -------------------------
# 4) Basic cleaning & preprocessing
# -------------------------
# Drop rows with a missing target BEFORE encoding it: pd.Categorical encodes
# NaN as code -1, which would otherwise survive the null filter below and show
# up as a bogus extra class.
df = df[df[target_col].notna()].reset_index(drop=True)

# If target is non-numeric (object / string / categorical) but binary-like, convert
if not pd.api.types.is_numeric_dtype(df[target_col]):
    # try map yes/no, true/false, 'AtRisk' text => 1/0
    positive_labels = {'yes','y','true','t','1','atrisk','at risk','at_risk','risk','positive','excess'}
    negative_labels = {'no','n','false','f','0','healthy','safe','negative','low'}
    mapping = {}
    # Inspect every distinct value so that spelling/case variants beyond the
    # first few are mapped too instead of being silently discarded.
    for u in df[target_col].unique():
        s = str(u).strip().lower()
        if s in positive_labels:
            mapping[u] = 1
        elif s in negative_labels:
            mapping[u] = 0
    if mapping:
        encoded = df[target_col].map(mapping)
        n_unmapped = int(encoded.isna().sum())
        if n_unmapped:
            # Unrecognised labels become NaN and are removed below; say so.
            print(f"Warning: dropping {n_unmapped} row(s) whose target value is not a recognised binary label.")
        df[target_col] = encoded.astype(float)
    else:
        # if can't map, label-encode
        df[target_col] = pd.Categorical(df[target_col]).codes

# Drop rows where target is null (labels left unmapped above)
df = df[~df[target_col].isna()].reset_index(drop=True)
if df.empty:
    raise SystemExit(f"No rows with a usable value in target column '{target_col}'.")

# Drop fully empty columns. Done after the row filtering so that columns which
# only had values in the discarded rows are removed as well (they would break
# the median/mode imputation below).
df = df.dropna(axis=1, how='all')

# Separate features and target
y = df[target_col].astype(int)
X = df.drop(columns=[target_col])

# Quick imputation: numeric -> median, categorical -> mode
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object','category','bool']).columns.tolist()

for c in num_cols:
    if X[c].isna().any():
        X[c] = X[c].fillna(X[c].median())
for c in cat_cols:
    if X[c].isna().any():
        X[c] = X[c].fillna(X[c].mode().iloc[0])

# One-hot encode categorical features (if any)
if cat_cols:
    X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

print("\nAfter preprocessing, feature shape:", X.shape)

# -------------------------
# 5) Train/Test split & scaling
# -------------------------
# Hold out 20% of the rows, stratified on the target.
split_parts = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Learn the scaling statistics from the training rows only, then apply them to
# both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -------------------------
# 6) Helper: evaluation print
# -------------------------
def evaluate_model(name, y_true, y_pred, y_proba=None):
    """Print headline metrics for one classifier and plot its confusion matrix.

    Args:
        name: Label used in the printed header and in the plot title.
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        y_proba: Optional scores for the positive class. When given, ROC AUC
            is reported as well.
    """
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    print(f"\n---- {name} ----")
    print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}")
    print("Classification report:\n", classification_report(y_true, y_pred, zero_division=0))
    if y_proba is not None:
        try:
            auc = roc_auc_score(y_true, y_proba)
        except ValueError as err:
            # AUC is undefined for some inputs (e.g. a single class in y_true).
            # Keep going, but report why the number is missing.
            print(f"AUC: not available ({err})")
        else:
            print(f"AUC: {auc:.4f}")
    # confusion matrix plot
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(4,3))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"{name} Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

# -------------------------
# 7) Train models
# -------------------------

# 7.1 Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)
# LogisticRegression always provides predict_proba, so no blanket
# try/except is needed; a genuine failure here should be visible.
y_proba_lr = lr.predict_proba(X_test_scaled)[:,1]
evaluate_model("Logistic Regression", y_test, y_pred_lr, y_proba_lr)

# 7.2 SVM (RBF)
# With probability=True the probability estimates depend on randomly shuffled
# data, so pin the seed like the other models to keep results reproducible.
svm = SVC(kernel='rbf', probability=True, gamma='scale', random_state=42)
svm.fit(X_train_scaled, y_train)
y_pred_svm = svm.predict(X_test_scaled)
y_proba_svm = svm.predict_proba(X_test_scaled)[:,1]
evaluate_model("SVM (RBF)", y_test, y_pred_svm, y_proba_svm)

# 7.3 Random Forest
# The forest is trained on the raw (unscaled) features, so it must be
# evaluated on the raw test features too. Feeding it the standardised test
# matrix would push every sample through split thresholds learned on a
# different scale and make the reported metrics meaningless.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:,1]
evaluate_model("Random Forest", y_test, y_pred_rf, y_proba_rf)

# 7.4 XGBoost
# Gradient-boosted trees on the standardised features.
xgb_params = dict(use_label_encoder=False, eval_metric='logloss', random_state=42)
xg = xgb.XGBClassifier(**xgb_params)
xg.fit(X_train_scaled, y_train)
y_pred_xg = xg.predict(X_test_scaled)
xg_test_proba = xg.predict_proba(X_test_scaled)
y_proba_xg = xg_test_proba[:, 1]  # column 1 = positive class
evaluate_model(name="XGBoost", y_true=y_test, y_pred=y_pred_xg, y_proba=y_proba_xg)

# 7.5 Simple ANN (Keras)
# Two hidden ReLU layers with dropout, sigmoid output for a binary target.
tf.random.set_seed(42)
n_input_features = X_train_scaled.shape[1]
ann_layers = [
    Dense(128, activation='relu', input_shape=(n_input_features,)),
    Dropout(0.25),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
]
ann = Sequential(ann_layers)
ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Stop once validation loss has not improved for 5 epochs and roll back to the
# best weights seen.
es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=0)

fit_options = dict(
    validation_split=0.12,
    epochs=50,
    batch_size=32,
    callbacks=[es],
    verbose=0,
)
history = ann.fit(X_train_scaled, y_train, **fit_options)

# Flatten the model output to a 1-D score vector and threshold it at 0.5.
ann_raw_output = ann.predict(X_test_scaled)
y_proba_ann = ann_raw_output.ravel()
y_pred_ann = (y_proba_ann >= 0.5).astype(int)
evaluate_model("ANN (Keras)", y_test, y_pred_ann, y_proba_ann)

# -------------------------
# 8) ROC curves comparison
# -------------------------
plt.figure()
# Positive-class scores per model. Logistic regression falls back to its
# decision function if no probability vector is available.
models_for_roc = {
    "LogisticRegression": (y_proba_lr if y_proba_lr is not None else lr.decision_function(X_test_scaled)),
    "SVM": y_proba_svm,
    "RandomForest": y_proba_rf,
    "XGBoost": y_proba_xg,
    "ANN": y_proba_ann
}
for name, proba in models_for_roc.items():
    try:
        fpr, tpr, _ = roc_curve(y_test, proba)
        auc_value = roc_auc_score(y_test, proba)
    except ValueError as err:
        # Keep the comparison plot best-effort, but make the omission visible
        # instead of silently dropping the model from the figure.
        print(f"Skipping ROC curve for {name}: {err}")
        continue
    plt.plot(fpr, tpr, label=f"{name} (AUC={auc_value:.3f})")

# Diagonal = chance-level classifier.
plt.plot([0,1],[0,1],'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves")
plt.legend()
plt.show()

# -------------------------
# 9) Feature importance (from RandomForest & XGBoost)
# -------------------------
def show_top_feats(model, feature_names, top_n=15, title="Feature importance"):
    """Display and plot the ``top_n`` most important features of a fitted model.

    Args:
        model: Estimator expected to expose ``feature_importances_``.
        feature_names: Sequence of names, indexed like the importances.
        top_n: Maximum number of features to show.
        title: Title of the bar plot.

    Prints a notice and returns without plotting when the model does not
    provide ``feature_importances_``. Any other failure (e.g. a length
    mismatch between names and importances) is allowed to propagate rather
    than being misreported as a missing attribute.
    """
    # getattr with a default only absorbs AttributeError, i.e. exactly the
    # "attribute not available" case the message below describes.
    importances = getattr(model, "feature_importances_", None)
    if importances is None:
        print("Model has no feature_importances_ attribute.")
        return
    idx = np.argsort(importances)[::-1][:top_n]  # indices, most important first
    top_features = [(feature_names[i], importances[i]) for i in idx]
    df_imp = pd.DataFrame(top_features, columns=['feature','importance'])
    display(df_imp)
    plt.figure(figsize=(6,4))
    sns.barplot(x='importance', y='feature', data=df_imp)
    plt.title(title)
    plt.tight_layout()
    plt.show()

# Report the top-15 features for each tree-based model in turn.
for model_label, fitted_model in (("RandomForest", rf), ("XGBoost", xg)):
    print(f"\nTop features ({model_label}):")
    show_top_feats(fitted_model, X.columns, top_n=15, title=f"{model_label} feature importance")

# -------------------------
# 10) K-Means clustering + PCA visualization
# -------------------------
print("\nRunning K-Means clustering (k=3) and PCA visualization.")
k = 3
# Cluster the whole feature matrix, standardised with the scaler fitted on the
# training split.
X_scaled_full = scaler.transform(X)
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X_scaled_full)

sil = silhouette_score(X_scaled_full, clusters)
print("Silhouette score:", sil)

# Project onto the first two principal components purely for plotting.
pca = PCA(n_components=2, random_state=42)
pca_points = pca.fit_transform(X_scaled_full)
pc1, pc2 = pca_points[:, 0], pca_points[:, 1]
plt.figure()
sns.scatterplot(x=pc1, y=pc2, hue=clusters, palette='tab10', legend='full')
plt.title("KMeans clusters (PCA projection)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()


def _percent_positive(targets):
    """Share of positive targets in a group, as a percentage."""
    return targets.mean()*100


# Per-cluster size and percentage of at-risk samples.
cluster_df = pd.DataFrame({'cluster': clusters, 'target': y})
per_cluster = cluster_df.groupby('cluster')
summary = per_cluster.agg(
    size=('target', 'size'),
    at_risk_pct=('target', _percent_positive),
).reset_index()
display(summary)

# -------------------------
# 11) Simple ablation study
# -------------------------
print("\nRunning a simple ablation-style check on grouped features (if present).")


def _columns_containing(substrings):
    """Return the columns of X whose lower-cased name contains any substring."""
    return [c for c in X.columns if any(k in c.lower() for k in substrings)]


# Heuristic groups (temporal, behavioral, environmental) picked by substring
# match on the column names.
temporal_keys = _columns_containing(['time','night','after','pm','duration','hours','sleep'])
behavioral_keys = _columns_containing(['screen','device','use','usage','recreat','educat','ratio','dependency','addict','habit'])
environment_keys = _columns_containing(['urban','rural','outdoor','house','home','env','store'])

def ablation_eval(drop_cols):
    """Retrain XGBoost without ``drop_cols`` and return its test accuracy.

    Uses the module-level ``X`` and ``y`` with the same 80/20 stratified split
    (seed 42) as the main pipeline. Returns ``None`` when ``drop_cols`` is
    empty or when removing the columns would leave no features.
    """
    if not len(drop_cols):
        return None

    # Only drop names that actually exist in X.
    present = list(set(drop_cols) & set(X.columns))
    reduced = X.drop(columns=present)
    if reduced.shape[1] < 1:
        return None

    parts = train_test_split(reduced, y, test_size=0.2, random_state=42, stratify=y)
    train_feats, test_feats, train_labels, test_labels = parts

    # Scale with statistics from the training partition only.
    ab_scaler = StandardScaler()
    train_scaled = ab_scaler.fit_transform(train_feats)
    test_scaled = ab_scaler.transform(test_feats)

    clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    clf.fit(train_scaled, train_labels)
    return accuracy_score(test_labels, clf.predict(test_scaled))

# Baseline: the full-feature XGBoost model evaluated on the held-out split.
base_acc = accuracy_score(y_test, y_pred_xg)
print(f"Baseline XGBoost accuracy: {base_acc:.4f}")

for name, keys in [('Temporal', temporal_keys), ('Behavioral', behavioral_keys), ('Environmental', environment_keys)]:
    acc = ablation_eval(keys)
    if acc is not None:
        # \u0394 is the Greek capital delta; written as an escape so the
        # source stays ASCII and cannot be mis-decoded again.
        print(f"Remove {name} ({len(keys)} cols) --> XGBoost accuracy: {acc:.4f}  (\u0394 = {acc - base_acc:+.4f})")
    else:
        print(f"Remove {name}: no matching columns found or insufficient features to evaluate.")

# -------------------------
# 12) Small Q-Learning RL simulation for parental interventions
# -------------------------
print("\nRunning a small Q-Learning simulation for parental actions (toy model).")
# Action set and number of toy 'risk level' states.
actions = ["Set_Limits", "Co_View", "Digital_Detox", "Parental_Control"]
n_states = 5
# One row per state, one column per action, all values starting at zero.
Q = np.zeros(shape=(n_states, len(actions)))
# Learning rate, discount factor, exploration probability.
alpha, gamma, epsilon = 0.1, 0.95, 0.2
def get_reward(state, action_idx):
    """Return the toy reward for taking ``action_idx``; ``state`` is unused.

    Action 2 yields 4.0, action 0 yields 3.0, action 1 yields 2.0 and any
    other action yields 1.0.
    """
    # Checked in the same order as the reward ranking: best action first.
    reward_table = ((2, 4.0), (0, 3.0), (1, 2.0))
    for idx, reward in reward_table:
        if action_idx == idx:
            return reward
    return 1.0

# Dedicated seeded generator: keeps the simulation reproducible (like the
# seed-42 models above) without touching the global `random` state.
rng = random.Random(42)

for _ in range(1000):                      # episodes
    state = rng.randint(0, n_states-1)     # random start state
    for _ in range(10):                    # steps per episode
        # Epsilon-greedy action selection.
        if rng.random() < epsilon:
            action = rng.randint(0, len(actions)-1)
        else:
            action = np.argmax(Q[state])
        r = get_reward(state, action)
        # Move up one state with probability 0.3, otherwise down one, clamped
        # to the valid state range.
        step = 1 if rng.random() < 0.3 else -1
        next_state = max(0, min(n_states-1, state + step))
        # Standard Q-learning update toward the bootstrapped target.
        td_target = r + gamma*np.max(Q[next_state])
        Q[state, action] = Q[state, action] + alpha*(td_target - Q[state, action])
        state = next_state

# show the learned Q values (one row per state, one column per action)
q_table = pd.DataFrame(Q, columns=actions)
display(q_table)
best_actions = q_table.idxmax(axis=1)
print("Best action per state (0..4):", list(best_actions))

# -------------------------
# 13) Save key plots to files in runtime (optional)
# -------------------------
out_dir = "results"
os.makedirs(out_dir, exist_ok=True)

# Plot the ANN's training and validation loss per epoch and write it to disk
# without showing it inline.
plt.figure()
for history_key, curve_label in (('loss', 'train_loss'), ('val_loss', 'val_loss')):
    plt.plot(history.history.get(history_key, []), label=curve_label)
plt.legend()
plt.title("ANN loss history")
loss_plot_path = os.path.join(out_dir, 'ann_loss.png')
plt.savefig(loss_plot_path)
plt.close()

print("\nAll done. Results (including a few saved plots) are in the 'results' folder in the notebook runtime.")
print("If you'd like, I can: (1) tailor feature selection for your dataset, (2) add LSTM temporal modeling, or (3) export results to a PDF. Tell me which next.")

Please upload your dataset CSV file (the first uploaded file will be used).
