In [9]:
# ========= 0) Write a tiny dataset inline (no downloads) =========
import pathlib, textwrap, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Detect project root (assumes this notebook is inside ./notebooks)
cwd = pathlib.Path.cwd()
project_root = cwd.parents[0] if cwd.name == "notebooks" else cwd
data_dir = project_root / "data"
data_dir.mkdir(parents=True, exist_ok=True)

csv_text = textwrap.dedent("""
PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,Braund,male,22,1,0,A/5 21171,7.25,,S
2,1,1,Cumings,female,38,1,0,PC 17599,71.2833,C85,C
3,1,3,Heikkinen,female,26,0,0,STON/O2. 3101282,7.925,,S
4,1,1,Futrelle,female,35,1,0,113803,53.1,C123,S
5,0,3,Allen,male,35,0,0,373450,8.05,,S
6,0,3,Moran,male,,0,0,330877,8.4583,,Q
7,0,1,McCarthy,male,54,0,0,17463,51.8625,E46,S
8,0,3,Palsson,male,2,3,1,349909,21.075,,S
9,1,3,Johnson,female,27,0,2,347742,11.1333,,S
10,1,2,Nasser,female,14,1,0,237736,30.0708,,C
11,1,3,Sandstrom,female,4,1,1,PP 9549,16.7,G6,S
12,1,1,Bonnell,female,58,0,0,113783,26.55,C103,S
13,0,3,Saundercock,male,20,0,0,A/5. 2151,8.05,,S
14,0,3,Andersson,male,39,1,5,347082,31.275,,S
15,0,3,Vestrom,female,14,0,0,350406,7.8542,,S
16,1,2,Hewlett,female,55,0,0,248706,16,,S
17,0,3,Rice,male,2,4,1,382652,29.125,,Q
18,1,2,Williams,female,,0,0,244373,13,,S
19,0,3,Vander Planke,female,31,1,0,345763,18,,S
20,1,3,Masselmani,female,,0,0,2649,7.225,,C
""").strip()

(data_dir / "sample.csv").write_text(csv_text, encoding="utf-8")
print("Wrote dataset to:", data_dir / "sample.csv")

# ========= 1) Load and clean safely =========
df = pd.read_csv(data_dir / "sample.csv")

# Drop ID-like or high-missing fields to simplify
df = df.drop(columns=[c for c in ["PassengerId","Name","Ticket","Cabin"] if c in df.columns], errors="ignore")

# Coerce numeric-looking columns to numeric and impute
for c in df.columns:
    if c not in ["Sex","Embarked","Survived"]:  # likely numeric
        df[c] = pd.to_numeric(df[c], errors="coerce")
# Fill numeric with median
for c in df.select_dtypes(include=["number"]).columns:
    df[c] = df[c].fillna(df[c].median())
# Fill categorical with mode
for c in df.select_dtypes(include=["object","category"]).columns:
    df[c] = df[c].fillna(df[c].mode().iloc[0])

# ========= 2) Split features/target =========
target = "Survived"
X = df.drop(columns=[target])
y = df[target]

num_cols = X.select_dtypes(include=["number"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object","category","bool"]).columns.tolist()

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)

# ========= 3) Pipeline (robust preprocessing) =========
pre = ColumnTransformer(
    transformers=[
        ("num", "passthrough", num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ],
    remainder="drop"
)

clf = Pipeline(steps=[
    ("prep", pre),
    ("logreg", LogisticRegression(max_iter=2000))
])

# ========= 4) Train/test split, fit, evaluate =========
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y if y.nunique() > 1 else None
)

clf.fit(X_train, y_train)
pred = clf.predict(X_test)

print("Accuracy:", round(accuracy_score(y_test, pred), 4))
print(classification_report(y_test, pred, zero_division=0))


Wrote dataset to: C:\Users\DELL\data\sample.csv
Numeric columns: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
Categorical columns: ['Sex', 'Embarked']
Accuracy: 0.5
              precision    recall  f1-score   support

           0       0.50      0.33      0.40         3
           1       0.50      0.67      0.57         3

    accuracy                           0.50         6
   macro avg       0.50      0.50      0.49         6
weighted avg       0.50      0.50      0.49         6

