In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Titanic preprocessing pipeline: load the CSV, impute missing values,
# derive family/title features, one-hot encode the categoricals, standardize
# the numeric columns, and produce a stratified train/test split.

# 1. Load the dataset.
df = pd.read_csv("titanic.csv")

# 2. Fill in missing values.
# Assign the filled column back rather than calling fillna(inplace=True) on
# df[col]: the chained in-place form operates on an intermediate object, is
# deprecated in pandas 2.x, and leaves df unchanged under Copy-on-Write.
df["Age"] = df["Age"].fillna(df["Age"].median())
# mode() can return several values on a tie; [0] takes the first one.
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])
df = df.drop(columns=["Cabin"])
df["Fare"] = df["Fare"].fillna(df["Fare"].mean())

# 3. Feature engineering: build new features.
# FamilySize counts siblings/spouses + parents/children + the passenger.
df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
df["IsAlone"] = (df["FamilySize"] == 1).astype(int)

# The title is the run of letters that follows a space and ends with a period
# in Name. A raw string keeps "\." a regex escape instead of an invalid
# Python string escape.
df["Title"] = df["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False)
# Fold spelling variants into their common equivalents.
df["Title"] = df["Title"].replace(["Mlle", "Ms"], "Miss").replace(["Mme"], "Mrs")
# Titles that occur fewer than 10 times are grouped under "Other".
title_counts = df["Title"].value_counts()
rare_titles = title_counts[title_counts < 10].index
df["Title"] = df["Title"].replace(list(rare_titles), "Other")

# 4. Encode the categorical columns as dummy/indicator columns.
# drop_first=True drops the first level of each encoded column.
df = pd.get_dummies(df, columns=["Sex", "Embarked", "Title"], drop_first=True)

# 5. Standardize the numeric columns.
# NOTE(review): the scaler (and the imputation statistics in step 2) are
# fitted on the full dataset before the train/test split below, so test rows
# influence the preprocessing. If that leakage matters, split first and fit
# on X_train only.
numeric_cols = ["Age", "Fare", "FamilySize"]
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# 6. Features and target.
# The identifier and free-text columns are excluded from the feature matrix.
X = df.drop(columns=["Survived", "PassengerId", "Name", "Ticket"])
y = df["Survived"]

# 7. Split the data into train and test sets (80/20), stratified on the
# target; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 8. Show the result of the transformations.
print("✅ چند ردیف اول دیتاست بعد از Feature Engineering:")
print(df.head())
