# In [3]:  (Jupyter notebook cell prompt)
# =============================
# 1. Import Libraries
# =============================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# =============================
# 2. Load Dataset
# =============================
# File names are kept in one place so they are easy to repoint; all paths are
# resolved relative to the current working directory.
TRAIN_CSV = "train (2).csv"
TEST_CSV = "test (1).csv"
GENDER_SUBMISSION_CSV = "gender_submission.csv"

# Read the three CSV files in a fixed order (train, test, sample submission).
train, test, gender_submission = [
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_CSV, TEST_CSV, GENDER_SUBMISSION_CSV)
]

# Quick sanity check: dimensions of both frames and the first training rows.
for shape_label, frame in (("Train Shape:", train), ("Test Shape:", test)):
    print(shape_label, frame.shape)
print(train.head())

# =============================
# 3. Exploratory Data Analysis (EDA)
# =============================
# Each entry is (drawing callable, figure title, output PNG name). The loop
# below gives every plot the same figure size and the same
# title -> save -> show sequence.
eda_plots = [
    (
        lambda: sns.countplot(data=train, x="Survived", palette="viridis"),
        "Survival Count (0=No, 1=Yes)",
        "survival_count.png",
    ),
    (
        lambda: sns.countplot(data=train, x="Sex", hue="Survived", palette="Set2"),
        "Survival by Gender",
        "survival_by_gender.png",
    ),
    (
        # Missing ages are dropped for the histogram only; `train` is not modified.
        lambda: sns.histplot(train["Age"].dropna(), bins=30, kde=True, color="blue"),
        "Age Distribution",
        "age_distribution.png",
    ),
    (
        lambda: sns.countplot(data=train, x="Pclass", hue="Survived", palette="Set1"),
        "Survival by Passenger Class",
        "survival_by_class.png",
    ),
]

for draw_plot, plot_title, png_name in eda_plots:
    plt.figure(figsize=(6, 4))
    draw_plot()
    plt.title(plot_title)
    # Save before show so the PNG is written while the figure is still current.
    plt.savefig(png_name)
    plt.show()

# =============================
# 4. Data Preprocessing
# =============================

# Columns fed to the models. Defined once so the training matrix and the
# test matrix are always built from the same columns in the same order.
FEATURES = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Fill missing values.
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form emits a
# FutureWarning on recent pandas 2.x releases and does not modify the
# DataFrame when copy-on-write is enabled, which would leave NaNs in the
# features passed to the scaler and the models.
train["Age"] = train["Age"].fillna(train["Age"].median())
test["Age"] = test["Age"].fillna(test["Age"].median())
train["Embarked"] = train["Embarked"].fillna(train["Embarked"].mode()[0])
test["Fare"] = test["Fare"].fillna(test["Fare"].median())

# Encode categorical variables.
# The same LabelEncoder object is re-fitted for each column, so every
# `transform` on `test` must come right after the matching `fit_transform`
# on `train`. After this section `le` holds the "Embarked" classes.
le = LabelEncoder()
train["Sex"] = le.fit_transform(train["Sex"])
test["Sex"] = le.transform(test["Sex"])
train["Embarked"] = le.fit_transform(train["Embarked"])
test["Embarked"] = le.transform(test["Embarked"])

# Select features and target
X = train[FEATURES]
y = train["Survived"]

# Split into train & validation (80/20, fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale data.
# The scaler is fitted on the training split only; the validation split and
# the test set are transformed with those same fitted statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
test_scaled = scaler.transform(test[FEATURES])

# =============================
# 5. Model Training
# =============================
# `fit` returns the fitted estimator, so construction and training are
# chained. Both models are trained on the same scaled training split and
# then predict on the validation split.

# Logistic Regression (max_iter raised to 1000)
log_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_log = log_model.predict(X_val)

# Random Forest (200 trees, fixed seed)
rf_model = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_val)

# =============================
# 6. Model Evaluation
# =============================
# Compute each validation accuracy once and reuse it for both the printed
# report and the comparison chart.
log_accuracy = accuracy_score(y_val, y_pred_log)
rf_accuracy = accuracy_score(y_val, y_pred_rf)

# For each model: accuracy line, confusion matrix, classification report.
for heading, accuracy, predictions in (
    ("\nLogistic Regression Accuracy:", log_accuracy, y_pred_log),
    ("\nRandom Forest Accuracy:", rf_accuracy, y_pred_rf),
):
    print(heading, accuracy)
    print(confusion_matrix(y_val, predictions))
    print(classification_report(y_val, predictions))

# Compare models
models = ["Logistic Regression", "Random Forest"]
scores = [log_accuracy, rf_accuracy]

plt.figure(figsize=(6, 4))
sns.barplot(x=models, y=scores, palette="coolwarm")
plt.ylabel("Accuracy")
plt.title("Model Accuracy Comparison")
plt.savefig("model_accuracy.png")
plt.show()

# =============================
# 7. Final Prediction for Submission
# =============================
# The Random Forest is used for the submission (the original author's note
# says it had the better accuracy).
final_predictions = rf_model.predict(test_scaled)

# Two-column frame: the passenger id from the test set plus the predicted label.
submission = test[["PassengerId"]].assign(Survived=final_predictions)

SUBMISSION_CSV = "submission.csv"
submission.to_csv(SUBMISSION_CSV, index=False)
print("\nSubmission file created: submission.csv")

# =============================
# 8. Save Graphs for GitHub/LinkedIn
# =============================
# Nothing is written here: the PNGs were saved where each plot was drawn.
# This line only reports completion.
print("\n All graphs saved (PNG) and submission.csv generated.")


# Output of the recorded run (the input CSV was not in the working directory):
# FileNotFoundError: [Errno 2] No such file or directory: 'train (2).csv'