In [32]:
import pandas as pd

# Read the training and test CSVs in one pass; both train.csv and test.csv
# must be present in the current working directory.
df, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))


# Define the clean function
def clean(df):
    """Drop unused columns and impute missing values.

    Removes the ``Ticket``, ``Cabin``, ``Name`` and ``PassengerId`` columns,
    fills missing values in ``SibSp``, ``Parch``, ``Fare`` and ``Age`` with
    that column's own median (computed on the frame passed in), and fills
    missing ``Embarked`` values with the placeholder ``"U"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns dropped and the gaps filled; the
        frame passed in is left untouched.

    Raises
    ------
    KeyError
        If any of the columns to drop or impute is missing from ``df``.
    """
    cleaned = df.drop(columns=["Ticket", "Cabin", "Name", "PassengerId"])

    # Assign the filled column back instead of calling
    # ``fillna(inplace=True)`` on a selected column: that chained in-place
    # form triggers a FutureWarning in pandas 2.x and, with Copy-on-Write
    # enabled (the default from pandas 3.0), only updates a temporary
    # object, leaving the NaNs in the frame.
    numeric_cols = ["SibSp", "Parch", "Fare", "Age"]
    for col in numeric_cols:
        cleaned[col] = cleaned[col].fillna(cleaned[col].median())

    cleaned["Embarked"] = cleaned["Embarked"].fillna("U")
    return cleaned

# Run the same cleaning step over the training and test frames,
# rebinding each name to its cleaned result.
df, test = clean(df), clean(test)


In [33]:
# Show the first five rows of the cleaned training frame.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [34]:
from sklearn.preprocessing import LabelEncoder

# Cast both categorical columns to str in each frame before encoding.
for col in ("Sex", "Embarked"):
    df[col] = df[col].astype(str)
    test[col] = test[col].astype(str)

# 'Sex': fit a single encoder on the train and test values together, then
# overwrite the column in each frame with its integer codes.
combined_sex = pd.concat([df["Sex"], test["Sex"]])
le_sex = LabelEncoder().fit(combined_sex)
df["Sex"] = le_sex.transform(df["Sex"])
test["Sex"] = le_sex.transform(test["Sex"])

# 'Embarked': same procedure with its own encoder.
combined_embarked = pd.concat([df["Embarked"], test["Embarked"]])
le_embarked = LabelEncoder().fit(combined_embarked)
df["Embarked"] = le_embarked.transform(df["Embarked"])
test["Embarked"] = le_embarked.transform(test["Embarked"])

# Report the label order each encoder learned (position == integer code).
print("Sex classes:", le_sex.classes_)
print("Embarked classes:", le_embarked.classes_)

df.head()


Sex classes: ['female' 'male']
Embarked classes: ['C' 'Q' 'S' 'U']


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Features are every column except "Survived"; "Survived" is the target.
X = df.drop(columns="Survived")
y = df["Survived"]

# Hold out 20% of the rows for validation, with a fixed seed for the shuffle.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [36]:

from sklearn.linear_model import LogisticRegression

# Build the classifier first, then fit it on the training split.
# ``fit`` returns the estimator itself, so ``clf`` ends up as the fitted model.
clf = LogisticRegression(max_iter=1000, random_state=0)
clf = clf.fit(X_train, y_train)



In [37]:
from sklearn.metrics import accuracy_score

# Predict on the validation split and display the fraction of correct labels.
predictions = clf.predict(X_val)
accuracy_score(y_true=y_val, y_pred=predictions)

0.8100558659217877