In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [5]:
# Load the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Data preprocessing
def preprocess_data(df):
    # Drop irrelevant columns
    df.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
    
    # Encode categorical variables
    encoder = LabelEncoder()
    df['Sex'] = encoder.fit_transform(df['Sex'])
    df['Embarked'] = encoder.fit_transform(df['Embarked'].fillna('N'))
    
    # Fill missing values in Age and Fare with median
    df['Age'].fillna(df['Age'].median(), inplace=True)
    df['Fare'].fillna(df['Fare'].median(), inplace=True)
    #df['Age'].fillna(df['Age'].mean(), inplace=True)
    #df['Fare'].fillna(df['Fare'].mean(), inplace=True)
    #df['Age'].fillna(df['Age'].mode()[0], inplace=True)
    #df['Fare'].fillna(df['Fare'].mode()[0], inplace=True)
    return df

train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

# Feature selection
X = train_data.drop('Survived', axis=1)
y = train_data['Survived']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Model evaluation on validation set
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Model accuracy on validation set: {accuracy:.2f}")

# Make predictions on test set
test_predictions = model.predict(test_data)

# Create a submission DataFrame using original 'PassengerId' values
Result = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': test_predictions})

# Save predictions in a CSV file
Result.to_csv('Result.csv', index=False)


Model accuracy on validation set: 0.83


""' now lets use Hyperparameter to see if we can improve our model accuracy or not """

In [3]:
from sklearn.model_selection import GridSearchCV

In [6]:
# Load the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Data preprocessing
def preprocess_data(df):
    # Drop irrelevant columns
    df.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
    
    # Encode categorical variables
    encoder = LabelEncoder()
    df['Sex'] = encoder.fit_transform(df['Sex'])
    df['Embarked'] = encoder.fit_transform(df['Embarked'].fillna('N'))
    
    # Fill missing values in Age and Fare with median
    df['Age'].fillna(df['Age'].median(), inplace=True)
    df['Fare'].fillna(df['Fare'].median(), inplace=True)
    return df

train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

# Feature selection
X = train_data.drop('Survived', axis=1)
y = train_data['Survived']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best hyperparameters found
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Model training with best hyperparameters
best_model = RandomForestClassifier(**best_params, random_state=42)
best_model.fit(X_train, y_train)

# Model evaluation on validation set
y_pred = best_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Model accuracy on validation set: {accuracy:.2f}")

# Make predictions on test set
test_predictions = best_model.predict(test_data)

# Create a submission DataFrame using original 'PassengerId' values
Result = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': test_predictions})

# Save predictions in a CSV file
Result.to_csv('HyperparameterResult.csv', index=False)


Fitting 5 folds for each of 216 candidates, totalling 1080 fits


Best Hyperparameters: {'bootstrap': False, 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Model accuracy on validation set: 0.82


""" the accuracy does not change much, so lets try another model """

In [4]:
from xgboost import XGBClassifier

In [None]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Data preprocessing
def preprocess_data(df):
    # Drop irrelevant columns
    df.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
    
    # Encode categorical variables
    encoder = LabelEncoder()
    df['Sex'] = encoder.fit_transform(df['Sex'])
    df['Embarked'] = encoder.fit_transform(df['Embarked'].fillna('N'))
    
    # Fill missing values in Age and Fare with median
    df['Age'].fillna(df['Age'].median(), inplace=True)
    df['Fare'].fillna(df['Fare'].median(), inplace=True)
    return df

train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

# Feature selection
X = train_data.drop('Survived', axis=1)
y = train_data['Survived']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = XGBClassifier(random_state=42)
model.fit(X_train, y_train)

# Model evaluation on validation set
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Model accuracy on validation set: {accuracy:.2f}")

# Make predictions on test set
test_predictions = model.predict(test_data)

# Create a submission DataFrame using original 'PassengerId' values
Result = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': test_predictions})

# Save predictions in a CSV file
Result.to_csv('Result.csv', index=False)