In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print one path per line in the order os.walk visits them.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load the Kaggle Titanic training and test sets.
train_df = pd.read_csv('/kaggle/input/titanic/train.csv')
test_df = pd.read_csv('/kaggle/input/titanic/test.csv')

# Preprocess the data

# Hand-picked model inputs. Feature-importance or correlation analysis could
# be used to revisit this list.
relevant_features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# Continuous measurements are imputed separately from the discrete/categorical
# columns: the mode of a continuous variable is an arbitrary value, and running
# one imputer over a mixed string/number frame coerces every column to object.
# Assumes Age and Fare are parsed as numeric columns by read_csv (median
# imputation requires numeric input) -- TODO confirm against the CSV.
continuous_features = ['Age', 'Fare']
discrete_features = [col for col in relevant_features if col not in continuous_features]

# Handle missing values. Both imputers learn their fill values from the
# training set only and reuse them on the test set.
numeric_imputer = SimpleImputer(strategy='median')
train_df[continuous_features] = numeric_imputer.fit_transform(train_df[continuous_features])
test_df[continuous_features] = numeric_imputer.transform(test_df[continuous_features])

imputer = SimpleImputer(strategy='most_frequent')
train_df[discrete_features] = imputer.fit_transform(train_df[discrete_features])
test_df[discrete_features] = imputer.transform(test_df[discrete_features])

# Encode categorical variables as numeric. The same code tables are applied to
# both frames; any label missing from a table would be mapped to NaN.
sex_codes = {'male': 0, 'female': 1}
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
for frame in (train_df, test_df):
    frame['Sex'] = frame['Sex'].map(sex_codes)
    frame['Embarked'] = frame['Embarked'].map(embarked_codes)

# Standardise every feature (zero mean, unit variance) using statistics learned
# from the training set. No log or Box-Cox transform is applied here; those
# remain an option for skewed features.
# NOTE: the imputers and the scaler are fitted on all training rows, including
# the rows that are later held out for validation, so validation scores are
# slightly optimistic.
scaler = StandardScaler()
train_df[relevant_features] = scaler.fit_transform(train_df[relevant_features])
test_df[relevant_features] = scaler.transform(test_df[relevant_features])

# Split the data into features (X) and labels (y)
X_train = train_df[relevant_features]
y_train = train_df['Survived']
X_test = test_df[relevant_features]

# Hold out 20% of the labelled rows for validation.
# random_state pins the shuffle so every run produces the same split (and thus
# comparable scores); stratify keeps the survived/not-survived ratio the same
# in the training and validation parts.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Train the model
# Three base learners combined by a VotingClassifier with its default settings.
# The gradient-boosting model is seeded so that this fit, and every later grid
# search or cross-validation that reuses `bTree`, gives repeatable results.
log_reg = LogisticRegression()
bTree = GradientBoostingClassifier(min_samples_split=10, random_state=42)
svm = SVC()
model = VotingClassifier(estimators=[('lr', log_reg), ('dt', bTree), ('svm', svm)])
model.fit(X_train, y_train)


# Fine-tune each base learner on its own with a 5-fold grid search.
# Each entry holds: the message printed next to the winning parameters,
# the estimator to tune, and the parameter grid to search.
tuning_plan = [
    ("Best parameters for Boosted forest: ", bTree, {'n_estimators': [50, 100, 200]}),
    ("Best parameters for SVM: ", svm, {'C': [0.1, 1, 10]}),
    ("Best parameters for logistic regression: ", log_reg, {'C': [0.1, 1, 10]}),
]

tuned_estimators = []
for message, estimator, param_grid in tuning_plan:
    grid_search = GridSearchCV(estimator, param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    tuned_estimators.append(grid_search.best_estimator_)
    print(message, grid_search.best_params_)

# Unpack in the same order as the plan above.
best_bTree, best_svm, best_log_reg = tuned_estimators

# Rebuild the voting ensemble from the individually tuned learners and refit it.
model = VotingClassifier(
    estimators=[('lr', best_log_reg), ('dt', best_bTree), ('svm', best_svm)]
)
model.fit(X_train, y_train)

# Score the tuned ensemble on the held-out validation rows.
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print("Accuracy: ", accuracy)

# Fine-tune the voting classifier's weights using a 5-fold grid search.
# Every combination of weights 1 and 2 is tried except [2, 2, 2], which is only
# a rescaling of [1, 1, 1] and therefore yields the same votes.
param_grid = {'weights': [[1, 1, 1], [2, 1, 1], [1, 2, 1], [1, 1, 2], [2, 2, 1], [2, 1, 2], [1, 2, 2]]}

# Build the ensemble from the individually tuned learners so the hyperparameter
# search results carry through to the model used for the final predictions.
# No separate model.fit() call is needed: GridSearchCV fits clones of the
# estimator and refits the best configuration on X_train itself.
model = VotingClassifier(estimators=[('lr', best_log_reg), ('dt', best_bTree), ('svm', best_svm)])
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print("Best parameters for voting classifier: ", grid_search.best_params_)

# Evaluate the weight-tuned ensemble on the held-out validation rows.
y_pred = best_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Accuracy again: ", accuracy)

# Report the mean 5-fold cross-validated score of each base learner (as
# originally configured, not the grid-searched versions) on the training split.
# Each entry pairs the message to print with the estimator to evaluate.
cv_reports = (
    ("Accuracy of logistic regression classifier: ", log_reg),
    ("Accuracy of Boosted forest classifier: ", bTree),
    ("Accuracy of SVM classifier: ", svm),
)
for message, estimator in cv_reports:
    scores = cross_val_score(estimator, X_train, y_train, cv=5)
    print(message, scores.mean())


# Predict labels for the test-set features with the weight-tuned ensemble.
y_pred = best_model.predict(X_test)

# Assemble the two-column submission frame (passenger id + predicted label)
# and write it, without the index column, to submission.csv.
submission_columns = {
    'PassengerId': test_df['PassengerId'],
    'Survived': y_pred,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)


Accuracy:  0.8435754189944135                                 
Best parameters for random forest:  {'n_estimators': 200}                                        
Best parameters for SVM:  {'C': 1}                                        
Best parameters for logistic regression:  {'C': 0.1}                                        