In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold, cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier


from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Load the training set from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Show a small, reproducible random sample rather than dumping the whole frame.
data.sample(5, random_state=32)

In [None]:
# `data` is already a DataFrame (it comes from pd.read_csv), so re-wrapping it
# in pd.DataFrame is unnecessary; just preview the first rows.
data.head()

In [None]:
# Last five rows of the training frame.
data.tail(n=5)

In [None]:
# (number of rows, number of columns)
(len(data.index), len(data.columns))

In [None]:
# Summary statistics for the numeric columns (default quartiles spelled out).
data.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Count of missing values per column.
data.isna().sum()

In [None]:
# Impute missing values. The filled column is assigned back instead of calling
# fillna(inplace=True) on `data[col]`: that chained form acts on an intermediate
# object and does not reliably update `data` (it is a no-op under Copy-on-Write).
data['Age'] = data['Age'].fillna(data['Age'].median())

# mode() returns a Series (ties are possible). Passing that Series to fillna
# aligns on index labels and would only fill a gap at label 0, so take the
# first modal value as a scalar.
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode().iloc[0])

In [None]:
# Number of missing values left in `Embarked` after imputation.
data['Embarked'].isna().sum()

In [None]:
# Preview the first rows after imputation instead of dumping the whole frame.
data.head()

In [None]:
# Mean of `Survived` per sex, highest rate first.
mean_survival_per_sex = data.groupby('Sex')['Survived'].mean()
survival_rate_by_sex = mean_survival_per_sex.sort_values(ascending=False)

In [None]:
# Bar chart of the per-sex survival rate. The aggregated Series supplies both
# axes directly, so the passenger-level frame is not passed as `data=`
# (its length is unrelated to the two aggregated values being plotted).
ax = sns.barplot(x=survival_rate_by_sex.index, y=survival_rate_by_sex.values, palette='husl')
ax.set(xlabel='Sex', ylabel='Survival rate', title='Survival rate by sex')
plt.show()

In [None]:
# Summary statistics for the Age column.
data['Age'].describe()

In [None]:
# Decade-wide, left-closed age bins: [0, 10), [10, 20), ..., [70, 80), [80, inf).
# With right=False the last edge is exclusive, so without the open-ended top
# bin an age of exactly 80 (or more) would be assigned NaN.
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, np.inf]
labels = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80+']

data['age_group'] = pd.cut(data['Age'], bins=bins, right=False, labels=labels, include_lowest=True)

In [None]:
# Mean survival per age bucket. `age_group` is categorical; observed=False is
# stated explicitly so empty buckets stay on the axis regardless of the pandas
# default for categorical groupers.
survival_rate_by_age = data.groupby('age_group', observed=False)['Survived'].mean()

survival_rate_by_age_group = survival_rate_by_age.reset_index()
sns.barplot(x='age_group', y='Survived', data=survival_rate_by_age_group, palette='husl')
plt.show()

In [None]:
# Summary statistics for the Fare column.
data['Fare'].describe()

In [None]:
# Fare buckets. The labels ('0-50', '51-100', ...) describe right-closed
# intervals, so bin with right=True; include_lowest=True keeps a fare of
# exactly 0 inside the first bucket. Resulting intervals:
# [0, 50], (50, 100], (100, 200], ..., (500, 600].
# NOTE(review): fares above 600 would still fall outside every bin (NaN).
fare_bins = [0, 50, 100, 200, 300, 400, 500, 600]
fare_labels = ['0-50', '51-100', '101-200', '201-300', '301-400', '401-500', '501-600']
data['FareRange'] = pd.cut(data['Fare'], bins=fare_bins, labels=fare_labels, right=True, include_lowest=True)

In [None]:
# Mean survival per fare bucket. `FareRange` is categorical; observed=False is
# stated explicitly so empty buckets stay on the axis regardless of the pandas
# default for categorical groupers.
survival_rate_by_fare = data.groupby('FareRange', observed=False)['Survived'].mean()
survival_rate_by_farerange = survival_rate_by_fare.reset_index()
sns.barplot(data=survival_rate_by_farerange, x='FareRange', y='Survived', palette='flare')
plt.show()

In [None]:
# Passenger counts per embarkation port, split by survival outcome.
sns.countplot(
    data=data,
    x='Embarked',
    hue='Survived',
    palette='husl',
)

In [None]:
# Histogram with a KDE overlay for each numeric feature, one figure per column.
num_feature = ['Fare', 'Age']
for column in num_feature:
    sns.displot(data[column], kde=True, color='navy')
    plt.show()

In [None]:
# Average age per sex, as a two-column frame for plotting.
mean_age_per_sex = data.groupby('Sex')['Age'].mean()
avg_age_by_sex = mean_age_per_sex.reset_index()
sns.barplot(data=avg_age_by_sex, x='Sex', y='Age', palette='Set3')

In [None]:
# Passenger counts per ticket class, split by survival outcome.
sns.countplot(
    data=data,
    x='Pclass',
    hue='Survived',
    palette=['#D6A2E8', '#25CCF7'],
)
plt.show()

In [None]:
# Fare distribution per ticket class, split by survival outcome.
# The keyword was misspelled as `pallete`, so the colours were never applied
# as a palette (and the unknown keyword can raise, depending on the seaborn version).
sns.violinplot(x='Pclass', y='Fare', hue='Survived', data=data, split=True, palette=['#F8EFBA', '#FD7272'])

In [None]:
# Age distribution per sex, split by survival outcome.
sns.violinplot(
    data=data,
    x='Sex',
    y='Age',
    hue='Survived',
    split=True,
    palette=['#58b19f', '#f7f1e3'],
)

In [None]:
# Family size = siblings/spouses + parents/children + the passenger themself.
data['Familysize'] = data['SibSp'].add(data['Parch']).add(1)

In [None]:
def family_size_cat(size):
    """Map a family size to a category label.

    Returns 'Solo' for 1, 'Small' for 2-4, 'Medium' for 5-7 and 'Large'
    for every other value (anything that matches none of those ranges).
    """
    # (label, inclusive lower bound, inclusive upper bound)
    bands = (
        ('Solo', 1, 1),
        ('Small', 2, 4),
        ('Medium', 5, 7),
    )
    for label, low, high in bands:
        if low <= size <= high:
            return label
    return 'Large'
    
# Label every passenger's family size with its category.
data['FamilyCat'] = data['Familysize'].map(family_size_cat)

In [None]:
# Preview only the newly derived family columns instead of dumping the whole frame.
data[['Familysize', 'FamilyCat']].head()

In [None]:
# Passenger counts per family-size category, split by survival outcome.
sns.countplot(
    data=data,
    x='FamilyCat',
    hue='Survived',
    palette=['#3dc1d3', '#ea8685'],
)

In [None]:
# Remove identifier/free-text columns and the helper columns that were only
# needed for the exploratory plots or to derive FamilyCat.
unused_columns = [
    'Name', 'Cabin', 'Ticket',
    'SibSp', 'Parch', 'Familysize',
    'age_group', 'FareRange',
    'PassengerId',
]
data = data.drop(columns=unused_columns)

In [None]:
# Preview the remaining columns instead of dumping the whole frame.
data.head()

In [None]:
# Encode sex as 1 (female) / 0 (male). Only the `Sex` column is mapped: a
# frame-wide replace() would rewrite those strings wherever they occur and
# leaves the resulting dtype dependent on pandas' object-downcasting rules.
data['Sex'] = data['Sex'].map({'female': 1, 'male': 0})
data.shape

In [None]:
# Learn the integer code for each family-size category, then apply it.
le = LabelEncoder()
le.fit(data['FamilyCat'])
data['FamilyCat'] = le.transform(data['FamilyCat'])
data

In [None]:
# One-hot encode the embarkation port; the 'S' indicator is dropped so it
# serves as the reference level.
embarked_onehot = pd.get_dummies(data['Embarked'], prefix='Embarked')
embarked_dummies = embarked_onehot.drop(columns='Embarked_S')
data = pd.concat([data, embarked_dummies], axis=1)
data

In [None]:
# Target is `Survived`; features are everything else except the raw
# `Embarked` text column (already represented by its dummy columns).
y = data['Survived']
X = data.drop(columns=['Survived', 'Embarked'])
X

In [None]:
# Standardise the continuous features (zero mean, unit variance).
# NOTE(review): the scaler is fitted on every row of X, i.e. before the
# train/test split performed further down, so held-out rows contribute to the
# scaling statistics.
num_cols = ['Age', 'Fare']
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

In [None]:
# Preview the scaled feature matrix instead of dumping the whole frame.
X.head()

In [None]:
# Hold out 20% of the rows for evaluation; the seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=32)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Seed shared by every estimator that accepts one, so reruns give the same scores.
SEED = 32

# Candidate classifiers, each paired with the hyperparameter grid to search.
models = {
    'RandomForest': (RandomForestClassifier(random_state=SEED),
                     {'n_estimators': [50, 100, 200]}),
    'XGB': (XGBClassifier(random_state=SEED),
            {'max_depth': [3, 5, 7]}),
    'AdaBoost': (AdaBoostClassifier(random_state=SEED),
                 {'n_estimators': [50, 100, 200]}),
    'DecisionTree': (DecisionTreeClassifier(random_state=SEED),
                     {'max_depth': [None, 10, 20, 30]}),
    'SVC': (SVC(),
            {'C': [0.1, 1, 10]}),
    'GradientBoosting': (GradientBoostingClassifier(random_state=SEED),
                         {'n_estimators': [50, 100, 200]}),
    # The parameter is `n_neighbors`; the misspelt `n_neighbor` makes
    # GridSearchCV raise "Invalid parameter" for this estimator.
    'KNN': (KNeighborsClassifier(),
            {'n_neighbors': [3, 5, 7]}),
}

scorer_name = 'F1-score'
scorer = make_scorer(f1_score)

# Track the winner across all candidates. `best_model` is what later cells use
# for the final predictions, so it must be the best model overall, not merely
# the last one the loop happened to fit.
best_model = None
best_model_name = None
best_cv_score = float('-inf')

for name, (model, param_grid) in models.items():
    grid_search = GridSearchCV(model, param_grid, scoring=scorer, cv=5)
    grid_search.fit(X_train, y_train)

    print(f'Best hyperparameter for {name}:{grid_search.best_params_}')
    print(f'Best {scorer_name} score : {grid_search.best_score_:.2%}')

    # Report hold-out metrics for this candidate's tuned estimator.
    tuned_model = grid_search.best_estimator_
    y_pred = tuned_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f'Accuracy : {accuracy:.2%}')
    print(f'Precision : {precision:.2%}')
    print(f'Recall : {recall:.2%}')
    print(f'F1-score : {f1:.2%}')

    # Select on the cross-validated score rather than the hold-out metrics,
    # so the test split is not used for model selection.
    if grid_search.best_score_ > best_cv_score:
        best_cv_score = grid_search.best_score_
        best_model_name = name
        best_model = tuned_model

print(f'Selected model : {best_model_name} (CV {scorer_name} {best_cv_score:.2%})')
          
          

In [None]:
# Load the unlabeled competition test set from the working directory.
test = pd.read_csv(filepath_or_buffer="test.csv")

In [None]:
# Work on an explicit copy so the raw `test` frame stays untouched by the
# in-place preprocessing below (pd.DataFrame(test) may share data with `test`).
test_data = test.copy()
test_data.isnull().sum()

In [None]:
# Impute missing values. The filled column is assigned back instead of calling
# fillna(inplace=True) on `test_data[col]`: that chained form acts on an
# intermediate object and does not reliably update the frame.
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].median())
# mode() returns a Series; take the first modal value as a scalar so that
# every missing entry is filled (a Series argument is aligned by index label).
test_data['Embarked'] = test_data['Embarked'].fillna(test_data['Embarked'].mode().iloc[0])
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].mean())

test_data.duplicated().sum()

In [None]:
# Derive the same family features as for the training data. The
# family_size_cat helper defined earlier in the notebook is reused rather than
# redefined, so the two frames cannot drift apart if the rule changes.
test_data['Familysize'] = test_data['SibSp'] + test_data['Parch'] + 1
test_data['FamilyCat'] = test_data['Familysize'].apply(family_size_cat)

In [None]:
# Remove free-text columns and the raw family counts now summarised by FamilyCat.
test_unused_columns = ['Name', 'Cabin', 'Ticket', 'SibSp', 'Parch', 'Familysize']
test_data = test_data.drop(columns=test_unused_columns)

In [None]:
# Apply to the test set exactly the transformations fitted on the training data.

# Encode sex on its own column only (see the matching training step).
test_data['Sex'] = test_data['Sex'].map({'female': 1, 'male': 0})

# Reuse the encoder fitted on the training data. Re-fitting it here would
# derive the codes from the test set's own categories, which can differ from
# the codes the model was trained on.
test_data['FamilyCat'] = le.transform(test_data['FamilyCat'])

embarked_dummies = pd.get_dummies(test_data['Embarked'], prefix='Embarked').drop('Embarked_S', axis=1)
test_data = pd.concat([test_data, embarked_dummies], axis=1)
test_data.drop('Embarked', axis=1, inplace=True)

# The model was trained on standardised Age/Fare, so the test features must
# pass through the same fitted scaler before prediction.
test_data[num_cols] = scaler.transform(test_data[num_cols])

In [None]:
# Confirm that no missing values remain in the prepared test features.
test_data.isna().sum()

In [None]:
# Set the IDs aside for the submission file and remove them from the features
# (pop returns the column and deletes it from the frame in one step).
PassengerId = test_data.pop('PassengerId')

In [None]:
# Predict survival for every row of the prepared test features.
test_features = test_data
test_pred = best_model.predict(test_features)

In [None]:
# Assemble the two-column submission table and write it without the index.
submission_columns = {
    'PassengerId': PassengerId,
    'Survived': test_pred,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv(path_or_buf='submission.csv', index=False)