In [1]:
import pandas as pd
train=pd.read_csv('./data/preprocessed/train1006.csv')
test=pd.read_csv('./data/preprocessed/test1006.csv')

In [2]:
X=train.drop(columns=['Survived'])
y=train['Survived']

# modeling

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size = 0.3,
                                                    stratify = y,
                                                    random_state = 2045)

print('Train Data : ', X_train.shape, y_train.shape)
print('Test Data : ', X_test.shape, y_test.shape)

Train Data :  (623, 7) (623,)
Test Data :  (268, 7) (268,)


In [22]:
from sklearn.linear_model import LogisticRegression

LR1 = LogisticRegression(C = 0.3,
                        penalty = 'l2',
                        multi_class = 'multinomial',
                        n_jobs = -1)

LR1.fit(X_train, y_train)

LogisticRegression(C=0.3, multi_class='multinomial', n_jobs=-1)

In [23]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

print(accuracy_score(y_test, LR1.predict(X_test)), '\n')
print(confusion_matrix(y_test, LR1.predict(X_test)), '\n')
print(roc_auc_score(y_test, LR1.predict(X_test)))

0.832089552238806 

[[151  14]
 [ 31  72]] 

0.8070903206825537


In [47]:
LR2 = LogisticRegression(C = 0.3,
                        penalty = 'l2',
                        multi_class = 'multinomial',
                        n_jobs = -1)

LR2.fit(X, y)

LogisticRegression(C=3, multi_class='multinomial', n_jobs=-1)

In [None]:
pred=LR2.predict(test)
sub=pd.read_csv('./submissions/submission.csv')
sub['Survived']=pred
sub.to_csv('./submissions/LR_1.csv',index=False)

# tuning

In [24]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, train_test_split

In [25]:
params={'C': [0.01, 0.03, 0.05, 0.1, 0.15 , 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 1,3,5,10]}

In [26]:
import sklearn
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei

In [27]:
Model_LR=LogisticRegression()

gcv_LR=GridSearchCV(Model_LR,
                    param_grid=params,
                    scoring='roc_auc',
                    cv=KFold(n_splits=5,
                             shuffle=True,
                             random_state=2045),
                   refit=True)

In [29]:
X=train.drop(columns=['Survived'])
y=train['Survived']
gcv_LR.fit(X,y)

GridSearchCV(cv=KFold(n_splits=5, random_state=2045, shuffle=True),
             estimator=LogisticRegression(),
             param_grid={'C': [0.01, 0.03, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5,
                               0.6, 0.7, 1, 3, 5, 10]},
             scoring='roc_auc')

In [31]:
gcv_LR.best_params_

{'C': 3}

In [32]:
gcv_LR.best_score_

0.8558817164771056

In [33]:
pred=gcv_LR.predict(test)
sub=pd.read_csv('./submissions/submission.csv')
sub['Survived']=pred
sub.to_csv('./submissions/GCV_LR_1.csv',index=False)

# VC

In [50]:
import pandas as pd
train=pd.read_csv('./data/preprocessed/train1006.csv')
test=pd.read_csv('./data/preprocessed/test1006.csv')

In [51]:
X=train.drop(columns=['Survived'])
y=train['Survived']

In [52]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

## RF & LR

In [53]:
RF=RandomForestClassifier()
LR = LogisticRegression(C = 3,
                        penalty = 'l2',
                        multi_class = 'multinomial',
                        n_jobs = -1)

VC1=VotingClassifier(estimators=[('rf',RF),('lr',LR)],voting='soft')

In [54]:
VC1.fit(X,y)

VotingClassifier(estimators=[('rf', RandomForestClassifier()),
                             ('lr',
                              LogisticRegression(C=3, multi_class='multinomial',
                                                 n_jobs=-1))],
                 voting='soft')

In [55]:
pred=VC1.predict(test)
sub=pd.read_csv('./submissions/submission.csv')
sub['Survived']=pred
sub.to_csv('./submissions/VC_by_RF_and_LR.csv',index=False)

In [56]:
pred

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

## XGB & LR

In [57]:
XGB=XGBClassifier()
LR = LogisticRegression(C = 3,
                        penalty = 'l2',
                        multi_class = 'multinomial',
                        n_jobs = -1)

VC2=VotingClassifier(estimators=[('xgb',XGB),('lr',LR)],voting='soft')
VC2.fit(X,y)





VotingClassifier(estimators=[('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, gamma=None,
                                            gpu_id=None, importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=None,
                                            max_delta_step=None, max_depth=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            n_estimators=100, n_jobs=None,
                                            num_parallel_tree=None,
                                            random_state=None, reg_alpha=None,
 

In [58]:
pred=VC2.predict(test)
sub=pd.read_csv('./submissions/submission.csv')
sub['Survived']=pred
sub.to_csv('./submissions/VC_by_XGB_and_LR.csv',index=False)

In [59]:
pred

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

## XGB & RF & LR

In [60]:
RF=RandomForestClassifier()
XGB=XGBClassifier()
LR = LogisticRegression(C = 3,
                        penalty = 'l2',
                        multi_class = 'multinomial',
                        n_jobs = -1)

VC3=VotingClassifier(estimators=[('rf',RF),('xgb',XGB),('lr',LR)],voting='soft')
VC3.fit(X,y)





VotingClassifier(estimators=[('rf', RandomForestClassifier()),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, gamma=None,
                                            gpu_id=None, importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=None,
                                            max_delta_step=None, max_depth=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            n_estimators=100, n_jobs=None,
                                            num_parallel_tree=None,
                 

In [61]:
pred=VC3.predict(test)
sub=pd.read_csv('./submissions/submission.csv')
sub['Survived']=pred
sub.to_csv('./submissions/VC_by_XGB&RF&LR.csv',index=False)