In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

pd.plotting.register_matplotlib_converters()

import matplotlib.pyplot as plt
import seaborn as sns
import optuna

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install pycaret

In [None]:
%matplotlib inline
sns.set(color_codes=True)
pal = sns.color_palette("Set2", 10)
sns.set_palette(pal)

In [None]:
# Both competition files live in the same read-only input folder.
input_dir = '../input/tabular-playground-series-apr-2021'
train = pd.read_csv(f'{input_dir}/train.csv')
test = pd.read_csv(f'{input_dir}/test.csv')

In [None]:
# Summarise column dtypes and non-null counts to see where values are missing.
train.info()


대부분의 데이터는 온전하게 사용할 수 있습니다. 일부 열은 5~7%의 값만 누락되어 있지만, Cabin 열은 약 70%가 누락되어 있으므로 이 열은 삭제합니다.

In [None]:
# Remove the Cabin column from both frames.
train = train.drop(columns=['Cabin'])
test = test.drop(columns=['Cabin'])

Sex - Survived 비교

In [None]:
# Passenger counts per Sex, with one bar per Survived value.
sns.countplot(x='Sex', hue='Survived', data=train)

결측값 처리
1. Age
2. Ticket
3. Fare
4. Embarked

In [None]:
# Kernel density estimate of Age, drawn separately for each Survived value.
sns.displot(train, x="Age", kind="kde", hue="Survived")

In [None]:
# Ten-bin histogram of Age with faint horizontal grid lines.
plt.hist(train['Age'], bins=10)
plt.xlabel('Age')
plt.grid(alpha=0.5, axis='y')

In [None]:
# Survival split (in percent) among passengers whose Age is missing.
missing_age = train[train['Age'].isnull()]
print('Total no. of observations:', missing_age.shape[0])
a = missing_age.groupby('Survived').Survived.count()
share = a.values * 100 / a.values.sum()
b = pd.DataFrame({'Survived': a.index, '%age': share})
print(b.to_string(index=False))

In [None]:
# Survival split (in percent) among passengers whose Age equals the
# integer-truncated mean age, for comparison with the missing-Age group.
mu = int(train['Age'].mean())
at_mean_age = train[train['Age'] == mu]

print('Total no. of observations:', at_mean_age.shape[0])
a = at_mean_age.groupby('Survived').Survived.count()
share = a.values * 100 / a.values.sum()
b = pd.DataFrame({'Survived': a.index, '%age': share})
print(b.to_string(index=False))

Age가 평균값인 승객의 생존 분포가 Age가 누락된 승객의 생존 분포와 비슷하므로, 결측값을 평균으로 대치합니다.

In [None]:
# Impute missing ages in both frames with the training-set mean.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated and leaves the
# frame unchanged when pandas Copy-on-Write is enabled.
age_mean = train['Age'].mean()
train['Age'] = train['Age'].fillna(age_mean)
test['Age'] = test['Age'].fillna(age_mean)

In [None]:
# Fare: kernel density estimate of Fare, drawn separately for each Survived value.
sns.displot(train, x="Fare", kind="kde", hue='Survived')

매우 치우친(불균형한) 분포이며, 높은 요금을 낸 승객이 다른 승객보다 더 많이 살아남은 것을 볼 수 있습니다.

In [None]:
# Mean Fare for each passenger class.
train.groupby('Pclass').Fare.mean()

보시다시피 Fare는 Pclass와 관련이 있으므로, Pclass별 평균 요금으로 결측값을 대치할 수 있습니다.

In [None]:
# Hard-coded fare used to fill a missing Fare, chosen by passenger class
# (presumably the rounded per-class training means from the groupby above).
c1 = 92.65
c2 = 25.76
c3 = 20.62

# Pclass 1 and 2 have their own value; any other class value falls back to c3,
# matching the original if/elif/else chain.
class_fare = {1: c1, 2: c2}

# Fill only the missing fares. The previous version found NaN by testing
# membership in a set of the column (NaN never compares equal, so it was
# "not in" the set), looped row by row, and required a default RangeIndex.
train['Fare'] = train['Fare'].fillna(train['Pclass'].map(class_fare).fillna(c3))
test['Fare'] = test['Fare'].fillna(test['Pclass'].map(class_fare).fillna(c3))

In [None]:
# Ticket: the 20 most frequent raw ticket values and how often each occurs.
train.groupby('Ticket').Ticket.count().sort_values(ascending=False).head(20)

In [None]:
# The 20 largest (Ticket, Survived) groups by row count.
train.groupby(['Ticket', 'Survived']).Survived.count().sort_values(ascending=False).head(20)

해당 오류는 향후 numpy 버전에서 문법 변경의 가능성이 있다는 메시지이므로 당황하지 않고 일단 진행

In [None]:
# Reduce each ticket to one character: every non-letter is first rewritten to
# 'r', then only the first character is kept. Tickets that start with a letter
# keep that letter; tickets that start with anything else become 'r'.
# regex=True is required: since pandas 2.0 the default is a literal match, so
# without it the character class would never match and nothing is replaced.
train['Ticket'] = train['Ticket'].str.replace('[^a-zA-Z]', 'r', regex=True).str[:1]

test['Ticket'] = test['Ticket'].str.replace('[^a-zA-Z]', 'r', regex=True).str[:1]

In [None]:
# Label both missing tickets and the 'r' placeholder as 'Random'.
# fillna is assigned back rather than called with inplace=True on a column
# selection, which is deprecated and ineffective under pandas Copy-on-Write.
# Series.replace matches whole values, so only an exact 'r' is rewritten.
train['Ticket'] = train['Ticket'].fillna('Random').replace('r', 'Random')

test['Ticket'] = test['Ticket'].fillna('Random').replace('r', 'Random')

In [None]:
# Row counts per Ticket category after the prefix reduction, largest first.
train.groupby('Ticket').Ticket.count().sort_values(ascending=False).head(20)

In [None]:
# Passenger counts per Ticket category, with one bar per Survived value.
sns.countplot(x='Ticket', hue='Survived', data=train)

In [None]:
# Embarked: number of non-missing rows for each port value.
train.groupby('Embarked').Embarked.count()

In [None]:
# Passenger counts per Embarked value, with one bar per Survived value.
sns.countplot(x='Embarked', hue='Survived', data=train)

In [None]:
# Row counts for every (Embarked, Survived) combination.
train.groupby(['Embarked','Survived']).Survived.count()

In [None]:
# Survived counts among the rows whose Embarked value is missing.
train[train['Embarked'].isnull()].groupby('Survived').Survived.count()

여기서는 최빈값(mode)인 S로 대치하거나, 생존 비율이 결측 그룹과 가장 비슷한 범주인 Q로 대치하는 두 가지 선택지가 있습니다.

In [None]:
# Impute missing Embarked values with 'S' in both frames.
# (The earlier comment said "Q", but the code has always filled with 'S',
# which the accompanying note describes as the mode.)
# fillna replaces the `i in [np.nan]` test, which only worked when each missing
# value happened to be the very same object as np.nan.
train['Embarked'] = train['Embarked'].fillna('S')

test['Embarked'] = test['Embarked'].fillna('S')

모든 누락 된 값이 처리되었습니다.

In [None]:
# Remaining missing-value count per column in the training frame.
train.isnull().sum()

In [None]:
# Remaining missing-value count per column in the test frame.
test.isnull().sum()

In [None]:
# Split each Name on ', ' and keep a one-element list holding the part before
# the first separator; the following cell unwraps that list.
train_name_parts = train['Name'].str.split(', ')
test_name_parts = test['Name'].str.split(', ')
train['LastName'] = train_name_parts.str[0:1]
test['LastName'] = test_name_parts.str[0:1]

In [None]:
# Unwrap the one-element lists so LastName holds the plain first element.
l = [parts[0] for parts in train['LastName']]
train['LastName'] = l

l = [parts[0] for parts in test['LastName']]
test['LastName'] = l

In [None]:
# Per-surname row count and mean of Survived in train, most frequent first.
df = (
    train.groupby('LastName')['Survived']
    .agg(['count', 'mean'])
    .sort_values(by='count', ascending=False)
    .reset_index()
)
df.head(20)

In [None]:
# Per-surname row count in test, most frequent first.
df2 = (
    test.groupby('LastName')['LastName']
    .agg(['count'])
    .sort_values(by='count', ascending=False)
    .reset_index()
)
df2.head(20)

In [None]:
# d maps each surname to its combined number of occurrences in train and test.
d = dict(zip(df['LastName'], df['count']))

# Add the test counts, starting from zero for surnames absent from train.
for surname, n_test in zip(df2['LastName'], df2['count']):
    d[surname] = d.get(surname, 0) + n_test

In [None]:
# Keep surnames seen at least 5 times across train and test; bucket the rest
# under the single label 'RandomSurname'.
l = [name if d[name] >= 5 else 'RandomSurname' for name in train['LastName']]
train['LastName'] = l

l = [name if d[name] >= 5 else 'RandomSurname' for name in test['LastName']]
test['LastName'] = l

In [None]:
## Encode the categorical variables (sex, port, ticket prefix, surname) as integers.
# Every column is fit on the union of train and test values, as LastName
# already was. Fitting on train alone makes transform() raise ValueError if
# test holds a category that train lacks. When test has no unseen category the
# resulting codes are identical to a train-only fit.
le = LabelEncoder()
for col in ['Sex', 'Embarked', 'Ticket', 'LastName']:
    le.fit(list(train[col]) + list(test[col]))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

In [None]:
# Passenger counts per Parch value, with one bar per Survived value.
sns.countplot(x='Parch',hue='Survived',data=train)

In [None]:
# Passenger counts per SibSp value, with one bar per Survived value.
sns.countplot(x='SibSp',hue='Survived',data=train)

Parch와 SibSp를 합쳐 새로운 특성(feature)을 만들 수 있습니다.

In [None]:
# FamOnBoard: total relatives aboard (Parch + SibSp).
train['FamOnBoard'] = train['Parch'] + train['SibSp']
test['FamOnBoard'] = test['Parch'] + test['SibSp']

# Alone: 1 when FamOnBoard is exactly 0, otherwise 0. The vectorised comparison
# replaces the per-row loops and the unused `l2` lists.
train['Alone'] = (train['FamOnBoard'] == 0).astype('int64')
test['Alone'] = (test['FamOnBoard'] == 0).astype('int64')

In [None]:
# Target vector, then the feature matrices with identifier, name and (for
# train) target columns removed.
y = train['Survived']
X = train.drop(columns=['PassengerId', 'Survived', 'Name'])
X_test = test.drop(columns=['PassengerId', 'Name'])

In [None]:
# Preview the first rows of the training feature matrix.
X.head()

In [None]:
# Passenger counts per FamOnBoard value, with one bar per Survived value.
sns.countplot(x='FamOnBoard',hue='Survived',data=train)

In [None]:
# Passenger counts per Alone flag, with one bar per Survived value.
sns.countplot(x='Alone',hue='Survived',data=train)

In [None]:
# Annotated heatmap of pairwise feature correlations on a fixed [-1, 1] scale.
plt.figure(figsize=(10, 7))
feature_corr = X.corr()
sns.heatmap(data=feature_corr, cmap='coolwarm', vmin=-1, vmax=1, annot=True)

In [None]:
# Hold out 20% of the rows as a dev set, with a fixed seed for repeatability.
# NOTE(review): X_train/X_dev/y_train/y_dev are not referenced anywhere else in
# the visible part of this notebook — confirm whether this split is still needed.
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.2,random_state=42)


Pycaret을 사용한 모델 선택

기준선 비교

In [None]:
# The star import is kept because the following cells call PyCaret helpers
# (models, compare_models, create_model, ...) by their bare names.
from pycaret.classification import *

# Initialise the PyCaret experiment on the engineered training frame.
# session_id fixes PyCaret's random seed so the internal split, model ranking
# and tuning results are repeatable across runs; 42 matches the seed used for
# train_test_split above.
clf1 = setup(
    data=train.drop(['PassengerId', 'Name'], axis=1),
    target='Survived',
    train_size=0.8,
    imputation_type='simple',
    remove_outliers=True,
    remove_multicollinearity=True,
    silent=True,
    session_id=42,
)

In [None]:
# Show the table of estimators available in PyCaret's classification module.
models()

In [None]:
# Cross-validate the available estimators and display them ranked by score.
compare_models()

상위 3 개 모델 생성 및 조정

In [None]:
# Train a LightGBM classifier. No `fold` is passed here, so the experiment's
# default fold count applies, unlike the fold=5 used for the other two models.
lgbm = create_model('lightgbm')

In [None]:
# Hyperparameter-tune the LightGBM model with PyCaret's default search settings.
tuned_lgbm = tune_model(lgbm)

In [None]:
# Train a CatBoost classifier, evaluated with 5-fold cross-validation.
cb = create_model('catboost',fold=5)

In [None]:
# Hyperparameter-tune the CatBoost model using 5-fold cross-validation.
tuned_cb = tune_model(cb,fold=5)

In [None]:
# Train a gradient boosting classifier, evaluated with 5-fold cross-validation.
gbc = create_model('gbc',fold=5)

In [None]:
# Hyperparameter-tune the gradient boosting model using 5-fold cross-validation.
tuned_gbc = tune_model(gbc,fold=5)

Blending

In [None]:
# Soft-voting ensemble of the three tuned models.
blender = blend_models(estimator_list = [tuned_lgbm,tuned_cb,tuned_gbc], method = 'soft')

Stacking

In [None]:
# Stacked ensemble of the three tuned models. The tuned LightGBM estimator is
# passed both as a base learner and as the meta-model.
stacker = stack_models(estimator_list = [tuned_lgbm,tuned_cb,tuned_gbc], meta_model=tuned_lgbm)


예측하기


개별 모델 예측

In [None]:
# Submission from the tuned LightGBM model.
# predict_model runs the preprocessing pipeline learned by setup() before
# scoring; calling estimator.predict(X_test) directly would feed the model
# untransformed columns. In PyCaret 2.x the predicted class is in 'Label'.
# .values drops the index so rows pair with PassengerId by position.
df = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predict_model(tuned_lgbm, data=X_test)['Label'].values,
})
df.to_csv('submit.csv', index=False)

In [None]:
# Submission from the tuned CatBoost model.
# predict_model runs the preprocessing pipeline learned by setup() before
# scoring; calling estimator.predict(X_test) directly would feed the model
# untransformed columns. In PyCaret 2.x the predicted class is in 'Label'.
# .values drops the index so rows pair with PassengerId by position.
df2 = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predict_model(tuned_cb, data=X_test)['Label'].values,
})
df2.to_csv('submit2.csv', index=False)

In [None]:
# Submission from the tuned gradient boosting model.
# predict_model runs the preprocessing pipeline learned by setup() before
# scoring; calling estimator.predict(X_test) directly would feed the model
# untransformed columns. In PyCaret 2.x the predicted class is in 'Label'.
# .values drops the index so rows pair with PassengerId by position.
df3 = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predict_model(tuned_gbc, data=X_test)['Label'].values,
})
df3.to_csv('submit3.csv', index=False)


블렌더 모델 예측

In [None]:
# Submission from the soft-voting blender.
# predict_model runs the preprocessing pipeline learned by setup() before
# scoring; calling estimator.predict(X_test) directly would feed the model
# untransformed columns. In PyCaret 2.x the predicted class is in 'Label'.
# .values drops the index so rows pair with PassengerId by position.
df4 = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predict_model(blender, data=X_test)['Label'].values,
})
df4.to_csv('submit4.csv', index=False)

스태커 모델 예측

In [None]:
# Submission from the stacked ensemble.
# predict_model runs the preprocessing pipeline learned by setup() before
# scoring; calling estimator.predict(X_test) directly would feed the model
# untransformed columns. In PyCaret 2.x the predicted class is in 'Label'.
# .values drops the index so rows pair with PassengerId by position.
df5 = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predict_model(stacker, data=X_test)['Label'].values,
})
df5.to_csv('submit5.csv', index=False)

In [None]:
# Submission from the untuned LightGBM model, as a baseline for the tuned one.
# predict_model runs the preprocessing pipeline learned by setup() before
# scoring; calling estimator.predict(X_test) directly would feed the model
# untransformed columns. In PyCaret 2.x the predicted class is in 'Label'.
# .values drops the index so rows pair with PassengerId by position.
df6 = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predict_model(lgbm, data=X_test)['Label'].values,
})
df6.to_csv('submit6.csv', index=False)