In [41]:
import numpy as np 
import pandas as pd 
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [42]:
train = pd.read_csv("/kaggle/input/dacon-competition/competition_data/train.csv")
test = pd.read_csv("/kaggle/input/dacon-competition/competition_data/test.csv")
submit = pd.read_csv("/kaggle/input/dacon-competition/competition_data/sample_submission.csv")

In [43]:
train.info()

# EDA

## Q

In [44]:
Q_col = list(train.loc[:,train.columns[1:27]].columns)
train_Q = train.loc[:,Q_col]

display(train_Q.head())
display(train_Q.describe())

In [45]:
plt.rcParams['figure.figsize'] = [10,20]

for i in range(len(Q_col)):
    plt.subplot(7,4,i+1)
    plt.subplots_adjust(hspace=0.4) #subplot 높이 비율 조절 
    train_Q.iloc[:,i].value_counts().plot(kind='bar')
    plt.ylim(0,8000)
    plt.title(train_Q.columns[i])
    if i%4 != 0 :
        plt.yticks([]) #각 줄의 첫번째 그래프만 y축 표시

## country

In [46]:
train['country'].value_counts()

범주형 데이터 원핫인코딩 시키기 위해 빈도 100 미만인 국가 other로 대체

In [47]:
country_list = train['country'].value_counts().to_frame()
country_list = country_list.loc[country_list['country'] >= 100]

def func_country(x):
    if x in country_list.index:
        return x
    else:
        return "other"
    
train['country'] = train['country'].apply(lambda x:func_country(x))
test['country'] = test['country'].apply(lambda x:func_country(x))

## TIPI

In [48]:
T_col = list(train.loc[:,train.columns[31:41]].columns)
train_T = train.loc[:,T_col]

display(train_T.head())
display(train_T.describe())

In [49]:
plt.rcParams['figure.figsize'] = [10,17]

for i in range(len(T_col)):
    plt.subplot(4,3,i+1)
    plt.subplots_adjust(hspace=0.4) #subplot 높이 비율 조절 
    train_T.iloc[:,i].value_counts().plot(kind='bar')
    plt.ylim(0,7500)
    plt.title(train_T.columns[i])
    if i%3 != 0 :
        plt.yticks([]) #각 줄의 첫번째 그래프만 y축 표시

## education

In [50]:
train['education'].value_counts()

## urban

In [51]:
train['urban'].value_counts()

## gender

In [52]:
train['gender'].value_counts()

## engnat

In [53]:
train['engnat'].value_counts()

## hand

In [54]:
train['hand'].value_counts()

## religion

In [55]:
train['religion'].value_counts()

## orientation

In [56]:
train['orientation'].value_counts()

## voted

In [57]:
train['voted'].value_counts()

## married

In [58]:
train['married'].value_counts()

## familysize

In [59]:
train['familysize'].value_counts()

In [60]:
train = train.drop(train[train['familysize'] > 50].index)

## ASD

In [61]:
train['ASD'].value_counts()

## Correlation

In [62]:
df = train.loc[:,'education':'nerdiness']
plt.figure(figsize=(12,12))
sns.heatmap(data = df.corr(),annot=True, cmap='Blues')

In [63]:
df1 = train.loc[:,'Q1':'Q26']
df2 = train['nerdiness']
df = pd.concat([df1,df2],axis=1)
df

plt.figure(figsize=(12,12))
sns.heatmap(data = df.corr(),annot=True, cmap='Blues')

In [64]:
df1 = train.loc[:,'country':'TIPI10']
df2 = train['nerdiness']
df = pd.concat([df1,df2],axis=1)
df

plt.figure(figsize=(12,12))
sns.heatmap(data = df.corr(),annot=True, cmap='Blues')

# Data Processing

## missing value

In [65]:
Q_col = list(train.loc[:,train.columns[1:27]].columns)
T_col = list(train.loc[:,train.columns[31:41]].columns)

for i in Q_col:
    train.loc[:,i].fillna(train.loc[:,i].mode()[0],inplace=True)
    #print(train.loc[:,i].head())
    
for i in T_col:
    train.loc[:,i].fillna(train.loc[:,i].mode()[0],inplace=True)
    #print(train.loc[:,i].head())
    
train['country'].fillna('other',inplace=True) #feature engineering
train['education'].fillna(train['education'].mode()[0],inplace=True)
train['gender'].fillna(train['gender'].mode()[0],inplace=True)
train['engnat'].fillna(train['engnat'].mode()[0],inplace=True)
train['hand'].fillna(train['hand'].mode()[0],inplace=True)
train['religion'].fillna(train['religion'].mode()[0],inplace=True)
train['orientation'].fillna(train['orientation'].mode()[0],inplace=True)
train['voted'].fillna(train['voted'].mode()[0],inplace=True)
train['married'].fillna(train['married'].mode()[0],inplace=True)
train['familysize'].fillna(train['familysize'].mode()[0],inplace=True) #feature engineering
train['ASD'].fillna(train['ASD'].mode()[0],inplace=True)


print(tuple(train.isnull().sum()))

## one hot encoding

In [66]:
train = pd.get_dummies(train, columns=['country'])
test = pd.get_dummies(test, columns=['country'])

## train test split

In [67]:
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation,BatchNormalization

# 상관관계 낮은 변수 제거
train = train.drop(columns=['index','age','introelapse','testelapse'])
test = test.drop(columns=['index','age','introelapse','testelapse'])

X = train.drop(columns=['nerdiness'])
Y = train['nerdiness']

train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.2)

## Scaling

In [68]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train_X = pd.DataFrame(scaler.fit_transform(train_X))
test_X = pd.DataFrame(scaler.fit_transform(test_X))
test = pd.DataFrame(scaler.fit_transform(test))

In [69]:
test.fillna(0,inplace=True)
test.isnull().sum()

# Modeling

In [70]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score,roc_auc_score

model = RandomForestClassifier(n_estimators=3000, random_state=42, n_jobs=-1)
model.fit(train_X,train_Y)

In [71]:
pred = model.predict_proba(test_X)[:,1]

print(roc_auc_score(test_Y,pred))

## Hyper parameter tuning

In [73]:
model_md5 = RandomForestClassifier(max_depth=5)
model_md10 = RandomForestClassifier(max_depth=10)
model_md15 = RandomForestClassifier(max_depth=15)
model_md20 = RandomForestClassifier(max_depth=20)
model_md30 = RandomForestClassifier(max_depth=30)

model_md5.fit(train_X,train_Y)
model_md10.fit(train_X,train_Y)
model_md15.fit(train_X,train_Y)
model_md20.fit(train_X,train_Y)
model_md30.fit(train_X,train_Y)

pred_md5 = model_md5.predict_proba(test_X)[:,1]
pred_md10 = model_md10.predict_proba(test_X)[:,1]
pred_md15 = model_md15.predict_proba(test_X)[:,1]
pred_md20 = model_md20.predict_proba(test_X)[:,1]
pred_md30 = model_md30.predict_proba(test_X)[:,1]

print("max depth 5 AUC:",roc_auc_score(test_Y,pred_md5))
print("max depth 10 AUC:",roc_auc_score(test_Y,pred_md10))
print("max depth 15 AUC:",roc_auc_score(test_Y,pred_md15))
print("max depth 20 AUC:",roc_auc_score(test_Y,pred_md20))
print("max depth 30 AUC:",roc_auc_score(test_Y,pred_md30))

In [74]:
model_ne100 = RandomForestClassifier(n_estimators=100)
model_ne500 = RandomForestClassifier(n_estimators=500)
model_ne1000 = RandomForestClassifier(n_estimators=1000)
model_ne2000 = RandomForestClassifier(n_estimators=2000)
model_ne3000 = RandomForestClassifier(n_estimators=3000)

model_ne100.fit(train_X,train_Y)
model_ne500.fit(train_X,train_Y)
model_ne1000.fit(train_X,train_Y)
model_ne2000.fit(train_X,train_Y)
model_ne3000.fit(train_X,train_Y)

pred_ne100 = model_ne100.predict_proba(test_X)[:,1]
pred_ne500 = model_ne500.predict_proba(test_X)[:,1]
pred_ne1000 = model_ne1000.predict_proba(test_X)[:,1]
pred_ne2000 = model_ne2000.predict_proba(test_X)[:,1]
pred_ne3000 = model_ne3000.predict_proba(test_X)[:,1]

print("n estimators 100 AUC:",roc_auc_score(test_Y,pred_ne100))
print("n estimators 500 AUC:",roc_auc_score(test_Y,pred_ne500))
print("n estimators 1000 AUC:",roc_auc_score(test_Y,pred_ne1000))
print("n estimators 2000 AUC:",roc_auc_score(test_Y,pred_ne2000))
print("n estimators 3000 AUC:",roc_auc_score(test_Y,pred_ne3000))

In [75]:
model_mf1 = RandomForestClassifier(max_features=1)
model_mf5 = RandomForestClassifier(max_features=5)
model_mf10 = RandomForestClassifier(max_features=10)
model_mf20 = RandomForestClassifier(max_features=20)
model_mfat = RandomForestClassifier(max_features='auto')

model_mf1.fit(train_X,train_Y)
model_mf5.fit(train_X,train_Y)
model_mf10.fit(train_X,train_Y)
model_mf20.fit(train_X,train_Y)
model_mfat.fit(train_X,train_Y)

pred_mf1 = model_mf1.predict_proba(test_X)[:,1]
pred_mf5 = model_mf5.predict_proba(test_X)[:,1]
pred_mf10 = model_mf10.predict_proba(test_X)[:,1]
pred_mf20 = model_mf20.predict_proba(test_X)[:,1]
pred_mfat = model_mfat.predict_proba(test_X)[:,1]

print("max features 1 AUC:",roc_auc_score(test_Y,pred_mf1))
print("max features 5 AUC:",roc_auc_score(test_Y,pred_mf5))
print("max features 10 AUC:",roc_auc_score(test_Y,pred_mf10))
print("max features 20 AUC:",roc_auc_score(test_Y,pred_mf20))
print("max features at AUC:",roc_auc_score(test_Y,pred_mfat))

In [76]:
model_final = RandomForestClassifier(n_estimators=2000,
                                    max_depth=20,
                                    max_features=5,
                                    random_state=42)

model_final.fit(train_X,train_Y)

In [81]:
pred_final = model_final.predict_proba(test_X)[:,1]

print(roc_auc_score(test_Y,pred_final))

## sumit - 0.88369

In [82]:
pred_ned = model_final.predict_proba(test)[:,1]
submit['nerdiness'] = pred_ned
submit

In [83]:
submit.to_csv('submit_7.csv',index=False)