In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from pandas import DataFrame
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import roc_auc_score
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt

# 1.Import data

In [None]:
# Read the three competition CSV files
train = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/test.csv')
sample_submission = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/sample_submission.csv')

# Wrap the parsed frames under the df_* names used by the rest of the notebook
df_train, df_test = pd.DataFrame(train), pd.DataFrame(test)

In [None]:
# Show up to 32 columns when a DataFrame is displayed
pd.options.display.max_columns = 32

In [None]:
# Display the sample submission frame read from sample_submission.csv
sample_submission

In [None]:
# Display the training frame read from train.csv
df_train

In [None]:
# Display the test frame read from test.csv
df_test

# 2. Preprocessing

In [None]:
# Stack the training rows (without 'target') on top of the test rows so both
# are preprocessed together; ignore_index renumbers the combined rows.
train_without_target = df_train.drop(columns='target')
df_all = pd.concat([train_without_target, df_test], ignore_index=True)
df_all

In [None]:
# Search for missing data: draw the missingno matrix plot for the combined
# train + test feature frame.
msno.matrix(df_all, figsize=(20, 14), color=(0, 0.3, 0.3))

In [None]:
# String label to categorical values
#
# Every text column of df_all is replaced by integer codes from a LabelEncoder
# fitted on that column (train and test rows together).
# The column is assigned by name rather than through `.iloc[:, i] = ...`:
# positional assignment writes into the existing object-dtype column on recent
# pandas versions, which leaves the encoded integers stored as `object`.
# Both object dtype and pandas' dedicated string dtype are treated as text.
for col in df_all.columns:
    column = df_all[col]
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        lbl = LabelEncoder()
        df_all[col] = lbl.fit_transform(list(column.values))

In [None]:
# Display df_all again, after the label-encoding cell has run
df_all

In [None]:
# Cut the preprocessed frame back into its training and test parts.
# The row range of the training part is taken from df_train's own index.
first_row = df_train.index[0]
stop_row = df_train.index[-1] + 1
train_rows = df_all.iloc[first_row:stop_row]

# Re-attach the target column by index alignment
df_train = train_rows.merge(df_train['target'], left_index=True, right_index=True)

# Everything after the last training index label belongs to the test set
df_test = df_all.iloc[df_train.index[-1] + 1:]

# 3. Histogram

In [None]:

# Histogram of the target column of the training frame
sns.histplot(data=df_train['target'])

# 4. Check the correlation for each item

In [None]:
# Pairwise Pearson correlation between all columns of the training frame
# (Pearson is the default method of DataFrame.corr, spelled out here)
df_train_corr = df_train.corr(method='pearson')
df_train_corr

In [None]:
# Heatmap of the correlation matrix, drawn on the axes created here
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(df_train_corr, vmax=.8, square=True, ax=ax)

In [None]:
# *************************************************************************
# Select the explanatory variables: keep every column whose correlation with
# 'target' is above 0.01 in absolute value, leaving out 'target' itself.
target_row = df_train_corr.loc['target']
exVarArray = [
    name
    for name, corr_value in target_row.items()
    if abs(corr_value) > 0.01 and name != 'target'
]
exVarArray
# *************************************************************************

In [None]:
# Convert the selected feature columns to a float32 numpy array
x = np.array(df_train[exVarArray]).astype('float32')

# Convert the target to a flat (1-D) int32 numpy array
t = np.array(df_train['target']).ravel().astype('int32')

# Check the contents: shapes and the first ten entries of each array
print('x shape:', x.shape)
print(x[:10])
print('t shape:', t.shape)
print(t[:10])

In [None]:
# scaling: min-max scale every column of the training feature matrix
features = preprocessing.minmax_scale(x)

# split data for train and test (70 % fit / 30 % hold-out)
# random_state pins the shuffle, so the hold-out rows and the AUC computed on
# them are the same on every run. It is the same seed value the classifier
# cell uses. `t` is already 1-D, so it is passed without another ravel().
x_train, x_test, t_train, t_test = train_test_split(
    features, t, test_size=0.3, random_state=2525)

print(x_train)
print(x_test)
print(t_train)
print(t_test)

# 5. Learn with scikit-learn

In [None]:
# Random forest hyper-parameters, collected in one place
rf_params = dict(
    n_estimators=192,
    criterion='entropy',  # 'gini' or 'entropy'
    max_depth=19,
    min_samples_split=2,
    max_features='log2',  # 'sqrt' or 'log2'
    n_jobs=-1,
    random_state=2525,
    verbose=1,  # 0 or 1
)
clf = RFC(**rf_params)
clf.fit(x_train, t_train)
# 0.89000(n_estimators=192, criterion='entropy', max_depth=19, min_samples_split=2, max_features='log2', n_jobs=-1, random_state=2525)

In [None]:
# Score the hold-out rows: take the positive-class probability and compute ROC AUC
class_proba = clf.predict_proba(x_test)
predict = class_proba[:, 1]  # column 1 is the positive class prediction
score = roc_auc_score(y_true=t_test, y_score=predict)
print(f'{score:0.5f}')

In [None]:
# Overlay the predicted-probability histograms of the two true classes
plt.figure(figsize=(8, 4))
for true_class, legend_label in ((0, 'neg class'), (1, 'pos class')):
    plt.hist(predict[t_test == true_class], bins=100, alpha=0.75, label=legend_label)
plt.legend()
plt.show()

# 6.Predict

In [None]:
# Select the explanatory variables of the test rows and convert them to a
# float32 numpy array (same dtype as the training matrix `x`)
y = np.array(df_test[exVarArray]).astype('float32')

# scaling
# The test rows must go through the same transformation the model was trained
# on. Calling minmax_scale on the test rows would re-fit the min/max on the
# test data, putting them on a different scale than the training features.
# So the scaler is fitted on the raw training matrix `x`, which reproduces
# minmax_scale(x), and is then only applied to the test rows.
scaler = preprocessing.MinMaxScaler().fit(x)
y = scaler.transform(y)

# predict: probability of the positive class for every test row
result = clf.predict_proba(y)[:, 1]

In [None]:
# Display the positive-class probabilities predicted for the test rows
result

In [None]:
# Copy the predictions into the submission frame only when the hold-out AUC
# is above 0.8; otherwise just report the low score.
if not score > 0.8:
    print('Low score')
else:
    sample_submission['target'] = result
    print(sample_submission)

# 7. Prepare upload data

In [None]:
# Write the submission file only when the target column really holds the
# model predictions. On the 'Low score' branch the frame is left untouched,
# and writing it would silently produce a file of placeholder values.
if np.array_equal(sample_submission['target'].to_numpy(), result):
    sample_submission.to_csv('submission.csv', index=False)
else:
    print('submission.csv not written: target column does not hold the model predictions')