In [None]:
import cupy as cp
import cudf
import cuml

from cuml.experimental.preprocessing import StandardScaler
from cuml.metrics import roc_auc_score

from cuml.preprocessing.TargetEncoder import TargetEncoder
from cuml import LogisticRegression
from cuml.neighbors import KNeighborsClassifier



In [None]:
# List the contents of the competition input directory.
!ls ../input/tabular-playground-series-mar-2021

In [None]:
# Read the competition train/test tables with cuDF.
train = cudf.read_csv('../input/tabular-playground-series-mar-2021/train.csv')
test = cudf.read_csv('../input/tabular-playground-series-mar-2021/test.csv')
print(train.shape, test.shape)

# Tag every row with its origin so the stacked frame can be split again later.
train['istest'] = 0
test['istest'] = 1

# Stack both tables so feature engineering is applied to train and test alike,
# then assign a cyclic fold id (0..4) based on row position in the stacked frame.
alldf = cudf.concat([train, test], sort=False)
alldf['fold'] = cp.arange(len(alldf)) % 5
print(alldf.shape)

# Show the sample submission file.
sample_submission = cudf.read_csv('../input/tabular-playground-series-mar-2021/sample_submission.csv')
print(sample_submission)

In [None]:
# Quick look at the stacked frame: first rows, then the column index.
print(alldf.head(), alldf.columns, sep='\n')

In [None]:
# Replace each categorical column cat0..cat18 with its int32 factorize() codes.
for idx in range(19):
    name = f'cat{idx}'
    codes = alldf[name].factorize()[0]
    alldf[name] = codes.astype(cp.int32)
print(alldf.head())

In [None]:
# Discretise every continuous feature cont0..cont10 into integer bins
# (value * 50, rounded, stored as int32) and additionally expose the binned
# values as label-encoded categoricals cat19..cat29.
#
# The values are read from alldf itself, not from `train`: `train` holds only
# the training rows, while alldf is train and test stacked together, so an
# index-aligned assignment from `train` cannot give the test rows their own
# feature values.
for col in range(11):
    cont_name = 'cont'+str(col)
    alldf[cont_name] = (50*alldf[cont_name]).round().astype(cp.int32)
    # Label-encode the bins so they can be target-encoded like the other cat columns.
    alldf['cat'+str(col+19)] = alldf[cont_name].factorize()[0].astype(cp.int32)

In [None]:
# Undo the stacking: recover the train and test partitions via the istest flag.
train_rows = alldf.istest == 0
test_rows = alldf.istest == 1
train = alldf.loc[train_rows].copy()
test = alldf.loc[test_rows].copy()

# The stacked frame is not used after this point; drop the reference to it.
del alldf
print(train.shape, test.shape)

In [None]:
# Report the number of distinct values of every column except the first and the last.
for name in train.columns[1:-1]:
    n_unique = train[name].nunique()
    print(name, n_unique)

In [None]:
# Screen each encoded categorical (cat0..cat28) on its own: target-encode it
# inside a 5-fold split and print the mean of the train-side and
# validation-side AUCs collected over all folds.
for feat0 in range(29):
    featname = [f'cat{feat0}']

    score = []
    for fold in range(5):
        train_fold = train.loc[train.fold != fold]
        valid_fold = train.loc[train.fold == fold]

        # The encoder is fitted on the training part and then applied to the held-out part.
        encoder = TargetEncoder(n_folds=5, smooth=0.01, seed=2021)
        train_encoded = encoder.fit_transform(train_fold[featname], train_fold.target)
        valid_encoded = encoder.transform(valid_fold[featname])

        score.extend([
            roc_auc_score(train_fold.target, train_encoded),
            roc_auc_score(valid_fold.target, valid_encoded),
        ])

    print(featname, cp.mean(cp.array(score)))

In [None]:
# Pairwise screen: combine cat16 with every other categorical, target-encode
# the pair jointly inside a 5-fold split, and print the mean of the
# train-side and validation-side AUCs collected over all folds.
for feat0 in range(29):
    fn = f'cat{feat0}'
    if fn == 'cat16':
        # cat16 is the anchor of every pair; pairing it with itself is skipped.
        continue
    featname = ['cat16', fn]

    score = []
    for fold in range(5):
        train_fold = train.loc[train.fold != fold]
        valid_fold = train.loc[train.fold == fold]

        # The encoder is fitted on the training part and then applied to the held-out part.
        encoder = TargetEncoder(n_folds=5, smooth=0.01, seed=2021)
        train_encoded = encoder.fit_transform(train_fold[featname], train_fold.target)
        valid_encoded = encoder.transform(valid_fold[featname])

        score.extend([
            roc_auc_score(train_fold.target, train_encoded),
            roc_auc_score(valid_fold.target, valid_encoded),
        ])

    print(featname, cp.mean(cp.array(score)))

In [None]:
# Logistic-regression stacking feature.
# train['lr0'] receives, for each fold, the predictions of a model trained on
# the other four folds; test['lr0'] is the average of the five fold models.
features_cat = [f'cat{i}' for i in range(19)]
features_cont = [f'cont{i}' for i in range(11)]
features = features_cat + features_cont

train['lr0'] = 0.0
test['lr0'] = 0.0
for fold in range(5):
    holdout = train.fold == fold
    train_fold = train.loc[train.fold != fold].copy()
    valid_fold = train.loc[holdout].copy()
    test_fold = test.copy()

    # Target-encode cat0..cat18; each encoder is fitted on this fold's training part only.
    for col in features_cat:
        TE = TargetEncoder(n_folds=5, smooth=0.01, seed=2021)
        train_fold[col] = TE.fit_transform(train_fold[col], train_fold.target)
        valid_fold[col] = TE.transform(valid_fold[col])
        test_fold[col] = TE.transform(test_fold[col])

    # Standardise all model inputs; the scaler is fitted on the training part.
    scaler = StandardScaler()
    train_fold[features] = scaler.fit_transform(train_fold[features])
    valid_fold[features] = scaler.transform(valid_fold[features])
    test_fold[features] = scaler.transform(test_fold[features])

    model = LogisticRegression(C=1)
    model.fit(train_fold[features], train_fold.target)

    # Column 1 of predict_proba is used as the score (presumably the positive class).
    valid_pred = model.predict_proba(valid_fold[features])[1]
    print(fold, roc_auc_score(valid_fold.target, valid_pred))
    test_pred = model.predict_proba(test_fold[features])[1]

    # .values hands over the raw array, so the held-out rows are filled by position.
    train.loc[holdout, 'lr0'] = valid_pred.values
    test['lr0'] += test_pred

# Turn the accumulated sum of the five fold predictions into their mean.
test['lr0'] /= 5.0

In [None]:
# K-nearest-neighbours stacking feature (200 neighbours).
# train['knn0'] receives, for each fold, the predictions of a model fitted on
# the other four folds; test['knn0'] is the average of the five fold models.
features_cat = [f'cat{i}' for i in range(19)]
features_cont = [f'cont{i}' for i in range(11)]
features = features_cat + features_cont

train['knn0'] = 0.0
test['knn0'] = 0.0
for fold in range(5):
    holdout = train.fold == fold
    train_fold = train.loc[train.fold != fold].copy()
    valid_fold = train.loc[holdout].copy()
    test_fold = test.copy()

    # Target-encode cat0..cat18; each encoder is fitted on this fold's training part only.
    for col in features_cat:
        TE = TargetEncoder(n_folds=5, smooth=0.0005, seed=2021)
        train_fold[col] = TE.fit_transform(train_fold[col], train_fold.target)
        valid_fold[col] = TE.transform(valid_fold[col])
        test_fold[col] = TE.transform(test_fold[col])

    # Standardise all model inputs; the scaler is fitted on the training part.
    scaler = StandardScaler()
    train_fold[features] = scaler.fit_transform(train_fold[features])
    valid_fold[features] = scaler.transform(valid_fold[features])
    test_fold[features] = scaler.transform(test_fold[features])

    model = KNeighborsClassifier(n_neighbors=200)
    model.fit(train_fold[features], train_fold.target)

    # Column 1 of predict_proba is used as the score (presumably the positive class).
    valid_pred = model.predict_proba(valid_fold[features])[1]
    print(fold, roc_auc_score(valid_fold.target, valid_pred))
    test_pred = model.predict_proba(test_fold[features])[1]

    # .values hands over the raw array, so the held-out rows are filled by position.
    train.loc[holdout, 'knn0'] = valid_pred.values
    test['knn0'] += test_pred

# Turn the accumulated sum of the five fold predictions into their mean.
test['knn0'] /= 5.0

In [None]:
# Two multi-column target encodings fitted on the full training set and
# stored as the stacking features te0 / te1.
featname1 = ['cat16', 'cat10', 'cat14', 'cat0', 'cat11', 'cat15']
featname2 = ['cat5', 'cat1', 'cat8', 'cat18']

# One encoder object is reused for both groups, so each group's test rows
# must be transformed before the encoder is refitted on the next group.
TE = TargetEncoder(n_folds=5, smooth=0.0005, seed=2021)

train_encoded1 = TE.fit_transform(train[featname1], train.target)
test_encoded1 = TE.transform(test[featname1])
train['te0'] = train_encoded1
test['te0'] = test_encoded1

train_encoded2 = TE.fit_transform(train[featname2], train.target)
test_encoded2 = TE.transform(test[featname2])
train['te1'] = train_encoded2
test['te1'] = test_encoded2

# AUC of the plain sum of the two train-side encodings.
roc_auc_score(train.target, train_encoded1 + train_encoded2)

In [None]:
# Pairwise correlation of the four stacking features on the training rows.
stack_cols = ['te0', 'te1', 'lr0', 'knn0']
train[stack_cols].corr()

In [None]:
# Pairwise correlation of the four stacking features on the test rows.
stack_cols = ['te0', 'te1', 'lr0', 'knn0']
test[stack_cols].corr()

In [None]:
# Training-side final prediction: the KNN stacking feature alone.
# A rank-sum blend of te0 / te1 / lr0 / knn0 is left disabled below:
#train['ypred'] = (train['te0'].rank() + 1*train['te1'].rank() + train['lr0'].rank() + train['knn0'].rank())
#train['ypred'] = train['ypred'] / train['ypred'].max()
train['ypred'] = train['knn0']

# AUC of the chosen prediction against the training target.
roc_auc_score(train['target'], train['ypred'])

In [None]:
# Submission target: the fold-averaged KNN stacking feature alone.
# A rank-sum blend is left disabled below.
# NOTE(review): lr0 is not ranked in this disabled expression, unlike the other terms - confirm intent before re-enabling.
#test['target'] = (test['te0'].rank() + test['te1'].rank() + test['lr0'] + test['knn0'].rank())
#test['target'] = test['target'] / test['target'].max()
test['target'] = test['knn0']
print(test.head())

In [None]:
# Density histograms of the train and test predictions (100 bins each);
# the test histogram is drawn semi-transparent.
train_scores = train['ypred'].to_pandas()
test_scores = test['target'].to_pandas()
train_scores.hist(bins=100, density=True)
test_scores.hist(bins=100, density=True, alpha=0.5)

In [None]:
# Summary statistics of the training-side prediction.
train.ypred.describe()

In [None]:
# Summary statistics of the predicted test target.
test.target.describe()

In [None]:
# Write the id / target pairs to the submission file, without the index column.
submission = test[['id', 'target']]
submission.to_csv('submission.csv', index=False)