# Adversarial Validation Using USE (Universal Sentence Encoder) and LightGBM

In [None]:
import numpy as np
import pandas as pd
import os
import gc
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tweet-sentiment-extraction/train.csv',
        '/kaggle/input/tweet-sentiment-extraction/test.csv',
    )
)

In [None]:
# Drop the answer column when present; `errors='ignore'` makes this a no-op
# if the column is absent.
train.drop(columns='selected_text', errors='ignore', inplace=True)

# Tag every row with its origin (1 = train, 0 = test) before stacking the
# two frames into one.
for frame, origin_flag in ((train, 1), (test, 0)):
    frame['is_train'] = origin_flag
merged = pd.concat([train, test], sort=False)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the sentiment column, fitting the encoder on the combined
# train + test values so both share one mapping.
le = LabelEncoder()
merged['sentiment'] = le.fit_transform(merged['sentiment'].values)

In [None]:
# Preview the first three rows of the combined frame.
merged.head(3)

In [None]:
# Collect the textIDs of rows whose `text` is not a string (presumably
# missing values read from the CSV) so they can be excluded later.
# The mask is a plain positional NumPy array, so selection does not depend on
# the frame's index labels.
is_text = np.fromiter(
    (isinstance(t, str) for t in merged['text'].values),
    dtype=bool,
    count=len(merged),
)
err_text_ids = merged.loc[~is_text, 'textID'].tolist()

In [None]:
# Remove every row whose textID was flagged in `err_text_ids`, using a single
# membership mask instead of re-filtering the whole frame once per ID.
merged = merged.loc[~merged['textID'].isin(err_text_ids), :]

## USE(Universal Sentence Encoder)
You can download a USE model [here](https://tfhub.dev/google/universal-sentence-encoder/4).

In [None]:
import tensorflow as tf
import tensorflow_hub as hub

# Load the sentence-encoder model from a local Kaggle input directory.
# NOTE(review): the directory name suggests the "large" v4 variant — confirm
# it is the model you intend to use.
use = hub.load('/kaggle/input/universalsentenceencoderlarge4')

In [None]:
# Embed every tweet with the sentence encoder and attach the vectors to
# `merged` as extra columns, joined on `textID`.
USE_BATCH_SIZE = 256  # sentences per forward pass

texts = merged['text'].values
embedded = []
for start in range(0, len(texts), USE_BATCH_SIZE):
    batch = texts[start:start + USE_BATCH_SIZE].tolist()
    # One model call per batch instead of one per sentence. The reshape gives
    # exactly one flat vector per input sentence, matching the per-sentence
    # `.flatten()` this replaces.
    vectors = use(batch)['outputs'].numpy().reshape(len(batch), -1)
    embedded.extend(vectors)
embedded_df = pd.DataFrame(embedded)
embedded_df['textID'] = merged['textID'].values
merged = merged.merge(embedded_df, on='textID', how='left')

In [None]:
# Preview the combined frame after the embedding columns were merged in.
merged.head()

In [None]:
# Features: every column except the row identifier, the raw text and the
# label. Columns are selected by name so the split does not depend on the
# column order of `merged`.
X = merged.drop(columns=['textID', 'text', 'is_train'])
# Label: the origin flag (1 = train row, 0 = test row).
y = merged['is_train']

## LightGBM

In [None]:
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

n_splits = 3
SEED = 3
EARLY_STOPPING = 200
categorical_features = ['sentiment']

# Out-of-fold predictions. With K-fold cross-validation every row belongs to
# exactly one validation fold, so each slot is written once and the result
# must NOT be averaged or rescaled afterwards.
oof = np.zeros(len(X))

# The hyper-parameters are the same for every fold, so build them once.
# Only LightGBM parameters are passed; with no explicit device id, LightGBM
# uses its default GPU device.
params = {
    "objective": "binary",
    "num_leaves": 50,
    "learning_rate": 0.02,
    "bagging_freq": 5,
    "bagging_fraction": 0.5,
    "feature_fraction": 0.8,
    "metric": "binary_logloss",
    "device": "gpu",
}

skf = StratifiedKFold(n_splits=n_splits, random_state=SEED, shuffle=True)
for train_idx, valid_idx in skf.split(X, y):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
    valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=categorical_features)
    watchlist = [train_data, valid_data]

    # NOTE(review): `verbose_eval` and `early_stopping_rounds` are keyword
    # arguments of older LightGBM releases; newer releases expect callbacks
    # instead — confirm against the installed version.
    model_lgb = lgb.train(
        params,
        train_set=train_data,
        num_boost_round=999,
        valid_sets=watchlist,
        verbose_eval=101,
        early_stopping_rounds=EARLY_STOPPING,
    )

    # Score the held-out fold with the best iteration found by early stopping.
    y_pred_valid = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration)
    oof[valid_idx] = y_pred_valid
    gc.collect()

In [None]:
# Put each row's ID, its true origin flag and its out-of-fold prediction side
# by side, ordered from lowest to highest prediction.
data = dict(
    textID=merged['textID'].values,
    is_train=y.values,
    oof=oof,
)
oof_df = pd.DataFrame(data)
oof_df = oof_df.sort_values(by='oof')
oof_df.head()