# Hi! Congratulations to all teams and the 1st place https://www.kaggle.com/garyongguanjie who has taken rank 1 in the last hour!!

### This is my first time writing a notebook and also my first coding competition. It has been a very fun ride and I'd really appreciate any comments on things I can improve.

### Thanks to Indra Lin for writing the cool notebook on which mine is mostly based, with some slight adjustments: https://www.kaggle.com/indralin/text-processing-augmentation-tpu-baseline-0-4544

### The final score is obtained by ensembling BERT-base, GPT-2, and RoBERTa with some distribution adjustments.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root, file_name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets

import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

print('Using Tensorflow version:', tf.__version__)

In [None]:
def regular_encode(texts, tokenizer, maxlen=512):
    """Encode a batch of texts into a NumPy array of token ids.

    Args:
        texts: The texts handed to ``tokenizer.batch_encode_plus``.
        tokenizer: Any object exposing ``batch_encode_plus`` that returns a
            mapping with an ``'input_ids'`` entry.
        maxlen: Value forwarded as ``max_length``.

    Returns:
        ``np.array`` built from the ``'input_ids'`` entry of the encoding.
    """
    # Attention masks and token type ids are switched off because only the
    # input ids are returned; padding to max_length is requested.
    # NOTE(review): these keyword names look like the older transformers
    # API -- confirm against the installed version before renaming them.
    encode_options = {
        'return_attention_masks': False,
        'return_token_type_ids': False,
        'pad_to_max_length': True,
        'max_length': maxlen,
    }
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)
    input_ids = encoded['input_ids']
    return np.array(input_ids)

In [None]:
# Pick the distribution strategy: a TPU strategy when a TPU can be resolved,
# otherwise TensorFlow's default strategy.
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # The resolver raised ValueError: remember that there is no TPU so the
    # default strategy is selected below.
    tpu = None

if tpu:
    # Connect to the TPU cluster and initialise it before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # NOTE(review): this is the experimental alias; newer TensorFlow releases
    # expose tf.distribute.TPUStrategy -- confirm the TF version before changing.
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# The replica count is used later to scale BATCH_SIZE.
print("REPLICAS: ", strategy.num_replicas_in_sync)

## Based on our experiment number of epochs trained for best validation score are as follows:

### BERT: 2 Epochs
### GPT-2 : 4 Epochs
### ROBERTA: 4 Epochs

In [None]:
# For tf.dataset
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
EPOCHS = 2    
BATCH_SIZE = 16 * strategy.num_replicas_in_sync


## The training data is the combination of the training data provided by Shopee, data from the previous test set, and the scraped data. Thanks to Tony Ng: https://www.kaggle.com/c/student-shopee-code-league-sentiment-analysis/discussion/170953

In [None]:
# Load the competition training data, the scraped reviews and the test set.
train_df = pd.read_csv('../input/sentiment-data/train (add leak).csv')
# Untouched snapshot of the competition training data; used later for the
# punctuation EDA, before scraped rows are appended to train_df.
original_df = train_df.copy()
added_df = pd.read_csv('../input/shopee-reviews/shopee_reviews.csv')
test_df = pd.read_csv('../input/sentiment-data/test.csv')

# review_id is dropped from the training frame only; test_df keeps its columns.
train_df.drop('review_id', axis=1, inplace=True)

print('Train shape:', train_df.shape)
print('Test shape:', test_df.shape)

## Review some train data that matches the test set

In [None]:
# Collect the review texts of both sets as plain lists.
review_train = train_df['review'].tolist()
review_test = test_df['review'].tolist()

# Reviews whose exact text appears in both the train and the test set.
# The intersection is computed once and reused for the count and the sample.
same_data_list = list(set(review_train).intersection(set(review_test)))

print(len(same_data_list))

same_data_list[0:5]

In [None]:
added_df = added_df.rename(columns={'label': 'rating','text':'review'})

In [None]:
added_df.iloc[1431262]

In [None]:
# Drop the trash data
added_df = added_df.drop(1431262)

## Append original train with the scraped training data

In [None]:
# Use both the Shopee data and the scraped data. pd.concat replaces
# DataFrame.append, which is deprecated and removed in pandas 2.x;
# ignore_index=True renumbers the rows 0..n-1 exactly as before.
train_df = pd.concat([train_df, added_df], ignore_index=True)

In [None]:
train_df['rating'] = train_df['rating'].astype(int)

In [None]:
train_df['rating'].value_counts()

## Inspecting Data leak

### The scraped data may match reviews in the test set, so I wanted to verify that most of the scraped data does not match the test-set reviews (I do not want to feel guilty about using too much leaked data).

In [None]:
review_train = added_df['review'].tolist()
review_test = test_df['review'].tolist()
#Inspect data leak (after adding scraped data)
matched_reviews = set(review_train).intersection(set(review_test))
print('Matched reviews from scraped data and the test set:', len(matched_reviews))

### It seems there is some leakage, but most of the matches are generic reviews like "Awesome product", "Fast respond", etc.

### The cleaning below is mostly based on Indra Lin's Notebook with some adjustments

In [None]:
import emoji
def emoji_cleaning(text):
    """Turn emoji into text and drop repeated tokens from a review.

    Args:
        text: The raw review string.

    Returns:
        The review with emoji converted by ``emoji.demojize``, every
        whitespace-separated token kept only at its first occurrence, and
        ``:``, ``_`` and ``-`` replaced by spaces.

    NOTE(review): the de-duplication applies to *every* token, not only to
    emoji names, so ordinary repeated words are removed as well (e.g. the
    second "not" in "not good not bad"). This is kept to stay compatible
    with the existing preprocessing -- confirm whether it is intended.
    """
    # Change emoji to text; the colons around the emoji names become spaces.
    text = emoji.demojize(text).replace(":", " ")

    # Delete repeated tokens while keeping first-occurrence order.
    # dict.fromkeys gives the same result as the previous
    # "append if not already in list" loop without the quadratic scans.
    unique_tokens = dict.fromkeys(text.split())

    text = ' '.join(unique_tokens)
    return text.replace("_", " ").replace("-", " ")

In [None]:
# Collect the row positions of the reviews that contain at least one emoji
# character, separately for the train and the test set.
# The positions come from enumerate() and are later used with .loc (which is
# label based), so this relies on both frames having a default 0..n-1 index.
# NOTE(review): the layout of emoji.UNICODE_EMOJI differs between versions of
# the emoji package -- confirm that a per-character membership test is valid
# for the installed version.
have_emoji_train_idx = []
have_emoji_test_idx = []

for idx, review in enumerate(train_df['review']):
    if any(char in emoji.UNICODE_EMOJI for char in review):
        have_emoji_train_idx.append(idx)
        
for idx, review in enumerate(test_df['review']):
    if any(char in emoji.UNICODE_EMOJI for char in review):
        have_emoji_test_idx.append(idx)

In [None]:
train_emoji_percentage = round(len(have_emoji_train_idx) / train_df.shape[0] * 100, 2)
print(f'Train data has {len(have_emoji_train_idx)} rows that used emoji, that means {train_emoji_percentage} percent of the total')

test_emoji_percentage = round(len(have_emoji_test_idx) / test_df.shape[0] * 100, 2)
print(f'Test data has {len(have_emoji_test_idx)} rows that used emoji, that means {test_emoji_percentage} percent of the total')

In [None]:
train_df_original = train_df.copy()
test_df_original = test_df.copy()

# emoji_cleaning
train_df.loc[have_emoji_train_idx, 'review'] = train_df.loc[have_emoji_train_idx, 'review'].apply(emoji_cleaning)
test_df.loc[have_emoji_test_idx, 'review'] = test_df.loc[have_emoji_test_idx, 'review'].apply(emoji_cleaning)

In [None]:
# before cleaning
train_df_original.loc[have_emoji_train_idx, 'review'].tail()

In [None]:
# after cleaning
train_df.loc[have_emoji_train_idx, 'review'].tail()

### EDA some effects of punctuation

In [None]:
import string
string.punctuation

In [None]:
# Prints the distribution of the train set
original_df['rating'].value_counts(normalize = True)

In [None]:
for punc in string.punctuation:
    print(punc)
    print(original_df[original_df['review'].str.contains(punc,regex=False)].rating.value_counts(normalize = True))
    print('------------------------------------------------------------')

### It seems that reviews containing different punctuation marks have different target distributions, which may help our model predict better!
### Fortunately, pre-trained models have token ids representing most punctuation marks, so this is good for us!

In [None]:
import re
def review_cleaning(text):
    """Normalise a review: lowercase, map emoticons to words, pad punctuation.

    Args:
        text: The review string.

    Returns:
        The lowercased review in which newlines are treated as spaces,
        sad/happy emoticons are replaced by the words ``dislike``/``smile``,
        each of ``. , ! ? ( )`` is surrounded by single spaces, and all
        whitespace runs are collapsed to one space (no leading/trailing
        whitespace).
    """
    # Lowercase and turn newlines into spaces so that words on adjacent
    # lines are not glued together.
    text = text.lower()
    text = re.sub(r'\n', ' ', text)

    # Change emoticons to text. This must happen BEFORE the punctuation
    # padding below: once "(" and ")" are surrounded by spaces, patterns such
    # as ":)" can no longer match. Repeated brackets (":)))") are consumed in
    # one go so no stray ")" is left behind.
    text = re.sub(r':\(+|: \(\(+|:, \(+', ' dislike ', text)
    text = re.sub(r'[:;]\)+|=\){4,}', ' smile ', text)

    # Separate punctuation from the neighbouring words.
    text = re.sub(r'([.,!?()])', r' \1 ', text)

    # We decide to include punctuation in the model, so all remaining
    # characters are kept as they are.

    # Splitting and re-joining collapses every whitespace run to one space.
    return ' '.join(text.split())

In [None]:
train_df['review'] = train_df['review'].apply(review_cleaning)
test_df['review'] = test_df['review'].apply(review_cleaning)

In [None]:
# Row positions of the reviews that contain a word character repeated three
# or more times in a row (e.g. "goooood"), i.e. candidates for
# delete_repeated_char().
# search() scans the whole review; match() only anchors at the start of the
# string, so repeats after the first word were never detected.
_repeated_char_pattern = re.compile(r'(\w)\1{2,}')

repeated_rows_train = [
    idx for idx, review in enumerate(train_df['review'])
    if _repeated_char_pattern.search(review)
]

repeated_rows_test = [
    idx for idx, review in enumerate(test_df['review'])
    if _repeated_char_pattern.search(review)
]

In [None]:
def delete_repeated_char(text):
    """Collapse runs of three or more identical letters into a single letter.

    Digits are deliberately excluded so that numbers such as ``10000`` are
    not mangled into ``10``.

    Args:
        text: The review string.

    Returns:
        The string with every run of 3+ identical letters (or underscores)
        replaced by one occurrence of that character.

    NOTE: words that legitimately contain a doubled letter are shortened too
    when the letter is stretched ("goooood" becomes "god", not "good").
    """
    # [^\W\d] is "a word character that is not a digit".
    return re.sub(r'([^\W\d])\1{2,}', r'\1', text)

In [None]:
train_df.loc[repeated_rows_train, 'review'] = train_df.loc[repeated_rows_train, 'review'].apply(delete_repeated_char)
test_df.loc[repeated_rows_test, 'review'] = test_df.loc[repeated_rows_test, 'review'].apply(delete_repeated_char)

In [None]:
print('Before: ', train_df_original.loc[92129, 'review'])
print('After: ', train_df.loc[92129, 'review'])

print('\nBefore: ', train_df_original.loc[56938, 'review'])
print('After: ', train_df.loc[56938, 'review'])

print('\nBefore: ', train_df_original.loc[72677, 'review'])
print('After: ', train_df.loc[72677, 'review'])

print('\nBefore: ', train_df_original.loc[36558, 'review'])
print('After: ', train_df.loc[36558, 'review'])

In [None]:
# Whole-word abbreviation -> full-word replacements.
# I only make a few examples here, you can add if you're interested :)
# Keys must consist of word characters only (letters, digits, underscore),
# because they are looked up per whole word.
_SHORTENED_WORDS = {
    'apaa': 'apa',

    'bsk': 'besok',
    'brngnya': 'barangnya',
    'brp': 'berapa',
    'bgt': 'banget',
    'bngt': 'banget',
    'gini': 'begini',
    'brg': 'barang',

    'dtg': 'datang',
    'd': 'di',
    'sdh': 'sudah',
    'dri': 'dari',
    'dsni': 'disini',

    'hrs': 'harus',

    'jd': 'jadi',
    'jg': 'juga',
    'jgn': 'jangan',

    'lg': 'lagi',
    'lgi': 'lagi',
    'lbh': 'lebih',
    'lbih': 'lebih',

    'mksh': 'makasih',
    'mna': 'mana',

    'org': 'orang',

    'pjg': 'panjang',

    'ka': 'kakak',
    'kk': 'kakak',
    'klo': 'kalau',
    'kmrn': 'kemarin',
    'kmrin': 'kemarin',
    'knp': 'kenapa',
    'kcil': 'kecil',

    'gmn': 'gimana',
    'gmna': 'gimana',

    'tp': 'tapi',
    'tq': 'thanks',
    'tks': 'thanks',
    'tlg': 'tolong',
    # "gk" used to be rewritten to "gak" and then to "tidak"; map it directly.
    'gk': 'tidak',
    'gak': 'tidak',
    'gpp': 'tidak apa apa',
    'gapapa': 'tidak apa apa',
    'ga': 'tidak',
    'tgl': 'tanggal',
    'tggl': 'tanggal',
    'gamau': 'tidak mau',

    'sy': 'saya',
    'sis': 'sister',
    'sdgkan': 'sedangkan',
    'mdh2n': 'semoga',
    'smoga': 'semoga',
    'smpai': 'sampai',
    'nympe': 'sampai',
    'dah': 'sudah',

    'berkali2': 'repeated',

    'yg': 'yang',
}

# Matches every maximal run of word characters, i.e. exactly the spans that
# a \bword\b pattern would treat as one whole word.
_WORD_PATTERN = re.compile(r'\w+')


def recover_shortened_words(text):
    """Expand common shortened words in a review to their full form.

    Only whole words are replaced, so characters inside longer words are
    never touched. The text is scanned once and every word is looked up in
    ``_SHORTENED_WORDS`` instead of running one regex substitution per
    abbreviation.

    Args:
        text: The (already lowercased) review string.

    Returns:
        The review with every known abbreviation replaced; all other text is
        returned unchanged.
    """
    return _WORD_PATTERN.sub(
        lambda match: _SHORTENED_WORDS.get(match.group(), match.group()),
        text,
    )

In [None]:
%%time
# Expand shortened words in BOTH sets: the model must see test reviews that
# were normalised exactly like the reviews it was trained on.
train_df['review'] = train_df['review'].apply(recover_shortened_words)
test_df['review'] = test_df['review'].apply(recover_shortened_words)

In [None]:
rating_mapper_encode = {1: 0,
                        2: 1,
                        3: 2,
                        4: 3,
                        5: 4}

# convert back to original rating after prediction later(dont forget!!)
rating_mapper_decode = {0: 1,
                        1: 2,
                        2: 3,
                        3: 4,
                        4: 5}

train_df['rating'] = train_df['rating'].map(rating_mapper_encode)

In [None]:
train_df['rating'].value_counts()

### The scraped data mostly contains rating = 5 (4 on the 0-4 scale). Sadly, we did not have much time to choose what to include, so we downsampled rating = 5 to about 500k rows to account for class imbalance and reduce training time.

In [None]:
from sklearn.utils import resample

# Split off the majority class (rating 5, encoded as 4) from the rest.
df_majority = train_df[train_df.rating==4]
df_other = train_df[train_df.rating!=4]

# Keep at most 500k majority rows. Sampling without replacement raises when
# more rows are requested than exist, so the request is capped at the number
# of available rows.
df_majority_downsampled = resample(df_majority,
                                   replace=False,    # sample without replacement
                                   n_samples=min(500000, len(df_majority)),
                                   random_state=123) # reproducible results

train_df = pd.concat([df_majority_downsampled, df_other])


### We tried to include class weights in our model, but they did not seem to help our predictions (~2% worse).

In [None]:
# zero,one,two,three,four = np.bincount(train_df['rating'])
# total = zero + one + two + three + four


# weight_for_0 = (1 / zero)*(total)/5 
# weight_for_1 = (1 / one)*(total)/5
# weight_for_2 = (1 / two)*(total)/5
# weight_for_3 = (1 / three)*(total)/5
# weight_for_4 = (1 / four)*(total)/5

# class_weight = {0: weight_for_0, 1: weight_for_1, 2:weight_for_2,3:weight_for_3,4:weight_for_4}
# class_weight

### Let's drop some duplicates

In [None]:
train_df = train_df.drop_duplicates(subset ="review")

## Training time! (Thanks to Indra Lin again!)

In [None]:
from tensorflow.keras.utils import to_categorical

# convert to one-hot-encoding-labels
train_labels = to_categorical(train_df['rating'], num_classes=5)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train_df['review'],
                                                  train_labels,
                                                  stratify=train_labels,
                                                  test_size=0.1,
                                                  random_state=2020)

X_train.shape, X_val.shape, y_train.shape, y_val.shape

In [None]:
# Name of the pre-trained checkpoint; the tokenizer here and the transformer
# built later inside strategy.scope() are both loaded from it.
MODEL = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# The transformer itself is intentionally NOT loaded here: the copy that used
# to be created at this point was never used (the model is rebuilt inside
# strategy.scope() before training) and only cost memory and start-up time.

### It seems that most of the reviews are about 300 words long. I experimented with MAX_LEN = 512, but the run time was a little too long, so we go with 320 just to be safe.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Distribution of review lengths. Note that .str.len() counts characters,
# whereas MAX_LEN below is passed to the tokenizer as its max_length.
sns.distplot(train_df['review'].str.len())
# Maximum sequence length used when encoding the reviews.
MAX_LEN = 320
plt.show()

In [None]:
X_train_encode = regular_encode(X_train.values, tokenizer, maxlen=MAX_LEN)
X_val_encode = regular_encode(X_val.values, tokenizer, maxlen=MAX_LEN)
X_test_encode = regular_encode(test_df['review'].values, tokenizer, maxlen=MAX_LEN)

In [None]:
# Training pipeline: (encoded review, one-hot label) pairs, repeated without
# end, shuffled with a 1024-element buffer, batched and prefetched.
# Because of .repeat() the dataset never ends, so model.fit() has to be given
# steps_per_epoch.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_train_encode, y_train))
    .repeat()
    .shuffle(1024)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Validation pipeline: batched once, cached after the first pass, prefetched.
valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_val_encode, y_val))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

# Test pipeline: inputs only (no labels), batched in the original row order.
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(X_test_encode)
    .batch(BATCH_SIZE)
)

### Adding a dropout layer have improved our performance ~2%

In [None]:
def build_model(transformer, max_len=512):
    """Build and compile the rating classifier on top of a transformer.

    Args:
        transformer: Callable transformer layer/model; it is called with the
            token-id input and the first element of its output is used.
        max_len: Length of the token-id input sequence.

    Returns:
        A compiled Keras ``Model`` that maps token ids of shape
        ``(max_len,)`` to a 5-way softmax (one unit per rating).
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    sequence_output = tf.keras.layers.Dropout(0.2)(sequence_output)
    # Use the representation at the first sequence position as the
    # sentence embedding.
    cls_token = sequence_output[:, 0, :]
    out = Dense(5, activation='softmax')(cls_token) # 5 ratings to predict

    model = Model(inputs=input_word_ids, outputs=out)
    # learning_rate is the supported argument name; the old alias lr is
    # deprecated and rejected by newer Keras releases.
    model.compile(Adam(learning_rate=2e-5), loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [None]:
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

In [None]:
n_steps = X_train.shape[0] // BATCH_SIZE

train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)

In [None]:
import matplotlib.pyplot as plt

plt.style.use('fivethirtyeight')

# Get training and test loss histories
training_loss = train_history.history['loss']
test_loss = train_history.history['val_loss']

# Create count of the number of epochs
epoch_count = range(1, len(training_loss) + 1)

# Visualize loss history
plt.plot(epoch_count, training_loss, 'r--')
plt.plot(epoch_count, test_loss, 'b-')
plt.legend(['Training Loss', 'Test Loss'])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

In [None]:
pred = model.predict(test_dataset, verbose=1)

In [None]:
# Check if this works

import pickle
with open('pred_bert.pkl','wb') as f:
    pickle.dump(pred, f)

In [None]:
pred_sentiment = np.argmax(pred,axis = 1)

## Model Loading
### We use the code above to create multiple BERT, GPT-2, and ROBERTA models and the final result is a simple average of the models (GPT-2,BERT, and ROBERTA)

# BERT

In [None]:
import pickle

In [None]:
with open('../input/lastday/bert-cased-500k.pkl','rb') as f:
    pred_bert500_cased = pickle.load(f)
    print(pred_bert500_cased.shape)

In [None]:
with open('../input/lastday/bert-uncased-500k.pkl','rb') as f:
    pred_bert500_uncased = pickle.load(f)
    print(pred_bert500_uncased.shape)

In [None]:
with open('../input/lastday/bert-based-500k-320length.pkl','rb') as f:
    pred_bert500_cased_len320 = pickle.load(f)
    print(pred_bert500_cased_len320.shape)

In [None]:
pred_bert = (pred_bert500_cased + pred_bert500_uncased + pred_bert500_cased_len320)/3

# GPT-2

In [None]:
with open('../input/lastday/gpt2_pred_500k_len250.pkl','rb') as f:
    pred_gpt500 = pickle.load(f)
    print(pred_gpt500.shape)

In [None]:
with open('../input/add-scrape/gpt2_pred_4epoch_400.pkl','rb') as f:
    pred_gpt = pickle.load(f)
    print(pred_gpt.shape)

In [None]:
pred_gpt = (pred_gpt500 + pred_gpt)/2

# ROBERTA

In [None]:
with open('../input/add-scrape/ROBERTA_pred_4epoch_subscore66.pkl','rb') as f:
    pred_roberta = pickle.load(f)
    print(pred_roberta.shape)

# Ensembling and Distribution Adjustments

* Our method of distribution adjustment is to keep confident predictions and move low-confidence predictions to another class by adjusting the probability thresholds.

* Note that a more optimal method can be found in these 2 notebooks
    * garyong : (https://www.kaggle.com/garyongguanjie/lb-dist-hacking-final)   
    * Team Servants of the Joy : (https://www.kaggle.com/huikang/week6-process-gpu-tpu-output)
* But this is my first time and I'm happy with my results haha.

* Note that the public distribution is [0.11388, 0.02350, 0.06051, 0.39692, 0.40519]

In [None]:
pred = (pred_bert + pred_gpt + pred_roberta ) / 3

In [None]:
# Step 1 of the distribution adjustment: keep rating 3 (index 2) only where
# it is the top class with probability above 0.438; otherwise rule it out.
final_pred = []
confident_3_index = []
for idx, p in enumerate(pred):
    if np.argmax(p) == 2 and p[2] > 0.438:
        confident_3_index.append(idx)
        final_pred.append(2)  # because it's 0-4
    else:
        # NOTE: assuming pred is a 2-D NumPy array, p is a view of its row,
        # so this zeroes the probability inside pred itself. The following
        # threshold cells read pred again and depend on this, which also
        # means the cells must be run exactly once, in order.
        p[2] = 0
        final_pred.append(np.argmax(p))

# One review_id per prediction, numbered from 1 (no hard-coded row count).
submission = pd.DataFrame({'review_id': list(range(1, len(final_pred) + 1)), 'rating': final_pred})
submission['rating'].value_counts(normalize = True)


In [None]:
# Step 2 of the distribution adjustment: keep rating 4 (index 3) only where
# it is the top class with probability above 0.34; otherwise rule it out.
final_pred = []
confident_4_index = []
for idx, p in enumerate(pred):
    if np.argmax(p) == 3 and p[3] > 0.34:
        # Record in the class-4 list (this used to append to
        # confident_3_index by mistake, leaving confident_4_index empty).
        confident_4_index.append(idx)
        final_pred.append(3)  # because it's 0-4
    else:
        # NOTE: assuming pred is a 2-D NumPy array, p is a view of its row,
        # so this modifies pred in place; the next threshold cell relies on
        # it. Run the threshold cells exactly once, in order.
        p[3] = 0
        final_pred.append(np.argmax(p))

# One review_id per prediction, numbered from 1 (no hard-coded row count).
submission = pd.DataFrame({'review_id': list(range(1, len(final_pred) + 1)), 'rating': final_pred})
submission['rating'].value_counts(normalize = True)
# [0.11388, 0.02350, 0.06051, 0.39692, 0.40519]

In [None]:
# Step 3 of the distribution adjustment: keep rating 5 (index 4) only where
# it is the top class with probability above 0.2. Everything else becomes
# rating 1 (index 0) when its probability exceeds 0.14, else the best
# remaining class.
final_pred = []
for idx, p in enumerate(pred):
    if np.argmax(p) == 4 and p[4] > 0.2:
        final_pred.append(4)  # because it's 0-4
    else:
        # NOTE: assuming pred is a 2-D NumPy array, p is a view of its row,
        # so this modifies pred in place. Run this cell exactly once.
        p[4] = 0
        if p[0] > 0.14:
            final_pred.append(0)
        else:
            final_pred.append(np.argmax(p))

# One review_id per prediction, numbered from 1 (no hard-coded row count).
submission = pd.DataFrame({'review_id': list(range(1, len(final_pred) + 1)), 'rating': final_pred})
submission['rating'].value_counts(normalize = True)
# [0.11388, 0.02350, 0.06051, 0.39692, 0.40519]

In [None]:
# Convert the 0-4 class indices back to the original 1-5 ratings and write
# the submission file.
# NOTE(review): this cell should be run only once -- presumably mapping the
# already decoded ratings again would leave values without a dictionary
# entry unmapped; confirm before re-running.
rating_mapper_decode = {0: 1,
                        1: 2,
                        2: 3,
                        3: 4,
                        4: 5}

submission['rating'] = submission['rating'].map(rating_mapper_decode)

# index=False keeps the DataFrame index out of the CSV.
submission.to_csv('3model_ensemble.csv', index=False)

In [None]:
# Public test set distribution  : [0.11388, 0.02350, 0.06051, 0.39692, 0.40519]
submission.rating.value_counts(normalize = True)

# Thank you all for reading through my notebook till the end. I've gained a lot of experience learning from you all (especially from public notebooks) and I hope that I'll be able to learn more. See you all again soon! :)