# Changelog

### Version 9

* Use unprocessed data from an external dataset

### Version 7

* Use unprocessed dataset

### Version 6

* Try modified prediction probabilities based on the public LB distribution

### Version 5

* Try modifying the prediction probabilities based on the public LB distribution

### Version 4

* Use `xlnet-base-cased`
* Change batch_size 320 to 128
* Change epoch 5 to 2
* Change LR 5e-5 to 3e-4

### Version 3

* Change batch_size 128 to 320
* Change epoch 3 to 5
* Change LR 3e-5 to 5e-5

### Version 2

* Use `distilroberta-base`
* Tidy code cell position
* Change batch_size & maxlen parameter

### Version 1

* Initial code

In [None]:
# Install ktrain, which this notebook imports and uses for preprocessing,
# training and prediction.
!pip install ktrain

In [None]:
import os
import random
import gc

import numpy as np
import pandas as pd
import ktrain

In [None]:
# Record the installed package versions in requirements.txt.
!pip freeze > requirements.txt

In [None]:
# Report the versions of the main libraries used in this notebook.
for lib_name, lib in (('Numpy', np), ('Pandas', pd), ('ktrain', ktrain)):
    print(f'{lib_name} version:', lib.__version__)

In [None]:
# Single seed shared by every random number generator seeded below.
SEED = 42

# Seed Python's and NumPy's global generators.
random.seed(SEED)
np.random.seed(SEED)

# Export the hash seed as well. NOTE(review): per the Python docs this variable
# is read at interpreter start-up, so it presumably only affects processes
# launched after this point — confirm if hash stability matters here.
os.environ['PYTHONHASHSEED'] = f'{SEED}'

## Check system specification

In [None]:
# Print CPU details of the machine running this notebook.
!lscpu

In [None]:
# Print memory usage (the -m flag reports values in mebibytes).
!free -m

In [None]:
# Print the GPU status reported by the NVIDIA driver.
!nvidia-smi

# Dataset

In [None]:
# Competition training set; its 'review' and 'rating' columns are used later
# to build the training inputs and labels.
df_train = pd.read_csv('/kaggle/input/student-shopee-code-league-sentiment-analysis/train.csv')
df_train  # shown as the cell output

In [None]:
# External review dataset; its 'text' and 'label' columns extend the training set.
df_train2 = pd.read_csv('/kaggle/input/shopee-reviews/shopee_reviews.csv')

# Keep only rows whose label is not the literal string 'label'
# (presumably header lines repeated inside the CSV — verify against the file).
df_train2 = df_train2.loc[df_train2['label'].ne('label')]
df_train2

In [None]:
# Competition test set; its 'review' column is what the model predicts on.
df_test = pd.read_csv('/kaggle/input/student-shopee-code-league-sentiment-analysis/test.csv')
df_test  # shown as the cell output

In [None]:
# Stack the competition data and the external data into one training set.
# ignore_index=True renumbers the combined rows 0..n-1, so texts and labels
# stay aligned by position.
X_train = pd.concat([df_train['review'], df_train2['text']], ignore_index=True)
y_train = pd.concat([df_train['rating'], df_train2['label']], ignore_index=True)

# Test inputs come straight from the competition test file.
X_test = df_test['review']

# Preprocess dataset

In [None]:
# ktrain Transformer wrapper around 'distilroberta-base' with maxlen=65 and the
# five rating classes '1'..'5' given as strings.
# NOTE(review): the changelog's Version 4 entry mentions `xlnet-base-cased`,
# while this cell uses `distilroberta-base` — confirm which one is intended.
t = ktrain.text.Transformer('distilroberta-base', maxlen=65, classes=[str(r) for r in range(1, 6)])

In [None]:
# Labels must be strings so they match the string class names given to the
# Transformer.
y_train = y_train.apply(str)

# Empty reviews are replaced by a single '.' to work around
# https://github.com/huggingface/transformers/issues/3809
X_train = X_train.replace('', '.')
X_test = X_test.replace('', '.')

In [None]:
# Turn the training texts and string labels into the dataset object that the
# learner consumes.
train = t.preprocess_train(X_train.to_list(), y_train.to_list())

In [None]:
# Run a garbage-collection pass after preprocessing, before the model is built.
gc.collect()

# Train

In [None]:
# Build the classifier model from the Transformer configured above.
model = t.get_classifier()

In [None]:
# Print the model summary.
model.summary()

In [None]:
# Wrap the model and training data in a ktrain learner. No validation data is
# passed, so the whole training set is used for fitting.
# NOTE(review): the changelog's Version 4 entry says batch_size was changed to
# 128, but this cell uses 320 — confirm the intended value.
learner = ktrain.get_learner(model, train_data=train, batch_size=320)

In [None]:
# Google-recommended LR: 2e-5 to 5e-5.
# NOTE(review): the value passed below (3e-4) is outside that range; the second
# argument (5) is presumably the number of epochs — confirm both are intended.
learner.fit_onecycle(3e-4, 5)

In [None]:
# Run a garbage-collection pass after training, before prediction.
gc.collect()

# Test

In [None]:
# Bundle the trained model with the Transformer preprocessor so that raw text
# can be passed directly to predict().
predictor = ktrain.get_predictor(learner.model, preproc=t)

In [None]:
# Predict a class label for every test review (empty reviews already replaced
# by '.'), then cast each predicted label to a 32-bit integer rating.
predicted_labels = predictor.predict(X_test.to_list())
y_test_pred = list(map(np.int32, predicted_labels))

In [None]:
# Build the submission for the predictions made on the '.'-patched test text.
# review_id is generated as 1..N, where N is the number of predictions, rather
# than from a hard-coded row count, so a different test-set size cannot
# silently produce NaN-padded rows.
# NOTE(review): this assumes review ids are the 1-based row positions of
# test.csv — confirm against the file.
review_ids = pd.Series(np.arange(1, len(y_test_pred) + 1, dtype=np.int32), name='review_id')
df_submission = pd.concat([review_ids, pd.Series(y_test_pred, name='rating')], axis=1)
df_submission.to_csv('submission_preprocess_text.csv', index=False)

df_submission

In [None]:
# Re-read the test file and predict on the raw 'review' column, i.e. without
# the empty-string -> '.' replacement that was applied to X_test.
df_test = pd.read_csv('/kaggle/input/student-shopee-code-league-sentiment-analysis/test.csv')
# Predictions are kept exactly as returned by the predictor (no integer cast).
y_test_pred2 = predictor.predict(df_test['review'].to_list())

In [None]:
# Build the submission for the predictions made on the raw test text.
# review_id is generated as 1..N, where N is the number of predictions, rather
# than from a hard-coded row count, so a different test-set size cannot
# silently produce NaN-padded rows.
# NOTE(review): this assumes review ids are the 1-based row positions of
# test.csv — confirm against the file.
review_ids2 = pd.Series(np.arange(1, len(y_test_pred2) + 1, dtype=np.int32), name='review_id')
df_submission2 = pd.concat([review_ids2, pd.Series(y_test_pred2, name='rating')], axis=1)
df_submission2.to_csv('submission_raw_text.csv', index=False)

df_submission2

In [None]:
# Predict per-class probabilities for every test review.
y_test_pred3 = predictor.predict(X_test.to_list(), return_proba=True)

# Disabled experiment: re-weight each class probability by per-class weights
# (per the changelog, based on the public LB distribution). The earlier
# per-row loop read columns 3 and 4 crossed; the form below applies each
# weight to its own column.
# class_weights = np.array([0.11388, 0.02350, 0.06051, 0.39692, 0.40519])
# y_test_pred3 = y_test_pred3 * class_weights

# argmax yields a 0-based column index; +1 turns it into a 1..5 rating.
# NOTE(review): assumes the probability columns follow the class order
# '1'..'5' given to the Transformer — confirm.
y_test_pred3 = np.argmax(y_test_pred3, axis=1) + 1

# Cast the ratings derived from the probabilities. This must iterate
# y_test_pred3; iterating y_test_pred would discard the argmax result and
# reproduce the first submission.
y_test_pred3 = [np.int32(y) for y in y_test_pred3]

In [None]:
# Build the submission for the probability-based predictions. This rebinds
# df_submission, replacing the frame created for the first submission.
# review_id is generated as 1..N, where N is the number of predictions, rather
# than from a hard-coded row count, so a different test-set size cannot
# silently produce NaN-padded rows.
# NOTE(review): this assumes review ids are the 1-based row positions of
# test.csv — confirm against the file.
review_ids3 = pd.Series(np.arange(1, len(y_test_pred3) + 1, dtype=np.int32), name='review_id')
df_submission = pd.concat([review_ids3, pd.Series(y_test_pred3, name='rating')], axis=1)
df_submission.to_csv('submission_preprocess_text_mod_proba.csv', index=False)

df_submission