In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

train_df = pd.read_csv(r'../data/raw/train.csv')

#split train-test set
x_train, x_test, y_train, y_test = train_test_split(train_df['text'].values, train_df['target'].values, test_size=0.2, random_state=42)

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

num_words = 10000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(x_train)


x_train_sequences = tokenizer.texts_to_sequences(x_train)
x_test_sequences = tokenizer.texts_to_sequences(x_test)

# Calculate the maximum sequence length
max_sequence_length = max(len(x) for x in x_train_sequences)

# Pad sequences to the same length
x_train_padded = pad_sequences(x_train_sequences, maxlen=max_sequence_length)
x_test_padded = pad_sequences(x_test_sequences, maxlen=max_sequence_length)

In [3]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense


model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=200, input_length=max_sequence_length))
model.add(LSTM(128, return_sequences=True, dropout=0.2))
model.add(LSTM(512, dropout=0.2))
model.add(Dense(1024, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(2, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(x_train_padded, y_train, validation_data=(x_test_padded, y_test), epochs=10, batch_size=32)



Epoch 1/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 112ms/step - accuracy: 0.6760 - loss: 0.6020 - val_accuracy: 0.8135 - val_loss: 0.4539
Epoch 2/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 117ms/step - accuracy: 0.8697 - loss: 0.3216 - val_accuracy: 0.7965 - val_loss: 0.4810
Epoch 3/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 108ms/step - accuracy: 0.9240 - loss: 0.1999 - val_accuracy: 0.7643 - val_loss: 0.5247
Epoch 4/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 123ms/step - accuracy: 0.9627 - loss: 0.1169 - val_accuracy: 0.7538 - val_loss: 0.7293
Epoch 5/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 117ms/step - accuracy: 0.9694 - loss: 0.0897 - val_accuracy: 0.7610 - val_loss: 0.8394
Epoch 6/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 130ms/step - accuracy: 0.9793 - loss: 0.0519 - val_accuracy: 0.7636 - val_loss: 1.0226
Epoch 7/10

<keras.src.callbacks.history.History at 0x24f2e7b96d0>

In [9]:
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import tensorflow as tf

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_texts(texts, tokenizer, max_sequence_length):
    return tokenizer(texts, padding=True, truncation=True, max_length=max_sequence_length, return_tensors="tf")


# Load the pretrained DistilBERT classifier
classifier = TFDistilBertForSequenceClassification.from_pretrained(preset, num_labels=2)

# Preprocess the texts
inputs = preprocess_texts(train_df['text'].str, tokenizer, max_sequence_length)

# Display the summary of the classifier
classifier.summary()

# Make predictions (if needed)
outputs = classifier(inputs)
predictions = tf.nn.softmax(outputs.logits, axis=-1)
#print(predictions)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [4]:
loss, accuracy = model.evaluate(x_test_padded, y_test)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.7214 - loss: 1.3254
Test Loss: 1.2722
Test Accuracy: 0.7400


In [5]:
#save model
model.save(r'../models/DL_MODEL.keras')

In [6]:
import keras

text_test_df = pd.read_csv(r'../data/raw/test.csv')

text_test_df = text_test_df['text'].values

test_df_sequences = tokenizer.texts_to_sequences(text_test_df)

# Pad sequences to the same length
test_df_padded = pad_sequences(test_df_sequences, maxlen=max_sequence_length)

model = keras.models.load_model(r'../models/DL_MODEL.keras')

pred = model.predict(test_df_padded)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 43ms/step


In [7]:
import numpy as np
pred2=[np.argmax(x) for x in pred]

test = pd.read_csv(r'../data/raw/test.csv')
test["target"]=pred2
test.head()
submission = test[['id','target']]

In [8]:
submission.to_csv(r'../data/final/submission.csv',index=False)

submission = pd.read_csv(r'../data/final/submission.csv')
submission.describe()

Unnamed: 0,id,target
count,3263.0,3263.0
mean,5427.152927,0.399939
std,3146.427221,0.489961
min,0.0,0.0
25%,2683.0,0.0
50%,5500.0,0.0
75%,8176.0,1.0
max,10875.0,1.0


In [9]:
submission["target"].value_counts()

target
0    1958
1    1305
Name: count, dtype: int64

In [10]:
submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [11]:
import kaggle

# Replace 'submission.csv' with the path to your submission file
submission_file = r'../data/final/submission.csv'

# Replace 'Message' with your submission message
submission_message = 'DL model try'

# Call the submit function from kaggle package
#kaggle.api.competition_submit(submission_file, submission_message, competition='nlp-getting-started')