In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

train_df = pd.read_csv(r'../data/raw/train.csv')

#split train-test set
x_train, x_test, y_train, y_test = train_test_split(train_df['text'].values, train_df['target'].values, test_size=0.2, random_state=42)

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

num_words = 10000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(x_train)


x_train_sequences = tokenizer.texts_to_sequences(x_train)
x_test_sequences = tokenizer.texts_to_sequences(x_test)

# Calculate the maximum sequence length
max_sequence_length = max(len(x) for x in x_train_sequences)

# Pad sequences to the same length
x_train_padded = pad_sequences(x_train_sequences, maxlen=max_sequence_length)
x_test_padded = pad_sequences(x_test_sequences, maxlen=max_sequence_length)

In [4]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense


model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=200, input_length=max_sequence_length))
model.add(LSTM(128, return_sequences=True, dropout=0.2))
model.add(LSTM(512, dropout=0.2))
model.add(Dense(1024, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(2, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(x_train_padded, y_train, validation_data=(x_test_padded, y_test), epochs=10, batch_size=32)

Epoch 1/10




[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 109ms/step - accuracy: 0.6635 - loss: 0.6025 - val_accuracy: 0.7978 - val_loss: 0.4600
Epoch 2/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 113ms/step - accuracy: 0.8644 - loss: 0.3239 - val_accuracy: 0.8004 - val_loss: 0.4874
Epoch 3/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 116ms/step - accuracy: 0.9198 - loss: 0.2040 - val_accuracy: 0.7905 - val_loss: 0.5357
Epoch 4/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 117ms/step - accuracy: 0.9528 - loss: 0.1335 - val_accuracy: 0.7702 - val_loss: 0.6529
Epoch 5/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 116ms/step - accuracy: 0.9689 - loss: 0.0789 - val_accuracy: 0.7656 - val_loss: 0.8653
Epoch 6/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 116ms/step - accuracy: 0.9826 - loss: 0.0580 - val_accuracy: 0.7774 - val_loss: 1.0023
Epoch 7/10
[1m191/19

<keras.src.callbacks.history.History at 0x1e6459fd8e0>

In [19]:
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.data import Dataset

# Define the model preset and other parameters
preset = "distilbert-base-uncased"
sequence_length = 160
num_classes = 2
BATCH_SIZE = 32
EPOCHS = 2

# Assuming x_train, x_test are lists of texts and y_train, y_test are lists of labels
# Assuming these are defined previously in your code

# Load the DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained(preset)

# Define a function to preprocess texts using the tokenizer
def preprocess_texts(texts, tokenizer, max_sequence_length): 
    return tokenizer(texts, padding=True, truncation=True, max_length=max_sequence_length, return_tensors="tf")

# Preprocess the training and validation texts
X_train = preprocess_texts([str(element) for element in x_train], tokenizer, sequence_length)
X_val = preprocess_texts([str(element) for element in x_test], tokenizer, sequence_length)

# Load the pretrained DistilBERT classifier
classifier = TFDistilBertForSequenceClassification.from_pretrained(preset, num_labels=num_classes)

# Display the summary of the classifier model
classifier.summary()

# Compile the model
classifier.compile(
    loss=SparseCategoricalCrossentropy(from_logits=True),
    optimizer=Adam(learning_rate=1e-5),
    metrics=["accuracy"]
)

# Convert input data to the proper format
def convert_to_dataset(inputs, labels):
    return Dataset.from_tensor_slices((dict(inputs), labels)).batch(BATCH_SIZE)

# Create TensorFlow datasets
train_dataset = convert_to_dataset(X_train, y_train)
val_dataset = convert_to_dataset(X_val, y_test)

# Fit the model
history = classifier.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=EPOCHS
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Model: "tf_distil_bert_for_sequence_classification_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMa  multiple                  66362880  
 inLayer)                                                        
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
 dropout_99 (Dropout)        multiple                  0 (unused)
                                                                 
Total params: 66955010 (255.41 MB)
Trainable params: 66955010 (255.41 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


ValueError: Could not interpret optimizer identifier: <keras.src.optimizers.adam.Adam object at 0x000001E6459FC8F0>

In [5]:
#save model
model.save(r'../models/DL_MODEL.keras')

In [6]:
import keras

text_test_df = pd.read_csv(r'../data/raw/test.csv')

text_test_df = text_test_df['text'].values

test_df_sequences = tokenizer.texts_to_sequences(text_test_df)

# Pad sequences to the same length
test_df_padded = pad_sequences(test_df_sequences, maxlen=max_sequence_length)

model = keras.models.load_model(r'../models/DL_MODEL.keras')

pred = model.predict(test_df_padded)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 43ms/step


In [7]:
import numpy as np
pred2=[np.argmax(x) for x in pred]

test = pd.read_csv(r'../data/raw/test.csv')
test["target"]=pred2
test.head()
submission = test[['id','target']]

In [8]:
submission.to_csv(r'../data/final/submission.csv',index=False)

submission = pd.read_csv(r'../data/final/submission.csv')
submission.describe()

Unnamed: 0,id,target
count,3263.0,3263.0
mean,5427.152927,0.399939
std,3146.427221,0.489961
min,0.0,0.0
25%,2683.0,0.0
50%,5500.0,0.0
75%,8176.0,1.0
max,10875.0,1.0


In [9]:
submission["target"].value_counts()

target
0    1958
1    1305
Name: count, dtype: int64

In [10]:
submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [11]:
import kaggle

# Replace 'submission.csv' with the path to your submission file
submission_file = r'../data/final/submission.csv'

# Replace 'Message' with your submission message
submission_message = 'DL model try'

# Call the submit function from kaggle package
#kaggle.api.competition_submit(submission_file, submission_message, competition='nlp-getting-started')