In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read while streaming each download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<URL-encoded download URL>" entries.
# The download loop below splits on ',' and ':' and URL-decodes the second part.
# Each URL embeds a signed query string (X-Goog-Date=20240318T201711Z,
# X-Goog-Expires=259200, X-Goog-Signature=...).
# NOTE(review): presumably these links stop working after that window — the
# download loop reports HTTP failures as "likely expired"; regenerate if so.
DATA_SOURCE_MAPPING = 'nlp-getting-started:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F17777%2F869809%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240318%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240318T201711Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D1e1dae07e6c356803660d8429337abd77779cce49b3d53f26a507590f068622c12e551da37b3a45895a70eeecf2ab6be6745242210faedad671ffd78e9704e25a211926f78a224c21753bd8b4f17ea46bde7e11f4fde0649c7ce2ef87619fd903d9ff8f2aa1385e97d8b434a661b54addbfdddcffe9f354f9fef15b02c6ae331efadbc39bed93c4a5b715476d9b69eeeab01e39e8de744221675f9eba58f622f9e4181baf410c9a34c771066f5222f51981a12afc729fca9325e72d954af87dc48cfd0f809578959ecb223729d09112251cfa488948e68574053c3415d5bb7e596c8f14cd22b1fe4e5dc3a0c6ebf8d5a3a2fc8a72cb6e5d48437f709430f163e,distil_bert/keras/distil_bert_base_en_uncased/2:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-models-data%2F4689%2F6068%2Fbundle%2Farchive.tar.gz%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240318%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240318T201711Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D6159248b8c57eac44923e500d624f5d5ab8ef4b4c85eb049d1999748b360a46d666872262b26ebb56f58f108f338afe2d3d4484f72cf69455bd5978d58df0ae74f501b63d73ac2465456fcfeea2d289a20073f6ccb9ed599d6207cd0a46de9491295cca406218432cdebc5e76e68d9d4f35f95e3176476ee5e2b884076aee97e338068049b1f0f3041cffb439da384388dd42c5bfc318cfbcc83d3112e441cf197be71d9b4684486cbac3265f35f0c78342e3a25d1e697a7bc613b43654f2a41ed6c02dbaf5b410781df3c6c039928234e2680df2b1f5133a285d0114d52741a8feee1270ccb7ef195f2ee29c0b402e52344f2bd253fdf4c9e1d4e4d1d972b8f'

# Directory the downloaded archives are extracted under.
KAGGLE_INPUT_PATH='/kaggle/input'
# Directory created alongside the input directory and symlinked as ../working.
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced anywhere else in the visible notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every data source listed in DATA_SOURCE_MAPPING and unpack it under
# KAGGLE_INPUT_PATH/<directory>. A failure for one source is reported and the
# loop moves on to the next one.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<URL-encoded URL>"; split on the first ':' only
    # so the remainder of the entry is always kept in one piece.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header may be absent; without a total the bar simply stays empty
            # instead of int(None) raising a TypeError that no handler catches.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered writes out and rewind, so the archive readers below see
            # the complete file through the already-open handle.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: `as tarfile` would rebind the
                # imported tarfile module for the rest of the session.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is handled before the broader OSError to give a more specific hint.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


Downloading nlp-getting-started, 607343 bytes compressed
Downloaded and uncompressed: nlp-getting-started
Downloading distil_bert/keras/distil_bert_base_en_uncased/2, 245289341 bytes compressed
Downloaded and uncompressed: distil_bert/keras/distil_bert_base_en_uncased/2
Data source import complete.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install keras-core --upgrade
!pip install keras-nlp --upgrade

In [None]:
import tensorflow as tf
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
import keras_nlp
import matplotlib.pyplot as plt
import keras_core as keras
import seaborn as sns
from keras.optimizers import Adam


In [None]:
# Read the competition's test and training CSV files into DataFrames.
test_df = pd.read_csv(filepath_or_buffer="/kaggle/input/nlp-getting-started/test.csv")
train_df = pd.read_csv(filepath_or_buffer="/kaggle/input/nlp-getting-started/train.csv")

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Summary statistics of the training frame (pandas `describe`).
train_df.describe()

In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
# Add a "length" column holding len() of each row's "text" value to both frames.
for frame in (train_df, test_df):
    frame["length"] = frame["text"].apply(len)

In [None]:
# Display the "length" column of the training frame.
train_df["length"]

In [None]:
# Histogram of text lengths for the training and test frames, side by side.
# Both panels live in ONE figure: calling plt.figure() before each subplot
# opened two separate figures, each with only half of its 1x2 grid filled,
# and tight_layout() then only affected the second one.
fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(10, 6))

ax_train.hist(train_df["length"], bins=50, color='red', edgecolor='black')
ax_train.set_title('Length of the Training Data')
ax_train.set_xlabel('Length')
ax_train.set_ylabel('Frequency')

ax_test.hist(test_df["length"], bins=50, color='blue', edgecolor='black')
ax_test.set_title('Length of the Testing  Data')
ax_test.set_xlabel('Length')
ax_test.set_ylabel('Frequency')

fig.tight_layout()

plt.show()

In [None]:
# Number of rows in the training frame.
train_df.shape[0]

In [None]:
# Training hyper-parameters used by the cells below.
batch_size = 32
num_training_examples = train_df.shape[0]
train_split = 0.8

# Number of whole batches in the training share of the data. The example count
# is truncated to an int *before* the floor division so `steps` is an int; the
# earlier form floor-divided a float and produced values such as 190.0.
steps = int(num_training_examples * train_split) // batch_size

epochs = 2
auto = tf.data.experimental.AUTOTUNE


In [None]:
# Features are the "text" column, labels the "target" column.
X, y = train_df["text"], train_df["target"]
X_test = test_df["text"]

# Hold out 20% of the labelled rows for validation; random_state fixes the shuffle.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, Y_train, Y_val = split_parts

In [None]:
# Local directory the DistilBERT preset is loaded from.
preset = "/kaggle/input/distil_bert/keras/distil_bert_base_en_uncased/2"

# Preprocessor built from the preset, configured with sequence_length=256.
preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
    preset,
    sequence_length=256,
    name="preprocessor_4_tweets",
)

# Two-class classifier built from the same preset, wired to that preprocessor.
classifier = keras_nlp.models.DistilBertClassifier.from_preset(
    preset,
    preprocessor=preprocessor,
    num_classes=2,
)

classifier.summary()

In [None]:
# The loss is computed from raw logits (from_logits=True); Adam runs at a 1e-5 learning rate.
classifier.compile(
    optimizer=Adam(1e-5),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

# Train on the training split and evaluate on the validation split after each epoch.
fit_options = dict(
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, Y_val),
)
history = classifier.fit(x=X_train, y=Y_train, **fit_options)

In [None]:
# Per-epoch metrics recorded by fit(), one column per metric.
history_data = history.history
history_df = pd.DataFrame(history_data)

# One figure per metric pair: (columns, title, y-axis label, figure size).
plot_specs = (
    (['accuracy', 'val_accuracy'], 'Training and Validation Accuracy', 'Accuracy', (10, 5)),
    (['loss', 'val_loss'], 'Training and Validation Loss', 'Loss', (5, 5)),
)

for metric_columns, plot_title, y_label, figure_size in plot_specs:
    plt.figure(figsize=figure_size)
    sns.lineplot(data=history_df[metric_columns], markers=True)
    plt.title(plot_title)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend(['Training', 'Validation'])
    plt.show()

In [None]:
def _binary_f1(y_true, logits):
    """Return the F1 score of the positive class (label 1).

    Args:
        y_true: true 0/1 labels.
        logits: per-class model outputs, one row per example; the predicted
            class is the argmax along axis 1.

    Returns:
        TP / (TP + (FN + FP) / 2), or 0.0 when that denominator is zero.
    """
    # labels=[0, 1] pins the matrix to 2x2 even if one class never occurs, so
    # the four-way unpack below cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_true, np.argmax(logits, axis=1), labels=[0, 1]).ravel()
    denominator = tp + ((fn + fp) / 2)
    return tp / denominator if denominator else 0.0


# F1 on the training split (the name is kept because the next cell prints it).
y_pred_train = classifier.predict(X_train)
f1_score = _binary_f1(Y_train, y_pred_train)

# F1 on the held-out validation split: the training-split score above is
# measured on data the model was fitted to, so it cannot show generalisation.
y_pred_val = classifier.predict(X_val)
f1_score_val = _binary_f1(Y_val, y_pred_val)
print(f"Validation F1: {f1_score_val}")

In [None]:
# Show the F1 score computed in the previous cell from the training-split predictions.
print(f1_score)

In [None]:
# Load the competition's sample submission file; its "target" column is
# overwritten with model predictions further down.
sample_df=pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

In [None]:
# Preview the first rows of the sample submission frame.
sample_df.head()

In [None]:
# Predict on the test texts and store the argmax class index per row as "target".
# NOTE(review): assumes sample_df rows are in the same order as test_df — confirm.
test_outputs = classifier.predict(X_test)
sample_df["target"] = np.argmax(test_outputs, axis=1)

In [None]:
# Write the frame to sample_df.csv (relative path), omitting the index column.
sample_df.to_csv("sample_df.csv",index=False)

In [None]:
# Summary statistics of the submission frame after predictions were filled in.
sample_df.describe()