In [None]:
import tensorflow as tf

# Report which TensorFlow build this kernel is running.
print("TensorFlow version:", tf.__version__)

# Ask TensorFlow for the physical GPU devices it can see and report the result.
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print("No GPU available.")
else:
    print("GPUs available:", gpus)

## Import libraries

In [None]:
import pandas as pd
import numpy as np
import keras
from tqdm import tqdm
from chromadb.utils import embedding_functions
# sent_emb=embedding_functions.SentenceTransformerEmbeddingFunction()
import keras
from keras.models import Sequential
from keras.layers import Dense,Input

import tensorflow as tf
from transformers import TFBertModel, BertTokenizer
# keras.backend

In [None]:
# !pip install sentence_transformers

In [None]:
# Load the pre-trained BERT tokenizer and encoder used to embed the texts.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model_2 = TFBertModel.from_pretrained('bert-base-uncased')

def sent_emb(text, batch_size=64):
    """Embed one or more texts as mean-pooled BERT token states.

    Args:
        text: A single string, or any iterable of strings (list, tuple,
            pandas Series, ...). A single string is treated as a batch of one.
        batch_size: Number of texts pushed through BERT per forward pass.
            Chunking keeps peak memory bounded when embedding a whole dataset.

    Returns:
        A 2-D ``tf.Tensor`` with one row per input text and one column per
        hidden unit of ``model_2``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # The tokenizer only accepts str / list of str, so normalise other iterables.
    texts = [text] if isinstance(text, str) else list(text)
    if not texts:
        return tf.zeros((0, model_2.config.hidden_size))

    pooled = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        inputs = tokenizer(chunk, return_tensors="tf", padding=True, truncation=True)
        outputs = model_2(inputs)
        hidden = outputs.last_hidden_state

        # Average over real tokens only. Including padding positions would make
        # a text's embedding depend on the longest text that shares its batch.
        mask = tf.cast(inputs["attention_mask"], hidden.dtype)[..., tf.newaxis]
        token_sum = tf.reduce_sum(hidden * mask, axis=1)
        token_count = tf.maximum(tf.reduce_sum(mask, axis=1), 1.0)
        pooled.append(token_sum / token_count)

    return tf.concat(pooled, axis=0)

## Data prep

In [None]:
# Read the training CSV into a DataFrame; later cells use its 'text' and 'target' columns.
data=pd.read_csv('nlp-getting-started/train.csv')

In [None]:
data.shape

In [None]:
data.head()

In [None]:
data['target'].sum()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the rows 70/30 into training and validation frames; random_state fixes the shuffle.
# Both results are full DataFrames (text and target together), despite the _X suffix.
train_X , val_X = train_test_split(data,test_size=.3,random_state=42)

In [None]:
train_X.shape , val_X.shape

In [None]:
# sent_emb(['kashdkasjd'])

In [None]:
# Embed every training text with sent_emb; the column is passed as a plain list of strings.
feature_train_X = sent_emb(list(train_X['text']))


In [None]:
# Double brackets keep the training target as a one-column DataFrame rather than a Series.
label_train_X = train_X[['target']]

In [None]:
# Embed every validation text. The Hugging Face tokenizer accepts a str or a
# list of str, not a pandas Series, so convert the column to a list first
# (same as for the training texts).
feature_val_X = sent_emb(list(val_X['text']))


In [None]:
# Double brackets keep the validation target as a one-column DataFrame rather than a Series.
label_val_X = val_X[['target']]

In [None]:
# Convert the embeddings returned by sent_emb into NumPy arrays; these arrays are
# what model.fit and model.predict receive further down.
feature_train_X_= np.array(feature_train_X)
feature_val_X_= np.array(feature_val_X)

In [None]:
feature_train_X_.shape, feature_val_X_.shape

## Model building

In [None]:
sent_emb(["text"])

In [None]:
# Length of a single embedding vector, measured by embedding a dummy string.
# Used below as the input width (and to size the hidden layers) of the classifier.
elm_len = len(sent_emb(["text"])[0])
elm_len

In [None]:
# model = keras.models.Sequential([
#     keras.layers.Input(shape=(elm_len,1)),
#     keras.layers.Dense(500,activation="relu"),
#     keras.layers.Dense(300,activation="relu"),
#     keras.layers.Dense(200,activation="relu"),
#     keras.layers.Dense(1,activation="sigmoid")
# ])

# model.compile(
#     optimizer=keras.optimizers.Adam(learning_rate=1e-3),
#     loss=keras.losses.BinaryCrossentropy(),
#     metrics=[
#         keras.metrics.BinaryAccuracy(),
#         keras.metrics.FalseNegatives(),
#     ],
# )
# model.summary()

In [None]:
# Feed-forward binary classifier on top of the fixed sentence embeddings:
# elm_len inputs -> 2*elm_len -> elm_len -> 200 -> 1 sigmoid probability.
model = Sequential()
model.add(Input(shape=(elm_len,)))
model.add(Dense(elm_len*2, activation='relu'))
model.add(Dense(elm_len, activation='relu'))
# Optional: Add another layer
model.add(Dense(200, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=keras.losses.BinaryCrossentropy(),
    metrics=[
        # With threshold=None F1Score takes the argmax over the output axis; for a
        # single sigmoid unit that marks every sample positive, so the reported F1
        # is meaningless. An explicit threshold binarises the probability instead.
        keras.metrics.F1Score(threshold=0.5),
        keras.metrics.BinaryAccuracy(),
    ],
)
model.summary()

In [None]:
# Training labels as a NumPy array with one column (reshape(-1, 1)), one row per sample.
label_train_X_=np.array(label_train_X).reshape(-1,1)

In [None]:
# Validation labels as a NumPy array with one column (reshape(-1, 1)), one row per sample.
label_val_X_=np.array(label_val_X).reshape(-1, 1)

In [None]:
feature_train_X_.shape, label_train_X_.shape ,feature_val_X_.shape, label_val_X_.shape

## .fit()

The `.fit()` method makes the following assumptions:

* The available RAM of the computer is enough to hold the whole training set.
* Calling `model.fit()` a second time does not reinitialize the already trained weights, so we can make consecutive calls to `fit()` if we want to, as long as we manage them properly.
* The already-processed data is used directly for training, so the entire dataset must fit into memory.

In [None]:
# Train on the precomputed training embeddings and report validation metrics
# after every epoch.
model.fit(
    x=feature_train_X_,
    y=label_train_X_,
    batch_size=32,
    epochs=50,
    verbose="auto",
    # Keras documents validation_data as a tuple (x_val, y_val); a list can be
    # mistaken for a multi-input `x` on some versions.
    validation_data=(feature_val_X_, label_val_X_),
)

## train_on_batch

* Use it when the available memory is not enough to hold the complete dataset.
* `train_on_batch` lets you explicitly update the weights on whatever collection of samples you provide, without being tied to a fixed batch size.
* Another use case is reinforcement learning (RL), or any setup where you need to control exactly when `model.reset_states()` is called.


In [None]:
# Settings for the manual train_on_batch section below.
batch_size=50
# NOTE(review): this count is derived from the full `data` frame, which also contains
# the rows held out in val_X; integer division ignores a final partial batch.
num_batches = len(data) // batch_size
epochs=5
# Display the settings as the cell output.
epochs,num_batches,batch_size

In [None]:
# Manual mini-batch training: embeddings are generated one batch at a time, so the
# full embedding matrix never has to be held in memory.
#
# Only the training split is used here. Iterating over the full `data` frame would
# also train on the rows held out in val_X and invalidate the validation metrics
# computed afterwards.
train_texts = train_X['text']
train_targets = train_X['target']

# Start offset of every batch; the last batch may be smaller than batch_size and is
# still used, since train_on_batch accepts any number of samples.
batch_starts = range(0, len(train_X), batch_size)
train_batches = len(batch_starts)

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    acc_vals=[]
    loss = float('nan')  # reported as-is if there are no batches at all
    for start in tqdm(batch_starts):
        # Get the batch of texts and labels (positional slice: the index is shuffled)
        batch_data = train_texts.iloc[start:start + batch_size]
        batch_labels = train_targets.iloc[start:start + batch_size]

        # Generate embeddings for this batch using the LLM
        feature_X = np.array(sent_emb(list(batch_data)))
        label_X_ = np.array(batch_labels).reshape(-1, 1)

        # Train the model on the batch
        loss, f1, acc = model.train_on_batch(feature_X, label_X_)
        acc_vals.append(acc)

    # Epoch summary: loss of the last batch and accuracy averaged over the batches run.
    mean_acc = sum(acc_vals) / len(acc_vals) if acc_vals else float('nan')
    print(f'Batch {train_batches}/{train_batches}, Loss: {loss}, Accuracy: {mean_acc}')
        

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

In [None]:
# Model outputs (sigmoid probabilities) for the validation embeddings.
pred_val=model.predict(feature_val_X_)

In [None]:
# ROC analysis on the validation split: false/true positive rates per threshold.
fpr, tpr, thresholds = roc_curve(label_val_X_, pred_val)
val_auc = roc_auc_score(label_val_X_, pred_val)

# Draw the curve with the explicit axes interface.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f'AUC = {val_auc:.2f}')
ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc='best')
plt.show()

In [None]:
# Youden's J statistic (TPR - FPR) for every candidate threshold; the threshold
# that maximises it is taken as the operating point.
J = tpr - fpr
best_threshold_index = np.argmax(J)
best_threshold = thresholds[best_threshold_index]

# Each notebook cell renders its own figure, so the ROC curve has to be redrawn
# here; otherwise the marker would sit alone on an empty, unlabeled plot.
plt.plot(fpr, tpr, label='ROC curve')
plt.plot([0, 1], [0, 1], 'k--')  # Random classifier line

# Plot the best threshold
plt.scatter(fpr[best_threshold_index], tpr[best_threshold_index], marker='o', color='red', zorder=3, label=f'Best Threshold: {best_threshold:.2f}')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

In [None]:
# Read the test CSV; its 'text' column is embedded and its 'id' column goes into the submission.
test_data=pd.read_csv('nlp-getting-started/test.csv')

In [None]:
test_data.head()

In [None]:
# Embed the test texts with the same sent_emb function used for training and
# validation. `emb_fun` is not defined anywhere in this notebook, and the
# tokenizer needs a list of strings rather than a pandas Series.
test_features = sent_emb(list(test_data['text']))

In [None]:
# Convert the test embeddings to a NumPy array for model.predict.
feat=np.array(test_features)

In [None]:
feat.shape

In [None]:
# Model outputs (sigmoid probabilities) for the test embeddings; thresholded into labels below.
predi = model.predict(feat)

In [None]:
model.predict(feat)

In [None]:
# pd.read_csv('nlp-getting-started/sample_submission.csv')
# Turn probabilities into 0/1 labels using the threshold selected on the validation
# ROC curve (best_threshold). A hard-coded cut-off goes stale whenever the model is
# retrained, because the optimal threshold changes from run to run.
predi=np.where(predi < best_threshold,0,1)

In [None]:
# Build the submission table: the test ids next to their predicted labels.
predicted_targets = pd.DataFrame(predi, columns=['target'])
submission = pd.concat((test_data[['id']], predicted_targets), axis=1)

In [None]:
submission['target'].sum()

In [None]:
# Write the submission to CSV without the DataFrame index column.
submission.to_csv('first_sub2.csv',index=False)