In [None]:
!pip install tensorflow numpy pandas scikit-learn
import time

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score
from sklearn.metrics import classification_report
from tensorflow.keras import Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import plot_model




In [None]:


# Load the train and dev splits; both files are tab-separated.
train_data = pd.read_csv('train_es.tsv', sep='\t')
dev_data = pd.read_csv('dev_es.tsv', sep='\t')

# Each split supplies the model input in its 'text' column and the binary
# target in its 'HS' column.
train_texts, train_labels = train_data['text'].values, train_data['HS'].values
dev_texts, dev_labels = dev_data['text'].values, dev_data['HS'].values

# The vocabulary is fitted on the training texts only, so the dev split does
# not influence which words receive an index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
# One slot more than the number of known words: Keras word indices start at 1
# and index 0 is left for padding.
vocab_size = len(tokenizer.word_index) + 1

# Turn every text into a list of integer word indices.
train_sequences = tokenizer.texts_to_sequences(train_texts)
dev_sequences = tokenizer.texts_to_sequences(dev_texts)

# Pad every sequence at the end up to the longest training sequence.
# NOTE(review): the recorded model summary shows a padded length of 1097,
# which looks long for short texts - consider checking for outlier rows.
max_length = max(len(seq) for seq in train_sequences)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post')
dev_padded = pad_sequences(dev_sequences, maxlen=max_length, padding='post')


In [None]:
# Hyperparameters of the 1-D CNN text classifier, named so they can be tuned
# in one place instead of being buried in the layer list.
EMBEDDING_DIM = 100
CONV_FILTERS = 128
CONV_KERNEL_SIZE = 5
POOL_SIZE = 5
DENSE_UNITS = 64
DROPOUT_RATE = 0.5

# The input shape is declared with an explicit Input layer rather than the
# Embedding `input_length` argument: newer Keras releases deprecate that
# argument, whereas an Input layer is accepted by older and newer releases
# alike and guarantees the model is built before summary() is called.
# Flatten needs the fixed sequence length this provides.
model = Sequential([
    Input(shape=(max_length,)),
    Embedding(vocab_size, EMBEDDING_DIM),
    Conv1D(CONV_FILTERS, CONV_KERNEL_SIZE, activation='relu'),
    MaxPooling1D(POOL_SIZE),
    Flatten(),
    Dense(DENSE_UNITS, activation='relu'),
    Dropout(DROPOUT_RATE),
    # Single sigmoid unit: the output is the probability of the positive class.
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1097, 100)         1930200   
                                                                 
 conv1d (Conv1D)             (None, 1093, 128)         64128     
                                                                 
 max_pooling1d (MaxPooling1  (None, 218, 128)          0         
 D)                                                              
                                                                 
 flatten (Flatten)           (None, 27904)             0         
                                                                 
 dense (Dense)               (None, 64)                1785920   
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                        

In [None]:
# Stop training once the validation loss has failed to improve for two
# consecutive epochs, then roll the model back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Time the fit with perf_counter: it is a monotonic, high-resolution clock
# meant for measuring intervals, so the result cannot be skewed by system
# clock adjustments the way time.time() differences can.
start_time = time.perf_counter()
history = model.fit(
    train_padded, train_labels,
    epochs=10,
    validation_data=(dev_padded, dev_labels),
    callbacks=[early_stopping]
)
end_time = time.perf_counter()
total_time = end_time - start_time  # elapsed training time in seconds

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


In [None]:
# Probability above which a dev example is assigned to the positive class.
DECISION_THRESHOLD = 0.5

# Predict on the dev set. The raw sigmoid outputs keep their own name so the
# probabilities stay available after thresholding.
dev_probabilities = model.predict(dev_padded)
# Vectorised thresholding; ravel() flattens the (n_samples, 1) model output
# into the 1-D label vector the metric functions expect.
dev_predictions = (dev_probabilities > DECISION_THRESHOLD).astype(int).ravel()

# Accuracy, F1 and precision for the positive class (binary classification).
accuracy = accuracy_score(dev_labels, dev_predictions)
f1 = f1_score(dev_labels, dev_predictions, average='binary')
precision = precision_score(dev_labels, dev_predictions, average='binary')

# Save the architecture diagram next to the notebook; a relative path works
# wherever the notebook runs instead of requiring a /content directory.
plot_model(model, to_file="cnn.png", show_shapes=True, show_dtype=False, show_layer_names=False, show_trainable=True, show_layer_activations=True)

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')
print(f'Precision: {precision}')
print(f'Time: {total_time}')

Accuracy: 0.758
F1 Score: 0.7584830339321356
Precision: 0.6810035842293907
Time: 262.25177478790283
