In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
import keras_tuner as kt

# Load the dataset
df = pd.read_csv('final.csv')

# Ensure all entries in 'tweet' column are strings and handle NaNs
df['tweet'] = df['tweet'].astype(str).fillna('')

# Tokenize and pad the sequences
max_features = 2000
max_length = 100

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df['tweet'])
X = tokenizer.texts_to_sequences(df['tweet'])
X = pad_sequences(X, maxlen=max_length)

# Encode the labels
encoder = LabelEncoder()
y = encoder.fit_transform(df['class'])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
def build_model(hp):
    model = Sequential()
    model.add(Embedding(input_dim=max_features, output_dim=hp.Int('embedding_output_dim', min_value=32, max_value=128, step=32), input_length=max_length))
    model.add(LSTM(hp.Int('lstm_units', min_value=32, max_value=128, step=32)))
    model.add(Dropout(hp.Float('dropout', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(3, activation='softmax'))  # Assuming 3 classes: Normal, Offensive, Hate

    model.compile(optimizer=Adam(learning_rate=hp.Choice('learning_rate', values=[1e-3, 1e-4])),
                  loss='sparse_categorical_crossentropy',
                  metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
    return model

# Hyperparameter tuning
tuner = kt.Hyperband(build_model,
                     objective='val_sparse_categorical_accuracy',
                     max_epochs=10,
                     factor=3,
                     directory='my_dir',
                     project_name='text_classification')

stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

tuner.search(X_train, y_train, epochs=10, validation_split=0.2, callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Build the model with the optimal hyperparameters
model = build_model(best_hps)

# Train the model
history = model.fit(X_train, y_train, epochs=10, validation_split=0.2)

# Evaluate the model
y_pred = np.argmax(model.predict(X_test), axis=-1)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'F1 Score: {f1}')


Trial 30 Complete [00h 02m 54s]
val_sparse_categorical_accuracy: 0.8983862996101379

Best val_sparse_categorical_accuracy So Far: 0.9069591760635376
Total elapsed time: 00h 50m 57s
Epoch 1/10
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 36ms/step - loss: 0.5839 - sparse_categorical_accuracy: 0.8028 - val_loss: 0.3322 - val_sparse_categorical_accuracy: 0.8898
Epoch 2/10
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 35ms/step - loss: 0.3083 - sparse_categorical_accuracy: 0.8970 - val_loss: 0.2936 - val_sparse_categorical_accuracy: 0.9002
Epoch 3/10
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 38ms/step - loss: 0.2632 - sparse_categorical_accuracy: 0.9082 - val_loss: 0.2764 - val_sparse_categorical_accuracy: 0.9012
Epoch 4/10
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 42ms/step - loss: 0.2226 - sparse_categorical_accuracy: 0.9240 - val_loss: 0.2932 - val_sparse_categorical_accuracy: 0.9004
Epoch 5/10


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
import keras_tuner as kt

# Load the dataset
df = pd.read_csv('final.csv')

# Ensure all entries in 'tweet' column are strings and handle NaNs
df['tweet'] = df['tweet'].astype(str).fillna('')

# Tokenize and pad the sequences
max_features = 2000
max_length = 100

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df['tweet'])
X = tokenizer.texts_to_sequences(df['tweet'])
X = pad_sequences(X, maxlen=max_length)

# Encode the labels
encoder = LabelEncoder()
y = encoder.fit_transform(df['class'])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
def build_model(hp):
    model = Sequential()
    model.add(Embedding(input_dim=max_features, output_dim=hp.Int('embedding_output_dim', min_value=32, max_value=128, step=32), input_length=max_length))
    model.add(Bidirectional(LSTM(hp.Int('lstm_units', min_value=32, max_value=128, step=32))))
    model.add(Dropout(hp.Float('dropout', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(3, activation='softmax'))  # Assuming 3 classes: Normal, Offensive, Hate

    model.compile(optimizer=Adam(learning_rate=hp.Choice('learning_rate', values=[1e-3, 1e-4])),
                  loss='sparse_categorical_crossentropy',
                  metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
    return model

# Hyperparameter tuning
tuner = kt.Hyperband(build_model,
                     objective='val_sparse_categorical_accuracy',
                     max_epochs=10,
                     factor=3,
                     directory='my_dir',
                     project_name='text_classification')

stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

tuner.search(X_train, y_train, epochs=10, validation_split=0.2, callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Build the model with the optimal hyperparameters
model = build_model(best_hps)

# Train the model
history = model.fit(X_train, y_train, epochs=10, validation_split=0.2)

# Evaluate the model
y_pred = np.argmax(model.predict(X_test), axis=-1)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'F1 Score: {f1}')


Reloading Tuner from my_dir\text_classification\tuner0.json
Epoch 1/10




[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 55ms/step - loss: 0.5554 - sparse_categorical_accuracy: 0.8137 - val_loss: 0.3001 - val_sparse_categorical_accuracy: 0.8984
Epoch 2/10
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 51ms/step - loss: 0.2828 - sparse_categorical_accuracy: 0.9013 - val_loss: 0.2746 - val_sparse_categorical_accuracy: 0.9037
Epoch 3/10
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 63ms/step - loss: 0.2280 - sparse_categorical_accuracy: 0.9212 - val_loss: 0.2872 - val_sparse_categorical_accuracy: 0.9009
Epoch 4/10
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 71ms/step - loss: 0.2042 - sparse_categorical_accuracy: 0.9277 - val_loss: 0.3044 - val_sparse_categorical_accuracy: 0.8933
Epoch 5/10
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 55ms/step - loss: 0.1896 - sparse_categorical_accuracy: 0.9349 - val_loss: 0.3229 - val_sparse_categorical_accuracy: 0.8860
E

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.optimizers import Adam
import keras_tuner as kt

# Load the dataset
df = pd.read_csv('final.csv')

# Ensure all entries in 'tweet' column are strings and handle NaNs
df['tweet'] = df['tweet'].astype(str).fillna('')

# Tokenize and pad the sequences
max_features = 2000
max_length = 100

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df['tweet'])
X = tokenizer.texts_to_sequences(df['tweet'])
X = pad_sequences(X, maxlen=max_length)

# Encode the labels
encoder = LabelEncoder()
y = encoder.fit_transform(df['class'])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
def build_model(hp):
    model = Sequential()
    model.add(Embedding(input_dim=max_features, output_dim=hp.Int('embedding_output_dim', min_value=32, max_value=128, step=32), input_length=max_length))
    model.add(Conv1D(filters=hp.Int('filters', min_value=32, max_value=128, step=32), kernel_size=hp.Choice('kernel_size', values=[3, 5, 7]), activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(hp.Float('dropout', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(3, activation='softmax'))  # Assuming 3 classes: Normal, Offensive, Hate

    model.compile(optimizer=Adam(learning_rate=hp.Choice('learning_rate', values=[1e-3, 1e-4])),
                  loss='sparse_categorical_crossentropy',
                  metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
    return model

# Hyperparameter tuning
tuner = kt.Hyperband(build_model,
                     objective='val_sparse_categorical_accuracy',
                     max_epochs=10,
                     factor=3,
                     directory='my_dir',
                     project_name='text_classification')

stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

tuner.search(X_train, y_train, epochs=10, validation_split=0.2, callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Build the model with the optimal hyperparameters
model = build_model(best_hps)

# Train the model
history = model.fit(X_train, y_train, epochs=10, validation_split=0.2)

# Evaluate the model
y_pred = np.argmax(model.predict(X_test), axis=-1)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'F1 Score: {f1}')


Reloading Tuner from my_dir\text_classification\tuner0.json
Epoch 1/10




[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 0.6507 - sparse_categorical_accuracy: 0.7588 - val_loss: 0.2895 - val_sparse_categorical_accuracy: 0.9072
Epoch 2/10
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.3004 - sparse_categorical_accuracy: 0.8992 - val_loss: 0.2689 - val_sparse_categorical_accuracy: 0.9024
Epoch 3/10
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.2479 - sparse_categorical_accuracy: 0.9157 - val_loss: 0.2703 - val_sparse_categorical_accuracy: 0.9014
Epoch 4/10
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.2139 - sparse_categorical_accuracy: 0.9244 - val_loss: 0.2784 - val_sparse_categorical_accuracy: 0.9019
Epoch 5/10
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.1933 - sparse_categorical_accuracy: 0.9338 - val_loss: 0.2979 - val_sparse_categorical_accuracy: 0.8999
Epoch 6/10
