In [72]:
import re
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt

In [73]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Data preparation and exploration

In [None]:
filename = "emotion_dataset.txt"

# The dataset is stored as JSON Lines: one JSON object per line.
# - Decode explicitly as UTF-8 instead of relying on the platform default.
# - Iterate over the file handle directly rather than materialising all lines.
# - Skip blank lines (e.g. a trailing newline), which json.loads would reject.
with open(filename, 'r', encoding='utf-8') as file:
    dataset = [json.loads(line) for line in file if line.strip()]

df = pd.DataFrame(dataset)

# Show the first rows of the DataFrame
df.head()

In [None]:
# Report how many null entries each column of df contains
print("\nMissing values per colummn:", df.isna().sum(), sep="\n")

In [None]:
# Number of examples per label, ordered by label value
class_distribution = df['label'].value_counts().sort_index()

# Draw the bar chart and decorate the axes object that pandas returns
ax = class_distribution.plot.bar(color='skyblue')
ax.set_title('Distribution of the Emotion variable')
ax.set_xlabel('Emotion')
ax.set_ylabel('Number of examples')
plt.show()

In [None]:
# Length of every text, stored as a new column on df
df['text_length'] = df['text'].map(len)

print("\nStadistics:", df['text_length'].describe(), sep="\n")

# Histogram of the text lengths (30 bins)
fig, ax = plt.subplots()
ax.hist(df['text_length'], bins=30, color='lightgreen', edgecolor='black')
ax.set_title('Text length distribution')
ax.set_xlabel('Text Length')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
X = df["text"]
y = df["label"]

# Full dataset: 70 % train, then the remaining 30 % is divided evenly into
# test and validation (15 % each).
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=123)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=123)

# Half dataset: the same 70/15/15 protocol applied to the first half of the
# rows only, used to study how the models react to less data.
half_size = len(X) // 2
X_train_half, X_temp_half, y_train_half, y_temp_half = train_test_split(
    X.iloc[:half_size], y.iloc[:half_size], test_size=0.3, random_state=123
)
X_test_half, X_val_half, y_test_half, y_val_half = train_test_split(
    X_temp_half, y_temp_half, test_size=0.5, random_state=123
)

# Sizes of each split; the labels say which experiment a number belongs to,
# so the two reports cannot be confused.
print(f"Training data size (full): {len(X_train)}")
print(f"Validation data size (full): {len(X_val)}")
print(f"Test data size (full): {len(X_test)}")

print(f"Training data size (half): {len(X_train_half)}")
print(f"Validation data size (half): {len(X_val_half)}")
print(f"Test data size (half): {len(X_test_half)}")


In [None]:
# TF-IDF features (at most 5000 terms, English stop words removed).
# NOTE: this cell replaces the raw-text splits with sparse matrices under the
# same names, so it must be run exactly once after the split cell.

# Full dataset: the vocabulary is learned on the training split only and then
# applied unchanged to validation and test.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train = vectorizer.fit_transform(X_train)
X_val = vectorizer.transform(X_val)
X_test = vectorizer.transform(X_test)

# Half dataset: a separate vectorizer, so `vectorizer` keeps the vocabulary
# that matches the full-split matrices instead of being refitted on the half
# training split.
vectorizer_half = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_half = vectorizer_half.fit_transform(X_train_half)
X_val_half = vectorizer_half.transform(X_val_half)
X_test_half = vectorizer_half.transform(X_test_half)

# Report shapes rather than dumping the sparse matrices themselves
print(f"Training data TF-IDF: {X_train.shape}")
print(f"Validation data TF-IDF: {X_val.shape}")
print(f"Test data TF-IDF: {X_test.shape}")

In [None]:
# Show the Python type of a single label before casting
print(f"Tipo de y_train: {type(y_train.iloc[0])}")

# Cast every label vector (full and half experiments) to integers
y_train, y_val, y_test = (
    labels.astype(int) for labels in (y_train, y_val, y_test)
)
y_train_half, y_val_half, y_test_half = (
    labels.astype(int) for labels in (y_train_half, y_val_half, y_test_half)
)


# Basic machine learning

In [81]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Majority-class baseline: predict the most frequent training label for every test row
most_frequent_label = y_train.value_counts().idxmax()
predicted_labels = [most_frequent_label for _ in range(len(y_test))]
MC_accuracy_score = accuracy_score(y_test, predicted_labels)

print("Majority classifier Accuracy:", MC_accuracy_score, sep="\n")
print('Classification Report:', classification_report(y_test, predicted_labels), sep="\n")
print('Confusion Matrix:', confusion_matrix(y_test, predicted_labels), sep="\n")

In [None]:
# Majority-class baseline on the half dataset: the most frequent label of the
# half training split is predicted for every half test row
most_frequent_label_half = y_train_half.value_counts().idxmax()
predicted_labels = [most_frequent_label_half for _ in range(len(y_test_half))]
MC_accuracy_score_half = accuracy_score(y_test_half, predicted_labels)

print("Majority classifier Accuracy:", MC_accuracy_score_half, sep="\n")
print('Classification Report:', classification_report(y_test_half, predicted_labels), sep="\n")
print('Confusion Matrix:', confusion_matrix(y_test_half, predicted_labels), sep="\n")

In [None]:
# K-nearest neighbours (k=5): fit on the training split, score on the test split
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

predicted_knn = knn.predict(X_test)
KNN_accuracy_score = accuracy_score(y_test, predicted_knn)

print("KNN Accuracy:", KNN_accuracy_score, sep="\n")
print('Classification Report:', classification_report(y_test, predicted_knn), sep="\n")
print('Confusion Matrix:', confusion_matrix(y_test, predicted_knn), sep="\n")

In [None]:
# K-nearest neighbours (k=5) on the half dataset
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_half, y_train_half)

predicted_knn = knn.predict(X_test_half)
KNN_accuracy_score_half = accuracy_score(y_test_half, predicted_knn)

print("KNN Accuracy:", KNN_accuracy_score_half, sep="\n")
print('Classification Report:', classification_report(y_test_half, predicted_knn), sep="\n")
print('Confusion Matrix:', confusion_matrix(y_test_half, predicted_knn), sep="\n")

In [None]:
# Support vector classifier with C=100: fit on the training split, score on the test split
svm = SVC(C=100)
svm.fit(X_train, y_train)

predicted_svm = svm.predict(X_test)
SVM_accuracy_score = accuracy_score(y_test, predicted_svm)

print("SVM Accuracy:", SVM_accuracy_score, sep="\n")
print('Classification Report:', classification_report(y_test, predicted_svm), sep="\n")
print('Confusion Matrix:', confusion_matrix(y_test, predicted_svm), sep="\n")


In [None]:
# Support vector classifier with C=100 on the half dataset
svm = SVC(C=100)
svm.fit(X_train_half, y_train_half)

predicted_svm = svm.predict(X_test_half)
SVM_accuracy_score_half = accuracy_score(y_test_half, predicted_svm)

print("SVM Accuracy:", SVM_accuracy_score_half, sep="\n")
print('Classification Report:', classification_report(y_test_half, predicted_svm), sep="\n")
print('Confusion Matrix:', confusion_matrix(y_test_half, predicted_svm), sep="\n")


In [None]:
# Decision Trees
# random_state is fixed (same seed as the data splits) so that the tree, and
# therefore the reported scores, are identical on every run.
dt = DecisionTreeClassifier(max_depth=10, min_samples_split=25, random_state=123)
dt.fit(X_train, y_train)
predicted_dt = dt.predict(X_test)

DT_accuracy_score = accuracy_score(y_test, predicted_dt)
print("Decision Tree Accuracy:")
print(DT_accuracy_score)
print('Classification Report:')
print(classification_report(y_test, predicted_dt))
print('Confusion Matrix:')
print(confusion_matrix(y_test, predicted_dt))

In [None]:
# Decision Trees with half the dataset
# random_state is fixed (same seed as the data splits) so that the tree, and
# therefore the reported scores, are identical on every run.
dt = DecisionTreeClassifier(max_depth=10, min_samples_split=25, random_state=123)
dt.fit(X_train_half, y_train_half)
predicted_dt = dt.predict(X_test_half)

DT_accuracy_score_half = accuracy_score(y_test_half, predicted_dt)
print("Decision Tree Accuracy:")
print(DT_accuracy_score_half)
print('Classification Report:')
print(classification_report(y_test_half, predicted_dt))
print('Confusion Matrix:')
print(confusion_matrix(y_test_half, predicted_dt))

In [None]:
# Random Forest
# A forest is built from random bootstrap samples and random feature subsets;
# seeding it makes the baseline score reproducible and comparable across runs.
rf = RandomForestClassifier(n_estimators=20, random_state=123)
rf.fit(X_train, y_train)
predicted_rf = rf.predict(X_test)

RF_accuracy_score = accuracy_score(y_test, predicted_rf)
print("Random Forest Accuracy:")
print(RF_accuracy_score)
print('Classification Report:')
print(classification_report(y_test, predicted_rf))
print('Confusion Matrix:')
print(confusion_matrix(y_test, predicted_rf))

In [None]:
# Random Forest with half the dataset
# A forest is built from random bootstrap samples and random feature subsets;
# seeding it makes the score reproducible and comparable across runs.
rf = RandomForestClassifier(n_estimators=20, random_state=123)
rf.fit(X_train_half, y_train_half)
predicted_rf = rf.predict(X_test_half)

RF_accuracy_score_half = accuracy_score(y_test_half, predicted_rf)
print("Random Forest Accuracy:")
print(RF_accuracy_score_half)
print('Classification Report:')
print(classification_report(y_test_half, predicted_rf))
print('Confusion Matrix:')
print(confusion_matrix(y_test_half, predicted_rf))

In [None]:
# Bagging
# Bagging draws random subsets of the training data for each base estimator;
# the seed makes the reported scores reproducible.
bc = BaggingClassifier(random_state=123)
bc.fit(X_train, y_train)
predicted_bc = bc.predict(X_test)

Bagging_accuracy_score = accuracy_score(y_test, predicted_bc)
print("Bagging Accuracy:")
print(Bagging_accuracy_score)
print('Classification Report:')
print(classification_report(y_test, predicted_bc))
print('Confusion Matrix:')
print(confusion_matrix(y_test, predicted_bc))

In [None]:
# Bagging with half the dataset
# Bagging draws random subsets of the training data for each base estimator;
# the seed makes the reported scores reproducible.
bc = BaggingClassifier(random_state=123)
bc.fit(X_train_half, y_train_half)
predicted_bc = bc.predict(X_test_half)

Bagging_accuracy_score_half = accuracy_score(y_test_half, predicted_bc)
print("Bagging Accuracy:")
print(Bagging_accuracy_score_half)
print('Classification Report:')
print(classification_report(y_test_half, predicted_bc))
print('Confusion Matrix:')
print(confusion_matrix(y_test_half, predicted_bc))

In [None]:
# Gradient Boosting
# Seeded so that repeated runs of the notebook report the same scores.
gb = GradientBoostingClassifier(random_state=123)
gb.fit(X_train, y_train)
predicted_gb = gb.predict(X_test)

GB_accuracy_score = accuracy_score(y_test, predicted_gb)
print("Gradiant Boosting Accuracy:")
print(GB_accuracy_score)
print('Classification Report:')
print(classification_report(y_test, predicted_gb))
print('Confusion Matrix:')
print(confusion_matrix(y_test, predicted_gb))

In [None]:
# Gradient Boosting with half the dataset
# Seeded so that repeated runs of the notebook report the same scores.
gb = GradientBoostingClassifier(random_state=123)
gb.fit(X_train_half, y_train_half)
predicted_gb = gb.predict(X_test_half)

GB_accuracy_score_half = accuracy_score(y_test_half, predicted_gb)
print("Gradiant Boosting Accuracy:")
print(GB_accuracy_score_half)
print('Classification Report:')
print(classification_report(y_test_half, predicted_gb))
print('Confusion Matrix:')
print(confusion_matrix(y_test_half, predicted_gb))

In [None]:
import matplotlib.pyplot as plt

# Bar labels, listed in the same order as accuracy_scores below.
# The first bar is the majority-class baseline (MC_accuracy_score), so it is
# labelled as such rather than as a Naive Bayes model.
model_names = [
    'Majority Classifier',
    'KNN',
    'SVM',
    'Decision Tree',
    'Random Forest',
    'Bagging',
    'Gradient Boosting'
]

# Test-set accuracy of each model trained on the full dataset
accuracy_scores = [
    MC_accuracy_score,
    KNN_accuracy_score,
    SVM_accuracy_score,
    DT_accuracy_score,
    RF_accuracy_score,
    Bagging_accuracy_score,
    GB_accuracy_score
]


plt.figure(figsize=(10, 8))
plt.bar(model_names, accuracy_scores, color='skyblue')
plt.title('Comparison of Accuracy Scores between Models')
plt.xlabel('Models')
plt.ylabel('Accuracy Score')
plt.ylim(0, 1)  # accuracy is a fraction, so show the whole [0, 1] range
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Bar labels, listed in the same order as accuracy_scores below.
# The first bar is the majority-class baseline (MC_accuracy_score_half), so it
# is labelled as such rather than as a Naive Bayes model.
model_names = [
    'Majority Classifier',
    'KNN',
    'SVM',
    'Decision Tree',
    'Random Forest',
    'Bagging',
    'Gradient Boosting'
]

# Test-set accuracy of each model trained on half of the dataset
accuracy_scores = [
    MC_accuracy_score_half,
    KNN_accuracy_score_half,
    SVM_accuracy_score_half,
    DT_accuracy_score_half,
    RF_accuracy_score_half,
    Bagging_accuracy_score_half,
    GB_accuracy_score_half
]


plt.figure(figsize=(10, 8))
plt.bar(model_names, accuracy_scores, color='skyblue')
plt.title('Comparison of Accuracy Scores between Models with half the Dataset')
plt.xlabel('Models')
plt.ylabel('Accuracy Score')
plt.ylim(0, 1)  # accuracy is a fraction, so show the whole [0, 1] range
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Bar labels, listed in the same order as accuracy_scores below.
# The first bar is the majority-class baseline, so it is labelled as such
# rather than as a Naive Bayes model.
model_names = [
    'Majority Classifier',
    'KNN',
    'SVM',
    'Decision Tree',
    'Random Forest',
    'Bagging',
    'Gradient Boosting'
]

# Accuracy on the full dataset minus accuracy on the half dataset, per model.
# A positive value means the model scored higher with the full dataset.
accuracy_scores = [
    MC_accuracy_score - MC_accuracy_score_half,
    KNN_accuracy_score - KNN_accuracy_score_half,
    SVM_accuracy_score - SVM_accuracy_score_half,
    DT_accuracy_score - DT_accuracy_score_half,
    RF_accuracy_score - RF_accuracy_score_half,
    Bagging_accuracy_score - Bagging_accuracy_score_half,
    GB_accuracy_score - GB_accuracy_score_half
]

print(accuracy_scores)


plt.figure(figsize=(10, 8))
plt.bar(model_names, accuracy_scores, color='skyblue')
plt.title('Comparison of Difference in Accuracy Scores between Models over Datasets')
plt.xlabel('Models')
plt.ylabel('Accuracy Score - Accuracy Score over Half')
plt.ylim(-0.1, 0.2)
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

## Cross validation and Hyperparameter Tuning
We will implement cross-validation and hyperparameter tuning for the selected base models that already show promising results (high accuracy and a consistent confusion matrix), namely:  
KNN,  
Random Forest,  
Bagging  

In [99]:
from sklearn.model_selection import StratifiedKFold
import warnings
# Install a blanket "ignore" filter: no category or module is given, so every
# warning raised from here on is suppressed.
warnings.filterwarnings("ignore")

In [None]:
# Cross-validation implementation and K hyperparameter tuning for KNN

# Work on copies so the original training split is left untouched
X_CV = X_train.copy()
y_CV = y_train.copy()

# Number of cross-validation folds
folds = 3

# Values of K (number of neighbours) to evaluate: 1 through 10
candidate_ks = range(1, 11)

# The splitter does not shuffle, so it yields the same folds for every K and a
# single instance can be shared by all iterations.
skf = StratifiedKFold(n_splits=folds, shuffle=False)

# Mean cross-validated accuracy for each candidate K, in candidate order
overall_scores = []

for neigh in candidate_ks:

    # Accuracy obtained on each fold for this K
    fold_eval = []

    for train_index, val_index in skf.split(X_CV, y_CV):
        # Select the rows of this fold (sparse matrix rows / Series positions)
        X_CV_train, X_CV_val = X_CV[train_index], X_CV[val_index]
        y_CV_train, y_CV_val = y_CV.iloc[train_index], y_CV.iloc[val_index]

        # Train a KNN classifier with the current K and score the held-out fold
        knn = KNeighborsClassifier(n_neighbors=neigh)
        knn.fit(X_CV_train, y_CV_train)
        y_CV_pred = knn.predict(X_CV_val)

        accuracy = accuracy_score(y_CV_val, y_CV_pred)
        fold_eval.append(accuracy)

    # Average over the folds for this K
    mean_accuracy = np.mean(fold_eval)
    print(f'n_neighbours_{neigh}:', mean_accuracy)

    overall_scores.append(mean_accuracy)

# Diagnostic
print('Overall scores:', overall_scores)

# Pick the K with the highest mean accuracy by looking it up in the candidate
# list, so the result stays correct if the candidate range is changed.
best_k = candidate_ks[int(np.argmax(overall_scores))]
print("Best K for KNN:", best_k)

In [None]:
import matplotlib.pyplot as plt

# Values of K on the x axis: the integers 1 through 10
Ks = np.arange(1, 11)

# Mean cross-validated accuracy for each K, as collected in overall_scores
accuracy_scores = overall_scores

fig, ax = plt.subplots(figsize=(10, 8))
ax.bar(Ks, accuracy_scores, color='skyblue')
ax.set_title('Comparison of Accuracy Scores between differnet K')
ax.set_xlabel('Number of K')
ax.set_ylabel('Accuracy Score')
ax.set_ylim(0, 1)
plt.xticks(rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

In [None]:
# Random Forest hyperparameter tuning: exhaustive grid search with 5-fold
# cross-validation through a GridSearchCV object

# Seeded base estimator so that the search result is reproducible
rf = RandomForestClassifier(random_state=123)

# Hyperparameters and the values to try for each of them
param_grid = {
    'n_estimators': [10, 50, 100, 150],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 20, 50],  # Adjust the range as needed
    'min_samples_split': [10, 30],
    'min_samples_leaf': [1, 10],
    'max_features': ['sqrt', 'log2', None]
}

# Score every combination by accuracy, using all available cores
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Run the search on the training data
grid_search.fit(X_train, y_train)

# Best hyperparameter combination found
best_params = grid_search.best_params_

# GridSearchCV refits the best combination on the whole training set by
# default (refit=True), so reuse that model instead of training another one.
best_rf = grid_search.best_estimator_

# Predictions on the validation split (the test split is kept for the final
# evaluation)
predicted_rf = best_rf.predict(X_val)

# Validation accuracy of the tuned Random Forest
accuracy_rf = accuracy_score(y_val, predicted_rf)
print("Tuned Random Forest Accuracy:", accuracy_rf)
print("Best Hyperparameters:", best_params)

In [None]:
# Score the tuned forest on the test split and report it with its hyperparameters
predicted_test = best_rf.predict(X_test)
accuracy_test = accuracy_score(y_test, predicted_test)

print(f"Accuracy en conjunto de prueba: {accuracy_test!s}")
print(f"Best Hyperparameters: {best_params!s}")

In [None]:
# Random Search for Random Forest with cross validation
from sklearn.model_selection import RandomizedSearchCV

# Candidate values the randomized search samples from
param_dist = {
    'n_estimators': [10, 50, 100],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [None, 20, 50]
}

# Both the forest and the parameter sampler are seeded so that the selected
# configuration and its score are reproducible.
random_search_rf = RandomizedSearchCV(
    RandomForestClassifier(random_state=123),
    param_dist,
    n_iter=10,
    cv=3,
    random_state=123,
)

# Tune (and refit the winner) on the training split. Fitting on the validation
# split would train the final model on far less data than every other model in
# the notebook and make the comparison with them unfair.
random_search_rf.fit(X_train, y_train)
best_rf = random_search_rf.best_estimator_

# Test-set accuracy of the best configuration found
predicted_best_rf = best_rf.predict(X_test)
accuracy_best_rf = accuracy_score(y_test, predicted_best_rf)
print("Best Random Forest Accuracy:", accuracy_best_rf)
print(random_search_rf.best_params_)

In [None]:
# Compare the three Random Forest variants.
# accuracy_rf is the value computed by the grid-search cell; it is used as is
# instead of being overwritten with a hard-coded number, so the chart always
# reflects the current run.
# NOTE(review): accuracy_rf is measured on the validation split while the other
# two scores are measured on the test split — confirm this mix is intended.
results = [RF_accuracy_score, accuracy_rf, accuracy_best_rf]
classifiers = ['Base', 'Grid Search', 'Random Search']

# Sort results and classifiers together in ascending order of accuracy
sorted_results, sorted_classifiers = zip(*sorted(zip(results, classifiers)))

# Horizontal bar plot, lowest accuracy at the bottom
plt.barh(sorted_classifiers, sorted_results, color='skyblue')
plt.xlabel('Accuracy')
plt.title('Random Forest Accuracy Comparison (Low to High)')
plt.xlim(0, 1.0)  # Set the x-axis limits from 0 to 1 for accuracy values

# Annotate each bar with its accuracy, placed just right of the bar end
for i, result in enumerate(sorted_results):
    plt.text(result + 0.01, i, f'{result:.2f}', va='center', fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Hyperparameter tuning for Bagging using Grid Search with cross validation
#
# max_samples and max_features are given as floats on purpose: scikit-learn
# reads a float as a fraction of the data and an int as an absolute count, so
# the integers 1 and 2 would train each base estimator on one or two samples
# (or on a single feature) instead of on the whole set.
#
# warm_start is not searched: GridSearchCV fits a fresh clone of the estimator
# for every candidate, so it cannot change the outcome and would only double
# the number of fits.
param_grid = {
              'n_estimators': [10, 20, 30],
              'max_samples': [0.5, 0.8, 1.0],
              'max_features': [0.5, 1.0],
              'bootstrap': [True, False],
              'bootstrap_features': [True, False]
}

# Seeded so that the search result is reproducible
bc = BaggingClassifier(random_state=123)

grid_search = GridSearchCV(bc, param_grid, cv=3)
grid_search.fit(X_train, y_train)

# Best configuration, already refitted on the whole training split
best_bc = grid_search.best_estimator_

# Accuracy on the validation split
predicted_best_bc = best_bc.predict(X_val)
accuracy_best_bc = accuracy_score(y_val, predicted_best_bc)
print("Best Bagging Accuracy:", accuracy_best_bc)
print("Best Hyperparameters:", grid_search.best_params_)

In [None]:
# Score the tuned bagging model on the test split
predicted_test = best_bc.predict(X_test)
accuracy_test = accuracy_score(y_test, predicted_test)

print(f"Accuracy en conjunto de prueba: {accuracy_test!s}")

In [None]:
# Accuracy of the default bagging model and of the grid-searched one
classifiers = ['Base', 'Grid Search']
results = [Bagging_accuracy_score, accuracy_test]

# Order the (accuracy, name) pairs from lowest to highest accuracy
ranked_pairs = sorted(zip(results, classifiers))
sorted_results, sorted_classifiers = zip(*ranked_pairs)

# Horizontal bars, one per classifier
plt.barh(sorted_classifiers, sorted_results, color='skyblue')
plt.title('Bagging Accuracy Comparison (Low to High)')
plt.xlabel('Accuracy')
plt.xlim(0, 1.0)  # accuracy axis spans 0 to 1

# Write each accuracy value next to the end of its bar
for i, result in enumerate(sorted_results):
    plt.text(result + 0.01, i, f'{result:.2f}', va='center', fontsize=12)

plt.tight_layout()
plt.show()

# Advanced Machine learning

## Mixing models

In [4]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
from datasets import load_dataset

# Load the "unsplit" configuration of the dair-ai/emotion dataset
ds = load_dataset(path="dair-ai/emotion", name="unsplit")

In [None]:
# Convert the 'train' split to a pandas DataFrame.
# Dataset.to_pandas() converts the underlying table in one step, whereas
# pd.DataFrame(ds['train']) iterates over the dataset row by row in Python.
df = ds['train'].to_pandas()
df.shape

In [None]:
# Number of examples per label, ordered by label value
class_distribution = df['label'].value_counts().sort_index()

# Draw the bar chart and decorate the axes object that pandas returns
ax = class_distribution.plot.bar(color='skyblue')
ax.set_title('Distribution of the Emotion variable')
ax.set_xlabel('Emotion')
ax.set_ylabel('Number of examples')
plt.show()

In [None]:
# Downsample labels 0 and 1 to 60000 rows each; labels 2-5 are kept in full
df_label_0 = df[df['label'] == 0]
df_label_1 = df[df['label'] == 1]
df_other_labels = df[df['label'].isin([2, 3, 4, 5])]

df_label_0_downsampled = df_label_0.sample(n=60000, random_state=42)
df_label_1_downsampled = df_label_1.sample(n=60000, random_state=42)

df_balanced = pd.concat([df_label_0_downsampled, df_label_1_downsampled, df_other_labels], ignore_index=True)

print(df_balanced['label'].value_counts())

# Build the features from the balanced frame; reading from `df` here would
# silently discard the downsampling done above.
texts = df_balanced["text"]
labels = df_balanced["label"]

# Split the raw texts first (stratified 80/20) so that the TF-IDF vocabulary
# and document frequencies are learned from the training texts only and no
# information from the test texts leaks into the features.
texts_train, texts_test, y_train, y_test = train_test_split(
    texts, labels,
    test_size=0.2,
    random_state=42,
    stratify=labels
)

# TF-IDF vectorizer: up to 10000 uni-/bi-/tri-gram features, ignoring terms
# seen in fewer than 2 documents or in more than 90 % of them
tfidf = TfidfVectorizer(
    max_features=10000,
    min_df=2,
    max_df=0.9,
    ngram_range=(1, 3),
    strip_accents='unicode',
    stop_words='english',
    sublinear_tf=True
)

# Fit on the training texts, then apply the same transformation to the test texts
X_train = tfidf.fit_transform(texts_train)
X_test = tfidf.transform(texts_test)

In [None]:
# Members of the ensemble.
# NOTE(review): `bc` is the untuned BaggingClassifier passed to the grid
# search; the tuned model is stored in `best_bc` — confirm which is intended.
models = {
    "knn": KNeighborsClassifier(n_neighbors=best_k),
    "rf": best_rf,
    "bagging": bc
}

# Contribution of each member to the blended prediction (weights sum to 1)
weights = {"rf": 0.7, "bagging": 0.2, "knn": 0.1}

# Retrain every member on the TF-IDF features of this section
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

# Weighted soft voting: blend the class-probability matrices and pick, for each
# row, the class with the highest blended probability.
# Class labels are categories, not quantities, so averaging the predicted
# labels themselves and thresholding at 0.5 would reduce the six-class problem
# to a meaningless binary output.
blended_proba = sum(
    weights[name] * models[name].predict_proba(X_test) for name in weights
)

# All members are fitted on the same y_train, so their probability columns
# follow the same classes_ ordering.
final_predictions = models["rf"].classes_[np.argmax(blended_proba, axis=1)]


In [None]:
# Report accuracy and the per-class metrics of the ensemble predictions
ensemble_accuracy = accuracy_score(y_test, final_predictions)

print("\nENSEMBLE MODEL PERFORMANCE:", "-" * 50, sep="\n")
print("Accuracy: {:.4f}".format(ensemble_accuracy))
print("\nClassification Report:", classification_report(y_test, final_predictions), sep="\n")


## Neural Networks and Deep Learning

### Recurrent Neural Networks with Keras  

RNNs are particularly effective for sequence data, such as text, where the order of words matters. These models process words one by one, maintaining a memory of previous words, which allows them to capture dependencies in the data.

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [8]:
import pandas as pd

# Read the "unsplit" parquet file of the dair-ai/emotion dataset into `df`,
# replacing whatever `df` held before this cell.
# NOTE(review): the hf:// URL presumably requires network access and the
# Hugging Face filesystem backend to be installed — confirm.
df = pd.read_parquet("hf://datasets/dair-ai/emotion/unsplit/train-00000-of-00001.parquet")

In [None]:
# Downsample labels 0 and 1 to 60000 rows each; rows with labels 2-5 are all kept
df_label_0 = df.loc[df['label'] == 0]
df_label_1 = df.loc[df['label'] == 1]
df_other_labels = df.loc[df['label'].isin([2, 3, 4, 5])]

df_label_0_downsampled = df_label_0.sample(n=60000, random_state=42)
df_label_1_downsampled = df_label_1.sample(n=60000, random_state=42)

df_balanced = pd.concat(
    [df_label_0_downsampled, df_label_1_downsampled, df_other_labels],
    ignore_index=True,
)

print(df_balanced['label'].value_counts())

# Features are the raw texts, targets are the labels of the balanced frame
X = df_balanced["text"]
y = df_balanced["label"]

# Stratified 70/30 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=df_balanced["label"],
    random_state=123,
)

# Sizing
print(f"Training data size: {len(X_train)}")
print(f"Test data size: {len(X_test)}")

# Tokenization parameters
max_words = 20000  # vocabulary size kept by the tokenizer
max_len = 100      # every sequence is padded / truncated to this length

# The word index is learned from the training texts only
tokenizer = Tokenizer(num_words=max_words, oov_token='<UNK>')
tokenizer.fit_on_texts(X_train)

# Texts -> integer sequences -> fixed-length arrays (padding and truncation
# are both applied at the end of the sequence)
X_train_seq, X_test_seq = (
    pad_sequences(
        tokenizer.texts_to_sequences(split_texts),
        maxlen=max_len,
        padding='post',
        truncating='post',
    )
    for split_texts in (X_train, X_test)
)

# Print confirmation
print(f"Training data shape: {X_train_seq.shape}")
print(f"Test data shape: {X_test_seq.shape}")
print(f"Type of y_train: {type(y_train.iloc[0])}")


In [None]:
# Convert the integer labels to one-hot vectors.
# The number of classes is stated explicitly (labels 0-5, matching the 6-unit
# softmax output layer) so that both splits get the same width even if the
# highest label happened to be missing from one of them.
# NOTE: this overwrites y_train / y_test, so run the cell only once per split.
num_classes = 6
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)


In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Bidirectional LSTM classifier over padded token-id sequences
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(max_len,)),
    # The embedding table is sized from max_words, the same value given to the
    # tokenizer, so the two cannot drift apart. The sequence length is already
    # fixed by the Input layer, so the deprecated input_length argument is
    # not passed.
    tf.keras.layers.Embedding(max_words, 100),
    tf.keras.layers.Dropout(0.2),
    # return_sequences=False: only the final state of each direction is passed on
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=False)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(6, activation='softmax')  # one unit per emotion label
])

# Adam with an explicit learning rate; categorical cross-entropy matches the
# one-hot encoded labels
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy']
)

model.summary()

# Stop when the validation loss has not improved for 5 epochs and roll back to
# the weights of the best epoch
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# Halve the learning rate after 2 epochs without validation-loss improvement
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', patience=2, factor=0.5, min_lr=1e-7)

# Train; 20 % of the training data is held out as the validation set that the
# two callbacks monitor
history = model.fit(
    X_train_seq, y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping, lr_scheduler]
)


In [None]:
# Loss and accuracy of the trained network on the test sequences
test_metrics = model.evaluate(X_test_seq, y_test)
loss, accuracy = test_metrics
print("\nTest accuracy: {:.4f}".format(accuracy))

In [None]:
# Training curves. The val_* series in the history come from the
# validation_split used during fit, not from the test set, so they are
# labelled as validation curves.
plt.figure(figsize=(12, 4))

# Left panel: accuracy per epoch
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

# Right panel: loss per epoch
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()

Results are available in the README file: https://github.com/sofiaamartinezz/IS_Seminar2