#### More about this task can be found on the link below:
https://www.kaggle.com/competitions/idc-410-document-classification?rvi=1

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Separate the features and labels in the training data
train_text = train_data['text']
train_labels = train_data['label']

# Split the training data into training and validation sets
train_text, val_text, train_labels, val_labels = train_test_split(train_text, train_labels, test_size=0.2, random_state=42)

# Print the shape of the training, validation, and test sets
print('Training data shape:', train_text.shape)
print('Validation data shape:', val_text.shape)
print('Test data shape:', test_data.shape)



Training data shape: (960,)
Validation data shape: (240,)
Test data shape: (300, 3)


In [7]:
# Initialize the tokenizer with the maximum number of words to keep
MAX_NUM_WORDS = 10000
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)

# Fit the tokenizer on the training data
tokenizer.fit_on_texts(train_text)

# Convert the text data to sequences of integers
train_sequences = tokenizer.texts_to_sequences(train_text)
val_sequences = tokenizer.texts_to_sequences(val_text)
test_sequences = tokenizer.texts_to_sequences(test_data['text'])

# Get the maximum sequence length in the training data
MAX_SEQUENCE_LENGTH = max(len(seq) for seq in train_sequences)

# Pad the sequences to the maximum length
train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
val_data = pad_sequences(val_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Print the shape of the preprocessed training, validation, and test data
print('Preprocessed training data shape:', train_data.shape)
print('Preprocessed validation data shape:', val_data.shape)
print('Preprocessed test data shape:', test_data.shape)


Preprocessed training data shape: (960, 2364)
Preprocessed validation data shape: (240, 2364)
Preprocessed test data shape: (300, 2364)


In [13]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the train and test data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Tokenize the text data
MAX_NUM_WORDS = 10000
MAX_SEQUENCE_LENGTH = 200
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_df['text'].values)
train_sequences = tokenizer.texts_to_sequences(train_df['text'].values)
test_sequences = tokenizer.texts_to_sequences(test_df['text'].values)
train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Convert the label data to integers
label_to_int = {'sport': 0, 'business': 1, 'politics': 2, 'tech': 3, 'entertainment': 4}
train_df['label'] = train_df['label'].map(label_to_int)
train_labels = train_df['label'].values

# Split the data into training and validation sets
val_size = 0.1
val_samples = int(len(train_data) * val_size)
train_data, val_data, train_labels, val_labels = train_test_split(train_data, train_labels, test_size=val_size)

# Define the model architecture
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(MAX_NUM_WORDS, 128, input_length=MAX_SEQUENCE_LENGTH),
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),  
      
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
EPOCHS = 35
BATCH_SIZE = 32
history = model.fit(train_data, train_labels, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_data=(val_data, val_labels))

# Evaluate the model on the validation set
val_loss, val_acc = model.evaluate(val_data, val_labels)
print('Validation accuracy:', val_acc)

# Make predictions on the test set
test_predictions = model.predict(test_data)
test_predictions = np.argmax(test_predictions, axis=1)

# Map the predicted labels back to their text values
int_to_label = {0: 'sport', 1: 'business', 2: 'politics', 3: 'tech', 4: 'entertainment'}
test_predictions = [int_to_label[i] for i in test_predictions]

# Save the predictions to a CSV file
output_df = pd.DataFrame({'index': test_df['index'], 'label': test_predictions})
output_df.to_csv('test_predictions15.csv', index=False)


Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35
Validation accuracy: 0.8916666507720947
