In [None]:
import pandas as pd

# Load the raw train / test splits from disk, one Pandas dataframe per file
df_train = pd.read_csv(filepath_or_buffer="train.csv")
df_test = pd.read_csv(filepath_or_buffer="test.csv")

# Define a function to remove "a", "an", and "the" from a sentence
def remove_articles(sentence):
    """Return ``sentence`` with the English articles "a", "an" and "the" removed.

    The sentence is split on whitespace, tokens equal to an article
    (compared case-insensitively) are dropped, and the remaining tokens are
    re-joined with single spaces. Matching is done on whole
    whitespace-delimited tokens, so an article with punctuation attached
    (e.g. "the,") is kept.

    Args:
        sentence: Text to filter. Non-string values -- such as the float NaN
            pandas uses for a missing CSV cell, or None -- are treated as
            empty text.

    Returns:
        The filtered sentence as a str; "" when no tokens remain or the input
        was not a string.
    """
    # Missing cells reach this function as float NaN via Series.apply; they
    # have no .split(), so map them to empty text instead of raising.
    if not isinstance(sentence, str):
        return ""
    articles = {"a", "an", "the"}
    return " ".join(word for word in sentence.split() if word.lower() not in articles)

# Derive the "changed_text" column from "text" on both splits by running
# every row through remove_articles
for split_frame in (df_train, df_test):
    split_frame["changed_text"] = split_frame["text"].apply(remove_articles)

# Persist each augmented dataframe to its own CSV file (row index omitted)
for split_frame, out_path in ((df_train, "modified_train.csv"), (df_test, "modified_test.csv")):
    split_frame.to_csv(out_path, index=False)

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import lightgbm as lgb

# Load the modified_train.csv file
train_df = pd.read_csv('modified_train.csv')

# A row whose text was reduced to nothing (e.g. it consisted only of articles)
# is written as an empty CSV field and read back as NaN, which CountVectorizer
# cannot process. Turn those cells back into empty documents first.
train_texts = train_df['changed_text'].fillna('')

# Preprocess the text data using CountVectorizer: lowercase the text, drop
# scikit-learn's built-in English stop words, and count the remaining tokens.
vectorizer = CountVectorizer(stop_words='english', lowercase=True)
train_data = vectorizer.fit_transform(train_texts)
train_labels = train_df['label']

# Split the data into training and validation sets (80% / 20%).
# The split is seeded so the validation rows -- and therefore the reported
# score -- are the same on every Restart & Run All.
SEED = 42
train_feature, val_feature, train_label, val_label = train_test_split(
    train_data, train_labels, test_size=0.2, random_state=SEED
)

# Convert the data type to float64 (CountVectorizer produces integer counts)
train_feature = train_feature.astype(np.float64)
val_feature = val_feature.astype(np.float64)

# Train a LightGBM model: 8-way multiclass objective, evaluated with
# multi-class log loss.
params = {'objective': 'multiclass', 'num_class': 8, 'metric': 'multi_logloss'}
train_set = lgb.Dataset(train_feature, label=train_label)
val_set = lgb.Dataset(val_feature, label=val_label, reference=train_set)

# Early stopping is requested through a callback: the `early_stopping_rounds`
# keyword of lgb.train() was removed in LightGBM 4.0, while the callback form
# works on both older and newer releases. Training stops once the validation
# metric has not improved for 50 consecutive rounds (the training set listed in
# valid_sets is only reported, not used for the stopping decision).
model = lgb.train(
    params,
    train_set,
    num_boost_round=1000,
    valid_sets=[train_set, val_set],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)

# Load the modified_test.csv file
test_df = pd.read_csv('modified_test.csv')

# Empty "changed_text" cells come back from CSV as NaN, which the vectorizer
# cannot process; treat them as empty documents.
test_texts = test_df['changed_text'].fillna('')

# Preprocess the text data using the same (already fitted) CountVectorizer so
# the test columns line up with the vocabulary learned on the training data
test_data = vectorizer.transform(test_texts)

# Convert the data type to float64, matching the training features
test_feature = test_data.astype(np.float64)

# Use the trained model to make predictions on the test data
# (one row per document, one column per class)
predictions = model.predict(test_feature)

# Format the predictions into a submission file: the predicted label is the
# index of the highest-scoring class column for each row
submission_df = pd.DataFrame({'id': test_df['id'], 'label': predictions.argmax(axis=1)})
submission_df.to_csv('submission.csv', index=False)

# Evaluate on the held-out validation split: take the highest-scoring class
# column for every row, then report the macro-averaged F1 against the true labels
val_probabilities = model.predict(val_feature)
val_predictions = np.argmax(val_probabilities, axis=1)
val_f1_score = f1_score(y_true=val_label, y_pred=val_predictions, average='macro')
print("Macro F1 score for validation set:", val_f1_score)