In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the labeled training dataset
df_train = pd.read_csv("train_submission.csv")  
# Check for missing values
print(df_train.isnull().sum())  # Shows the number of NaN values per column
df_train.dropna(subset=["Text", "Label"], inplace=True)  # Remove rows with NaN in "Text" or "Label"

# Count occurrences of each class
class_counts = df_train["Label"].value_counts()

# Keep only classes with at least 2 samples
valid_classes = class_counts[class_counts >= 2].index
df_train = df_train[df_train["Label"].isin(valid_classes)]

# Ensure the dataset has the correct columns
if "Text" not in df_train.columns or "Label" not in df_train.columns:
    raise ValueError("Training dataset must have 'Text' and 'Label' columns.")

# Split into 80% training, 20% validation
train_df, valid_df = train_test_split(df_train, test_size=0.2, random_state=42, stratify=df_train["Label"])

# Convert to FastText format
train_df["formatted"] = "__label__" + train_df["Label"].astype(str) + " " + train_df["Text"]
valid_df["formatted"] = "__label__" + valid_df["Label"].astype(str) + " " + valid_df["Text"]

# Save to FastText format files
train_df["formatted"].to_csv("train_fasttext.txt", index=False, header=False)
valid_df["formatted"].to_csv("valid_fasttext.txt", index=False, header=False)


Usage      0
Text       0
Label    500
dtype: int64


In [10]:
import fasttext

# Train FastText model on training data
model = fasttext.train_supervised(
    input="train_fasttext.txt",
    epoch=175,  # Adjust as needed
    lr=0.5,  # Learning rate (adjustable)
    wordNgrams=2,  # Use bigrams
    verbose=2, 
    minCount=1,  # Minimum word occurrences
    loss='softmax'  # Softmax for classification
)


In [11]:
# Read validation dataset
valid_texts = valid_df["Text"].tolist()
true_labels = valid_df["Label"].tolist()

# Predict labels on validation set
predicted_labels, probabilities = model.predict(valid_texts)

# Convert predicted labels from "__label__X" format to just "X"
predicted_labels = [label[0].replace("__label__", "") for label in predicted_labels]

# Compute Accuracy
accuracy = sum([1 for true, pred in zip(true_labels, predicted_labels) if true == pred]) / len(true_labels)
print(f"Validation Accuracy: {accuracy:.2%}")


Validation Accuracy: 76.11%


In [None]:
import pandas as pd
import fasttext

# Load the test dataset
df_test = pd.read_csv("test_without_labels.csv", usecols=['Text'])

# Ensure test dataset is preserved
df_test = df_test.reset_index(drop=True)
df_test["ID"] = df_test.index + 1  # Assign unique IDs

# Predict labels on the test dataset
predicted_labels, probabilities = model.predict(df_test["Text"].tolist())

# Convert predicted labels from "__label__X" format to just "X"
df_test["Label"] = [label[0].replace("__label__", "") for label in predicted_labels]

# Save predictions to a CSV file
df_test[["ID", "Label"]].to_csv("test_predictions2.csv", index=False)

# Print first few predictions
print(df_test.head())


                                                Text  ID Label
0  Hüttwilen el xe on comune del Canton Turgovia ...   1   ven
1  La leĝo zorgas pri kompenso de nur la plej gra...   2   epo
2               پک اپ پر اپنے ڈرائیور سے پہلے پہنچیں   3   urd
3  Mukmu  Ch'itana mukmu icha Butun nisqaqa nisqa...   4   que
4  Iwe   lon ena fansoun   lupwen ra aleani än Mo...   5   chk
