## Binary Classification

In [None]:
!pip install -q fasttext
import pandas as pd
import fasttext
from sklearn.metrics import f1_score

# Load datasets
train_df = pd.read_csv('Fake_train.csv')
test_df = pd.read_csv('fake_test_binary_with_labels.csv')



# Prepare the data (fastText expects labels to start with '__label__' prefix)
def preprocess_for_fasttext(df, filename):
    with open(filename, 'w') as f:
        for _, row in df.iterrows():
            # fastText expects the label to be in the form '__label__<label>'
            f.write(f"__label__{row['label']} {row['text']}\n")

# Preprocess and save data in fastText's format
preprocess_for_fasttext(train_df, 'train.ft')
preprocess_for_fasttext(test_df, 'test.ft')

# Train a fastText classifier
model = fasttext.train_supervised(input='train.ft', epoch=10, lr=0.1, neg=5, loss='hs')
                                  # epoch=10,          # Reduce number of epochs to prevent overfitting
                                  # lr=0.05,           # Lower learning rate for more gradual learning
                                  # dim=50,            # Reduce dimension of word vectors (default is 100)
                                  # wordNgrams=1,      # Reduce the size of word n-grams (use 1 for unigrams)
                                  # neg=5,             # Use negative sampling to improve generalization
                                  # loss='hs',         # Use hierarchical softmax (faster and often better for smaller datasets)
                                  # thread=4)          # Use multiple threads for faster training


# Make predictions on train and test data
def predict_fasttext(model, df):
    predictions = []
    for _, row in df.iterrows():
        # Predict the label for the text using the trained model
        labels, _ = model.predict(row['text'])  # fastText outputs tuple (labels, probabilities)
        predictions.append(labels[0].replace('__label__', ''))  # Get the first label and remove the prefix
    return predictions

y_train_pred = predict_fasttext(model, train_df)
y_test_pred = predict_fasttext(model, test_df)

# Calculate macro F1 score
train_f1 = f1_score(train_df['label'], y_train_pred, average='macro')
test_f1 = f1_score(test_df['label'], y_test_pred, average='macro')

# Print the results
print(f"Train Macro F1 Score: {train_f1:.4f}")
print(f"Test Macro F1 Score: {test_f1:.4f}")


Train Macro F1 Score: 0.9957
Test Macro F1 Score: 0.8050


## Multi-class Classification

In [1]:
!pip install -q fasttext
import pandas as pd
import fasttext
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Load datasets
multi_class_data = pd.read_csv("fake_news_classification_mal_train.csv")
multi_class_data["Label"] = multi_class_data["Label"].apply(lambda x: x.strip())
class_names = sorted(list(multi_class_data["Label"].unique()))
train_df, valid_df = train_test_split(multi_class_data, test_size=0.3, random_state=42)

test_df = pd.read_csv("fake_test_multiclass_labeled.csv")
test_df["Label"] = test_df["Label"].apply(lambda x: x.strip())


train_df.dropna(subset=["News", "Label"], inplace=True)
test_df.dropna(subset=["News", "Label"], inplace=True)
train_df["Label"] = train_df["Label"].apply(lambda x: "_".join(x.strip().split(" ")))
test_df["Label"] = test_df["Label"].apply(lambda x: "_".join(x.strip().split(" ")))

train_df["News"] = train_df["News"].str.replace("\n", " ")
test_df["News"] = test_df["News"].str.replace("\n", " ")
# Prepare the data (fastText expects labels to start with '__label__' prefix)
def preprocess_for_fasttext(df, filename):
    with open(filename, 'w') as f:
        for _, row in df.iterrows():
            # fastText expects the label to be in the form '__label__<label>'
            f.write(f"__label__{row['Label']} {row['News']}\n")

# Preprocess and save data in fastText's format
preprocess_for_fasttext(train_df, 'train.ft')
preprocess_for_fasttext(test_df, 'test.ft')

# Train a fastText classifier
model = fasttext.train_supervised(input='train.ft', epoch=10, lr=0.1, neg=5, loss='hs')
                                  # epoch=10,          # Reduce number of epochs to prevent overfitting
                                  # lr=0.05,           # Lower learning rate for more gradual learning
                                  # dim=50,            # Reduce dimension of word vectors (default is 100)
                                  # wordNgrams=1,      # Reduce the size of word n-grams (use 1 for unigrams)
                                  # neg=5,             # Use negative sampling to improve generalization
                                  # loss='hs',         # Use hierarchical softmax (faster and often better for smaller datasets)
                                  # thread=4)          # Use multiple threads for faster training


print(train_df.head())
print(test_df.head())
# Make predictions on train and test data
def predict_fasttext(model, df):
    predictions = []
    for _, row in df.iterrows():
        # Predict the label for the text using the trained model
        # print("\n"+row['News'])
        labels, _ = model.predict(row['News'].strip())  # fastText outputs tuple (labels, probabilities)
        predictions.append(labels[0].replace('__label__', ''))  # Get the first label and remove the prefix
    return predictions

y_train_pred = predict_fasttext(model, train_df)
y_test_pred = predict_fasttext(model, test_df)

# Calculate macro F1 score
train_f1 = f1_score(train_df['Label'], y_train_pred, average='macro')
test_f1 = f1_score(test_df['Label'], y_test_pred, average='macro')

# Print the results
print(f"Train Macro F1 Score: {train_f1:.4f}")
print(f"Test Macro F1 Score: {test_f1:.4f}")


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
                    ID                                               News  \
1565  FAKE_MAL_TR_1566       മഞ്ഞ് ഉരുകുന്നില്ല, കറുത്തിരുണ്ടു പൊള്ളുന്നു   
277   FAKE_MAL_TR_0278  ചാർമാഡി ഘട്ടിൽ ജൂലൈ 25 ന് നടന്ന വെള്ളപൊക്കത്തി...   
1754  FAKE_MAL_TR_1755  കാശ്മീരിലെ മച്ചില്‍ എന്‍കൗണ്ടറില്‍ അവസാനം മോദി...   
358   FAKE_MAL_TR_0359  കർണാടക പോളിംഗ് സ്‌റ്റേഷനിലെ കള്ളവോട്ട് ദൃശ്യങ്...   
1053  FAKE_MAL_TR_1054  സ്വപ്ന സുരേഷിനെ പിടികൂടിയത് ബാംഗ്ലൂരിൽ പിണറായി...   

             Label  
1565         FALSE  
277          FAL

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression(max_iter=1000))
])

model.fit(train_df['News'], train_df['Label'])
y_train_pred = model.predict(train_df['News'])
y_test_pred = model.predict(test_df['News'])

train_f1 = f1_score(train_df['Label'], y_train_pred, average='macro')
test_f1 = f1_score(test_df['Label'], y_test_pred, average='macro')
print(train_f1)
print(test_f1)

0.2973595097488549
0.202854912456786
