In [1]:
#Necessary Libraries
import numpy as np
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')

#Preprocessing
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

#Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [2]:
df = pd.read_csv('/kaggle/input/depi-r-2-competition-1/xy_train.csv')
df.head()

Unnamed: 0,ID,text,label
0,0,Americans Aren't Sure If Flight 370 Vanished T...,1
1,1,Pope Leo X offering indulgences to sinners sho...,0
2,2,News: 5 Uplifting Hypotheticals Of What Could ...,0
3,3,George W. Bush and Jeb Bush at a campaign stop...,0
4,4,The WADA requesting their fair share from Lanc...,0


In [3]:
def preprocess_text(text):
    #Convert text to lowercase
    text = text.lower()
    #Remove Special Characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    #Tokenize
    tokens = word_tokenize(text)
    #Remove Stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(tokens)

df['processed_text'] = df['text'].apply(preprocess_text)

In [4]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(df['processed_text'])

X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, df['label'], test_size=0.2, random_state=3)

In [5]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVM': SVC(),
    'Multinomial NB': MultinomialNB(),
    'XGBoost': XGBClassifier(),
    'LightGBM': LGBMClassifier(),
    'CatBoost': CatBoostClassifier()
}

In [6]:
for name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"\n--- {name} Results ---")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)


--- Logistic Regression Results ---
Accuracy: 0.7682291666666666
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.79      0.79      5179
           1       0.75      0.75      0.75      4377
           2       0.00      0.00      0.00        44

    accuracy                           0.77      9600
   macro avg       0.51      0.51      0.51      9600
weighted avg       0.76      0.77      0.77      9600


--- Random Forest Results ---
Accuracy: 0.7538541666666667
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.78      0.77      5179
           1       0.73      0.73      0.73      4377
           2       1.00      0.02      0.04        44

    accuracy                           0.75      9600
   macro avg       0.83      0.51      0.52      9600
weighted avg       0.75      0.75      0.75      9600


--- Gradient Boosting Results ---
Accuracy: 0.6769791666666667
Cl

In [7]:
test_df = pd.read_csv('/kaggle/input/depi-r-2-competition-1/x_test.csv')
test_df['processed_text'] = test_df['text'].apply(preprocess_text)

X_test_tfidf = tfidf_vectorizer.transform(test_df['processed_text'])

In [8]:
for name, model in models.items():
    model.fit(X_tfidf, df['label'])
    y_pred = model.predict(X_test_tfidf)

    submission = pd.DataFrame({
        'ID': test_df['ID'],
        'label': y_pred.ravel()
    })

    submission.to_csv(name[:5]+'_submission.csv', index=False)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.199286 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 143573
[LightGBM] [Info] Number of data points in the train set: 48000, number of used features: 4890
[LightGBM] [Info] Start training from score -0.620904
[LightGBM] [Info] Start training from score -0.779477
[LightGBM] [Info] Start training from score -5.547848
Learning rate set to 0.096398
0:	learn: 1.0432387	total: 802ms	remaining: 13m 21s
1:	learn: 0.9978883	total: 1.3s	remaining: 10m 48s
2:	learn: 0.9572297	total: 1.79s	remaining: 9m 56s
3:	learn: 0.9213449	total: 2.29s	remaining: 9m 29s
4:	learn: 0.8898034	total: 2.78s	remaining: 9m 13s
5:	learn: 0.8629721	total: 3.27s	remaining: 9m 2s
6:	learn: 0.8392938	total: 3.77s	remaining: 8m 55s
7:	learn: 0.8189797	total: 4.26s	remaining: 8m 48s
8:	learn: 0.8004301	total: 4.75s	remaining: 8m 43s
9:	learn: 0.7842118	total: 5.24s	remaining: 8m 38s
10:	l