In [1]:
import pandas as pd
# Load the pre-cleaned train and test tables from the local `data/` directory.
# Later cells read `cleaned_text` and `target` from df_train and `cleaned_text` and `id` from df_test.
df_train = pd.read_csv('data/cleaned_train.csv')
df_test = pd.read_csv('data/cleaned_test.csv')

In [2]:
from sklearn.model_selection import train_test_split
# Model input is the cleaned text column; the label is the `target` column.
X, y = df_train['cleaned_text'], df_train['target']

# Hold out 25% of the labelled rows for validation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer shared by every model in this notebook.
tfidf = TfidfVectorizer(
    ngram_range=(1,3),       # unigrams, bigrams and trigrams
    min_df=2,                # drop terms that occur in fewer than 2 documents
    max_features=50000,      # cap the vocabulary at the 50,000 most frequent terms
    stop_words='english'     # remove scikit-learn's built-in English stop-word list
)

In [9]:
# Learn the vocabulary and IDF weights from the training split only, then reuse them
# (transform, not fit_transform) for the held-out split so no validation text leaks in.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf  = tfidf.transform(X_test)


In [29]:
from sklearn.linear_model import LogisticRegression

# L2-regularised logistic regression baseline trained on the TF-IDF features.
log_reg = LogisticRegression(
    # regularisation
    penalty='l2',
    C=1.0,
    # optimisation
    solver='newton-cg',
    max_iter=1000,
    n_jobs=-1,
)

# Fit on the training split; left as a bare expression so the cell displays the fitted estimator.
log_reg.fit(X_train_tfidf, y_train)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# Score the logistic-regression model on the held-out split.
y_pred = log_reg.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("F1 Score:", f1_score(y_true=y_test, y_pred=y_pred))
# Header and per-class report are emitted by one call; sep="\n" reproduces the line break
# that two consecutive print() calls would have produced.
print(
    "\nClassification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
    sep="\n",
)


Accuracy: 0.8040966386554622
F1 Score: 0.7518296739853626

Classification Report:

              precision    recall  f1-score   support

           0       0.80      0.89      0.84      1091
           1       0.82      0.69      0.75       813

    accuracy                           0.80      1904
   macro avg       0.81      0.79      0.80      1904
weighted avg       0.81      0.80      0.80      1904



In [35]:
# Re-fit the vectorizer on ALL labelled rows for the final submission model.
# NOTE(review): this refits the same `tfidf` object that produced X_train_tfidf / X_test_tfidf,
# so from here on `tfidf` holds the full-data vocabulary rather than the train-split one.
df_train_tfidf = tfidf.fit_transform(df_train['cleaned_text'])

In [37]:
# Retrain the logistic regression on every labelled row before predicting the test set.
# NOTE(review): this refits `log_reg` in place on the full-data features, so the earlier
# X_test_tfidf matrix may no longer match this model's feature space.
log_reg.fit(df_train_tfidf, df_train['target'])

In [None]:
# Vectorize the unlabelled test text with the vectorizer's current fit (transform only, no refit).
df_test_tfidf = tfidf.transform(df_test['cleaned_text'])

In [40]:
# Predict labels for the test set with the logistic-regression model.
preds = log_reg.predict(df_test_tfidf)

In [47]:
# Pair each test-row id with its logistic-regression prediction.
submission = pd.DataFrame(data={"id": df_test["id"], "target": preds})

# Write without the index so the CSV contains only the `id` and `target` columns.
submission.to_csv(path_or_buf="logistic_submission.csv", index=False)

In [46]:
# Preview the first ten rows of the logistic-regression submission.
# NOTE(review): the saved output below has a `preds` column, whereas the frame is built with a
# `target` column, and this cell's execution count precedes the cell that builds `submission`.
# The output looks stale; re-run to refresh it.
submission.head(10)

Unnamed: 0,id,preds
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0


In [56]:
from sklearn.naive_bayes import MultinomialNB, ComplementNB
# Fit Complement Naive Bayes on the TF-IDF features of the training split.
# MultinomialNB is imported as well, but that variant (`nb`) is disabled and never fitted here.
cb = ComplementNB()
cb.fit(X_train_tfidf, y_train)

In [57]:
# Held-out predictions from the ComplementNB model (the `nb_` prefix notwithstanding).
nb_y_preds = cb.predict(X_test_tfidf)

In [58]:
# Held-out metrics for the Complement NB predictions.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=nb_y_preds))
print("F1 Score:", f1_score(y_true=y_test, y_pred=nb_y_preds))
# One call prints both the header and the report; sep="\n" keeps the output identical
# to printing them separately.
print(
    "\nClassification Report:\n",
    classification_report(y_true=y_test, y_pred=nb_y_preds),
    sep="\n",
)


Accuracy: 0.8009453781512605
F1 Score: 0.7530944625407167

Classification Report:

              precision    recall  f1-score   support

           0       0.80      0.87      0.83      1091
           1       0.80      0.71      0.75       813

    accuracy                           0.80      1904
   macro avg       0.80      0.79      0.79      1904
weighted avg       0.80      0.80      0.80      1904



In [59]:
from sklearn.model_selection import GridSearchCV
# Candidate smoothing values for ComplementNB's `alpha`.
param_grid_nb = {"alpha": [0.1, 0.3, 0.5, 0.7, 1.0, 2.0]}

# 5-fold cross-validated search scored by F1, parallelised over all cores;
# verbose=2 logs one line per individual fit.
# NOTE(review): X_train_tfidf was vectorized before the folds are cut, so each validation
# fold shares the vocabulary/IDF learned from the whole training split.
grid_nb = GridSearchCV(
    estimator=cb,
    param_grid=param_grid_nb,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_nb.fit(X_train_tfidf, y_train)

print("Best NB params:", grid_nb.best_params_)
print("Best NB F1:", grid_nb.best_score_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.3; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.3; total time=   0.0s
[CV] END ..........................................alpha=0.3; total time=   0.0s
[CV] END ..........................................alpha=0.3; total time=   0.0s
[CV] END ..........................................alpha=0.5; total time=   0.0s
[CV] END ..........................................alpha=0.3; total time=   0.0s
[CV] END ........................................

In [60]:
# Score the grid-search result on the held-out split; GridSearchCV.predict delegates to the
# estimator refit with the best parameters (the default refit=True is in effect).
nb_grid_preds = grid_nb.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_true=y_test, y_pred=nb_grid_preds))
print("F1 Score:", f1_score(y_true=y_test, y_pred=nb_grid_preds))
print(
    "\nClassification Report:\n",
    classification_report(y_true=y_test, y_pred=nb_grid_preds),
    sep="\n",
)

Accuracy: 0.8009453781512605
F1 Score: 0.7530944625407167

Classification Report:

              precision    recall  f1-score   support

           0       0.80      0.87      0.83      1091
           1       0.80      0.71      0.75       813

    accuracy                           0.80      1904
   macro avg       0.80      0.79      0.79      1904
weighted avg       0.80      0.80      0.80      1904



In [64]:
# Refit Complement NB on every labelled row for the final submission.
# The hyper-parameters selected by the grid search are applied first, so the tuning
# is not discarded by falling back to the estimator's default `alpha`.
# `set_params` returns the estimator itself, so `cb` is still the object that gets fitted
# (and the fitted estimator is still what the cell displays).
cb.set_params(**grid_nb.best_params_).fit(df_train_tfidf, df_train['target'])

In [65]:
# Predict labels for the test set with the Complement NB model.
cb_preds_test = cb.predict(df_test_tfidf)

In [67]:
# Pair each test-row id with its Complement NB prediction.
cb_submission = pd.DataFrame(data={'id': df_test['id'], 'target': cb_preds_test})

# Write without the index so the CSV contains only the `id` and `target` columns.
cb_submission.to_csv(path_or_buf='CB_submission.csv', index=False)