In [None]:
!pip install pandas scikit-learn

In [None]:
%%writefile custom_transformer.py

from sklearn.base import TransformerMixin
import numpy as np

class DenseTransformer(TransformerMixin):
    """Stateless pipeline step that turns a sparse feature matrix into a dense array.

    Intended to sit between a vectorizer that emits a sparse matrix and an
    estimator that is fitted on dense ``numpy`` arrays.
    """

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; this transformer has no state to learn.

        ``X``, ``y`` and ``fit_params`` are accepted only so the method can be
        called like any other estimator's ``fit``.
        """
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        Parameters
        ----------
        X : sparse matrix or array-like
            Objects exposing ``toarray()`` (e.g. scipy sparse matrices) are
            densified with it; anything else is passed through ``np.asarray``.
        y, **fit_params
            Ignored.

        Returns
        -------
        numpy.ndarray
            Dense copy/view of ``X``.
        """
        # scipy sparse containers expose toarray(), which yields an ndarray
        # directly instead of going through the intermediate todense() result.
        if hasattr(X, "toarray"):
            return X.toarray()
        # Already-dense input has no todense()/toarray(); accept it as-is
        # instead of raising AttributeError.
        return np.asarray(X)

In [None]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_class_weight

from custom_transformer import DenseTransformer

# Text classifier: TF-IDF features -> dense array -> Gaussian Naive Bayes.
# TfidfVectorizer emits a sparse matrix, which GaussianNB does not accept,
# hence the densifying step in between.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('to_dense', DenseTransformer()),
    ('clf', GaussianNB()),
])

# The dataset is read from a "data" directory located next to the current
# working directory (i.e. inside the working directory's parent).
cwd = os.getcwd()
data_dir = os.path.join(os.path.dirname(cwd), "data")
data = pd.read_csv(os.path.join(data_dir, "data.csv"))

# Encode the 't'/'f' flags as 1.0/0.0. Series.map turns every other value into
# NaN, so fail loudly here instead of deep inside the split / weighting code.
labels = data['is_positive'].map({'t': 1, 'f': 0}).astype('float')
if labels.isna().any():
    unexpected = data.loc[labels.isna(), 'is_positive'].unique().tolist()
    raise ValueError(f"is_positive contains values other than 't'/'f': {unexpected}")
data['is_positive'] = labels.values

# Stratified 80/20 split with a fixed seed so the class ratio is kept in both
# parts and the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data['title'],
    data['is_positive'],
    test_size=0.2,
    random_state=42,
    stratify=data['is_positive'],
)
print("Training set class distribution:", y_train.value_counts(normalize=True))
print("Test set class distribution:", y_test.value_counts(normalize=True))

# 'balanced' class weights, expanded to one weight per training row.
classes = np.array([0, 1])
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weights_dict = dict(zip(classes, class_weights))
# Labels are exactly 0.0 or 1.0 (checked above) and `class_weights` is ordered
# like `classes`, so the integer label doubles as the index of its weight.
sample_weights = class_weights[y_train.to_numpy().astype(int)]
# The `clf__` prefix routes sample_weight to the GaussianNB step only.
pipeline.fit(X_train, y_train, clf__sample_weight=sample_weights)

In [None]:
# Score the fitted pipeline on the held-out split: per-class precision/recall/F1
# first, then overall accuracy.
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(report)
print('Accuracy:', accuracy)

In [None]:
# Persist the fitted pipeline into the "data" directory that sits beside the
# current working directory.
# NOTE(review): the dump pickles a reference to DenseTransformer, so whoever
# loads this file presumably needs custom_transformer.py importable — confirm.
parent_dir = os.path.dirname(os.getcwd())
joblib_file = os.path.join(parent_dir, "data", "tf-idf-nb.joblib")
joblib.dump(pipeline, joblib_file)