In [1]:
import pandas as pd
from vectorizer import vectorize_csv

# Ask the user where the training data lives and which columns to use.
csv_path = input("Enter path to training data (e.g. path/train_data.csv): ")
text_col = input("Enter text column name: ")
target_col = input("Enter target column name: ")

# Load only the text and target columns; the labels are taken from this frame.
df = pd.read_csv(csv_path, usecols=[text_col, target_col])
y = df[target_col]

# Character n-grams (3-4 chars, word-boundary aware), capped at 1000 features,
# with the original casing preserved.
# NOTE(review): vectorize_csv is handed the path rather than `df`, so it
# presumably reads the file itself -- confirm it keeps the same rows in the
# same order as `df`, otherwise X and y will not line up.
vectorizer_options = dict(
    analyzer="char_wb",
    ngram_range=(3, 4),
    max_features=1000,
    lowercase=False,
)
X, feature_names, vectorizer = vectorize_csv(csv_path, text_col, **vectorizer_options)

In [None]:
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# One seed shared by the train/test split and the solver, for reproducible runs.
seed = 42
model_id = input("Enter unique ID to label your saved model and vectorizer:")

# Hold out 20% of the rows for validation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed,
)

def evaluate_model(model, X_test, y_test):
    """Print validation metrics for a fitted classifier on held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Held-out features, passed straight to ``model.predict``.
        y_test: True labels corresponding to ``X_test``.

    Returns:
        None. Accuracy, precision, recall and F1 are written to stdout,
        each formatted to four decimal places.

    Note:
        Precision, recall and F1 are computed with scikit-learn's default
        arguments (binary averaging); a multiclass target would need an
        explicit ``average=`` on those calls.
    """
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print("Model Validation Results:\n")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    # Use ':.4f', not ': .4f' -- the space sign flag pads non-negative values
    # with a leading blank, which misaligned these two lines with the others.
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}\n")

# Alternative configuration (disabled; kept for reference):
# lr_model = LogisticRegression(
#     solver='saga',            # well suited to large, sparse inputs (e.g. n-gram vectors)
#     penalty='elasticnet',     # blend of L1 (drives weights of unhelpful features to 0) and L2 (keeps every feature but shrinks its weight)
#     l1_ratio=0.5,             # elastic-net mix: 0 = pure L2, 1 = pure L1
#     C=1.0,                    # inverse regularization strength (smaller = stronger penalty)
#     class_weight='balanced',
#     max_iter=3000,
#     tol=1e-3,
#     random_state=seed,
#     n_jobs=-1
# )

# Train a logistic regression with the SAGA solver; the generous iteration
# cap gives the solver room to converge. All other settings are left at the
# scikit-learn defaults.
lr_model = LogisticRegression(
    solver='saga', max_iter=10000, random_state=seed, n_jobs=-1
).fit(X_train, y_train)

# Hold-out predictions are kept in `y_pred` for ad-hoc inspection;
# evaluate_model computes its own predictions internally.
y_pred = lr_model.predict(X_test)
evaluate_model(lr_model, X_test, y_test)

# Persist the model together with the fitted vectorizer, tagged with the
# user-supplied ID, so they can be reloaded as a pair.
joblib.dump(lr_model, f"logreg_model_{model_id}")
joblib.dump(vectorizer, f"vectorizer_{model_id}")

Model Validation Results:

Accuracy: 0.9187
Precision: 0.8965
Recall:  0.8088
F1-score:  0.8504





['vectorizer_R1']