In [2]:
import os
import time

import joblib
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# Load the pre-computed training features and labels from disk.
print("Loading data...")
X_train = np.load('../data/processed/X_train.npy')
y_train = np.load('../data/processed/y_train.npy')

# Fit a StandardScaler on the training features and keep a scaled copy;
# X_train itself is kept as well because some models are fit on the raw values.
print("Scaling features...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Persist the fitted scaler so the identical transform can be re-applied later.
# joblib.dump does not create missing parent directories, so make sure the
# output directory exists first (no-op when it is already there).
os.makedirs('../models', exist_ok=True)
joblib.dump(scaler, '../models/scaler.joblib')

# Logistic Regression, fit on the standardized features.
# max_iter is raised to 1000 to give the solver more iterations to converge.
print("Training Logistic Regression...")
start_time = time.time()
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_scaled, y_train)
elapsed = time.time() - start_time
print(f"Logistic Regression training completed in {elapsed:.2f} seconds")

# Persist the fitted model.
joblib.dump(lr_model, '../models/logistic_regression.joblib')

# Random Forest with deliberately constrained trees (capped depth, minimum
# split/leaf sizes).
print("Training Random Forest...")
start_time = time.time()
rf_params = {
    'n_estimators': 100,      # number of trees; lower (e.g. 50) to train faster
    'max_depth': 10,          # cap on the depth of each tree
    'min_samples_split': 20,  # minimum samples needed to split an internal node
    'min_samples_leaf': 10,   # minimum samples required in every leaf
    'max_features': 'sqrt',   # consider sqrt(n_features) candidates at each split
    'n_jobs': -1,             # use all available cores
    'random_state': 42,
}
rf_model = RandomForestClassifier(**rf_params)
# Fit on the unscaled features: the forest is trained on X_train, not X_train_scaled.
rf_model.fit(X_train, y_train)
elapsed = time.time() - start_time
print(f"Random Forest training completed in {elapsed:.2f} seconds")

# Persist the fitted model.
joblib.dump(rf_model, '../models/random_forest.joblib')

# Linear SVM trained with stochastic gradient descent: SGDClassifier with hinge
# loss and an L2 penalty optimizes a linear-SVM objective.
print("Training SVM (SGD approximation)...")
start_time = time.time()
sgd_params = {
    'loss': 'hinge',     # hinge loss gives a linear SVM objective
    'penalty': 'l2',
    'alpha': 0.0001,     # regularization strength; acts inversely to C in an SVM
    'max_iter': 1000,    # upper bound on passes over the training data
    'tol': 1e-3,         # stopping tolerance
    'random_state': 42,
}
svm_sgd_model = SGDClassifier(**sgd_params)
svm_sgd_model.fit(X_train_scaled, y_train)
elapsed = time.time() - start_time
print(f"SVM (SGD) training completed in {elapsed:.2f} seconds")

# Persist the fitted model.
joblib.dump(svm_sgd_model, '../models/svm_sgd.joblib')

# Multinomial Naive Bayes, fit on the raw (unscaled) features.
# MultinomialNB rejects negative feature values, which standardized data would
# contain, so X_train is used here rather than X_train_scaled.
print("Training Naive Bayes...")
start_time = time.time()
# alpha=1.0 is additive (Laplace) smoothing.
nb_model = MultinomialNB(alpha=1.0).fit(X_train, y_train)
elapsed = time.time() - start_time
print(f"Naive Bayes training completed in {elapsed:.2f} seconds")

# Persist the fitted model.
joblib.dump(nb_model, '../models/naive_bayes.joblib')

print("Model building completed and models saved.")

Loading data...
Scaling features...
Training Logistic Regression...
Logistic Regression training completed in 7.95 seconds
Training Random Forest...
Random Forest training completed in 31.72 seconds
Training SVM (SGD approximation)...
SVM (SGD) training completed in 213.59 seconds
Training Naive Bayes...
Naive Bayes training completed in 3.32 seconds
Model building completed and models saved.
