In [1]:
import pandas as pd
import joblib

# Load the pre-split dataset and pull out the 'train' and 'test' frames.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
combined_data = pd.read_pickle(
    '/home/jovyan/workdir/document_classification/data/v0/combined_data.pkl'
)
train_df, test_df = (combined_data[split] for split in ('train', 'test'))

# Show the training frame as the cell's output.
train_df

Unnamed: 0,filename,text,num_pages,target_col,text_length
0,doc_0.pdf,EF400 System # EF408B\nDIE CAST CYLINDRICAL L...,1.0,lighting,2674
1,doc_2.pdf,EF400 System # EF407B\nDIE CAST CYLINDRICAL L...,1.0,lighting,2134
2,doc_3.pdf,ADJ UST ABLE BEAM WALL WASH 12V\n5221\nORDER...,4.0,lighting,8322
3,doc_4.pdf,Type:\nProject:HDMC\nSurface Mount with FAR-UV...,11.0,lighting,38897
4,doc_6.pdf,Extruded Aluminum\nHousing\nSatin Acrylic\nDif...,1.0,lighting,2060
...,...,...,...,...,...
1274,doc_2511.pdf,EM ELECTRET \nCONDENSER \nMICROPHONE \nAcousti...,8.0,others,8850
1275,doc_2536.pdf,FLYER // AUTOMOTIVE POWER SUPPLY // ORDER CODE...,1.0,others,531
1276,doc_2556.pdf,"GlobTek, Inc.\nwww.globtek.com\n186 Veterans D...",20.0,others,46777
1277,doc_2568.pdf,Mechanical Dimensions: Inches [mm]\nMounting H...,2.0,others,570


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

# Text-classification pipeline: TF-IDF features feeding a logistic-regression classifier.
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(
        max_features=100,  # Keep only the 100 highest-frequency terms in the vocabulary
        min_df=20,  # Drop terms that appear in fewer than 20 documents
        max_df=0.75,  # Drop terms that appear in more than 75% of documents
        stop_words="english",
    )),
    ('classifier', LogisticRegression(
        random_state=42,
        penalty='l2',      # L2 regularization
        solver='lbfgs',    # Algorithm to use
        max_iter=1000      # Maximum number of iterations
    ))
])

# Candidate values for the inverse regularization strength C of the classifier step
param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
}

# 5-fold cross-validated grid search, scored by accuracy, using all available cores
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the grid search on training data
grid_search.fit(train_df['text'], train_df['target_col'])

# Print best parameters
print("\nBest Parameters:")
print(grid_search.best_params_)

# Use best model for predictions. GridSearchCV fits clones of `pipeline`, so the
# fitted model lives in `best_estimator_`, not in `pipeline` itself.
best_model = grid_search.best_estimator_

# Make predictions on training data
y_train_pred = best_model.predict(train_df['text'])

# Make predictions on test data
y_test_pred = best_model.predict(test_df['text'])

# Print classification metrics for training data
print("\nTraining Classification Report:")
print(classification_report(train_df["target_col"], y_train_pred))

print("\nConfusion Matrix (Train):")
print(confusion_matrix(train_df["target_col"], y_train_pred))

# Print classification metrics for test data
print("\nTest Classification Report:")
print(classification_report(test_df["target_col"], y_test_pred))

print("\nConfusion Matrix (Test):")
print(confusion_matrix(test_df["target_col"], y_test_pred))

# Get feature importance scores
vectorizer = best_model.named_steps["vectorizer"]
classifier = best_model.named_steps["classifier"]
# For a multiclass problem coef_ holds one row of coefficients per class. Average
# the absolute coefficients over all rows so the ranking reflects every class,
# not just the first one (coef_[0]). With a single row this equals abs(coef_[0]).
feature_importance = pd.DataFrame(
    {
        "feature": vectorizer.get_feature_names_out(),
        "importance": abs(classifier.coef_).mean(axis=0),
    }
)
print("\nTop 10 Most Important Features:")
print(feature_importance.sort_values("importance", ascending=False).head(10))


Best Parameters:
{'classifier__C': 100.0}

Training Classification Report:
              precision    recall  f1-score   support

       cable       1.00      1.00      1.00       310
       fuses       1.00      1.00      1.00       495
    lighting       1.00      1.00      1.00       325
      others       1.00      1.00      1.00       149

    accuracy                           1.00      1279
   macro avg       1.00      1.00      1.00      1279
weighted avg       1.00      1.00      1.00      1279


Confusion Matrix (Train):
[[310   0   0   0]
 [  0 495   0   0]
 [  0   0 325   0]
 [  0   0   0 149]]

Test Classification Report:
              precision    recall  f1-score   support

       cable       0.70      1.00      0.82        64
       fuses       1.00      1.00      1.00        39
    lighting       0.96      1.00      0.98        65
      others       1.00      0.37      0.54        49

    accuracy                           0.86       217
   macro avg       0.91      0

In [3]:
# Persist the fitted logistic-regression pipeline selected by the grid search.
# `pipeline` must not be saved here: GridSearchCV fits clones of it, so `pipeline`
# itself is still unfitted and would fail at predict time after loading.
joblib.dump(
    best_model,
    "/home/jovyan/workdir/document_classification/artifacts/models/lr_classifier_pipeline.joblib",
)

['/home/jovyan/workdir/document_classification/artifacts/models/lr_classifier_pipeline.joblib']

In [4]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest variant of the text classifier: TF-IDF features feeding a
# RandomForestClassifier. NOTE: this rebinds the name `pipeline` used earlier
# for the logistic-regression pipeline.
pipeline = Pipeline(
    [
        (
            "vectorizer",
            TfidfVectorizer(
                max_features=100,  # Keep only the 100 highest-frequency terms
                min_df=2,  # Drop terms that appear in fewer than 2 documents
                max_df=0.75,  # Drop terms that appear in more than 75% of documents
                stop_words="english",
            ),
        ),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

# Fit directly on the full training split (no hyper-parameter search here)
pipeline.fit(train_df["text"], train_df["target_col"])

# Predictions on the training data (to gauge fit / overfitting)
y_train_pred = pipeline.predict(train_df["text"])

# Predictions on the held-out test data
y_test_pred = pipeline.predict(test_df["text"])


print("\nTraining Classification Report:")
print(classification_report(train_df["target_col"], y_train_pred))

print("\nConfusion Matrix (Train):")
print(confusion_matrix(train_df["target_col"], y_train_pred))

print("\nTest Classification Report:")
print(classification_report(test_df["target_col"], y_test_pred))

print("\nConfusion Matrix (Test):")
print(confusion_matrix(test_df["target_col"], y_test_pred))


# Pair each vocabulary term with the forest's importance score for it
vectorizer = pipeline.named_steps["vectorizer"]
classifier = pipeline.named_steps["classifier"]
feature_importance = pd.DataFrame(
    {
        "feature": vectorizer.get_feature_names_out(),
        "importance": classifier.feature_importances_,  # RandomForest uses feature_importances_
    }
)
# Show exactly ten rows so the output matches the "Top 10" heading
print("\nTop 10 Most Important Features:")
print(feature_importance.sort_values("importance", ascending=False).head(10))


Training Classification Report:
              precision    recall  f1-score   support

       cable       1.00      1.00      1.00       310
       fuses       1.00      1.00      1.00       495
    lighting       1.00      1.00      1.00       325
      others       1.00      1.00      1.00       149

    accuracy                           1.00      1279
   macro avg       1.00      1.00      1.00      1279
weighted avg       1.00      1.00      1.00      1279


Confusion Matrix (Train):
[[310   0   0   0]
 [  0 495   0   0]
 [  0   0 325   0]
 [  0   0   0 149]]

Test Classification Report:
              precision    recall  f1-score   support

       cable       0.94      1.00      0.97        64
       fuses       1.00      1.00      1.00        39
    lighting       0.98      1.00      0.99        65
      others       1.00      0.90      0.95        49

    accuracy                           0.98       217
   macro avg       0.98      0.97      0.98       217
weighted avg       

In [5]:


# Serialize the current `pipeline` object to the models artifact directory;
# the call's return value is shown as the cell output.
joblib.dump(
    value=pipeline,
    filename="/home/jovyan/workdir/document_classification/artifacts/models/random_forest_classifier_pipeline.joblib",
)

['/home/jovyan/workdir/document_classification/artifacts/models/random_forest_classifier_pipeline.joblib']

In [1]:
# Preview the first five rows of the test split. `test_df` is created by the
# data-loading cell, which must be run first in the current kernel; otherwise
# this line raises NameError.
test_df.head(n=5)

NameError: name 'test_df' is not defined