In [26]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# Ensure you've installed imbalanced-learn: pip install imbalanced-learn

# Load the cleaned data
cleaned_dataset_path = '../data/interim/cleaned_resumes.csv'
df = pd.read_csv(cleaned_dataset_path)

# Fill any NaN values with an empty string
df['cleaned_resume'] = df['cleaned_resume'].fillna('')

# Initialize the TF-IDF vectorizer with the optimized number of features
vectorizer = TfidfVectorizer(max_features=5020, ngram_range=(1, 3))  # Use (1, 3) for tri-grams

# Fit and transform the cleaned resumes
X = vectorizer.fit_transform(df['cleaned_resume'])
y = df['Category']

# Split your data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to the training data for class balancing
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Retrain the classifier with the balanced training data
clf_smote = RandomForestClassifier(n_estimators=100, random_state=42)
clf_smote.fit(X_train_smote, y_train_smote)

# Predict on the test set
y_pred_smote = clf_smote.predict(X_test)

# Print the classification report
print('Classification report for RandomForest with SMOTE:\n', classification_report(y_test, y_pred_smote))

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), 
    param_grid=param_grid, 
    cv=3, 
    n_jobs=-1, 
    verbose=2
)

grid_search.fit(X_train_smote, y_train_smote)

print("Best parameters found: ", grid_search.best_params_)

# Use the best estimator to make predictions
y_pred_grid_search = grid_search.best_estimator_.predict(X_test)

# Print the classification report for the best estimator
print('Classification report for RandomForest after GridSearchCV:\n', classification_report(y_test, y_pred_grid_search))



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for RandomForest with SMOTE:
                         precision    recall  f1-score   support

            ACCOUNTANT       0.90      0.97      0.93        29
              ADVOCATE       0.76      0.73      0.75        30
           AGRICULTURE       0.38      0.38      0.38         8
               APPAREL       0.60      0.45      0.51        20
                  ARTS       0.22      0.11      0.15        18
            AUTOMOBILE       0.40      0.33      0.36         6
              AVIATION       0.78      0.86      0.82        21
               BANKING       0.73      0.48      0.58        23
                   BPO       0.00      0.00      0.00         2
  BUSINESS-DEVELOPMENT       0.96      0.85      0.90        27
                  CHEF       0.95      0.75      0.84        24
          CONSTRUCTION       0.94      0.91      0.93        34
            CONSULTANT       0.88      0.70      0.78        20
              DESIGNER       0.82      0.95      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
