In [3]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import ssl

# Bypassing the SSL certificate for NLTK downloads
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download necessary NLTK datasets
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
file_path = 'Resume.csv'  # Replace with your file path
resume_data = pd.read_csv(file_path)

# Initialize the lemmatizer and stop words set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Data cleaning and preprocessing function
def clean_and_lemmatize_text(text):
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r"[^a-zA-Z]", ' ', text)
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

# Apply the cleaning and lemmatizing function
resume_data['processed_resume'] = resume_data['Resume_str'].apply(clean_and_lemmatize_text)

# Feature extraction
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X = tfidf_vectorizer.fit_transform(resume_data['processed_resume'])

# Label encoding
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(resume_data['Category'])

# Split the dataset and keep track of the indices
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(X, y, resume_data.index, test_size=0.2, random_state=42)

# Use the Gradient Boosting Machine model
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Train the model
gb_model.fit(X_train, y_train)

# Model evaluation
y_pred = gb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Output the results
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/leosong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/leosong/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Accuracy: 0.7142857142857143
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.97      0.93        29
           1       0.92      0.77      0.84        30
           2       0.40      0.50      0.44         8
           3       0.67      0.50      0.57        20
           4       0.40      0.67      0.50        18
           5       0.12      0.17      0.14         6
           6       0.83      0.90      0.86        21
           7       0.55      0.48      0.51        23
           8       0.00      0.00      0.00         2
           9       0.74      0.52      0.61        27
          10       0.95      0.79      0.86        24
          11       0.86      0.91      0.89        34
          12       0.68      0.65      0.67        20
          13       0.78      0.95      0.86        19
          14       0.83      0.60      0.70        25
          15       0.74      0.67      0.70        21
          16       0.57     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
# Extract key columns for the output
output_data = resume_data.loc[idx_test, ['Resume_str']]  # Replace 'Resume_str' with the actual column name for resume content
output_data['Original_Category'] = resume_data.loc[idx_test, 'Category']  # Include this line only if the original category is available
output_data['Cleaned_Resume_Text'] = resume_data.loc[idx_test, 'processed_resume']
output_data['Predicted_Category'] = predicted_categories

output_data.rename(columns={'Resume_str': 'Original_Resume_Text'}, inplace=True)

output_data.sort_values(by='Predicted_Category', inplace=True)

output_data.to_csv('clear_predicted_resume_categories.csv', index=False)
