In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Load the dataset
data = pd.read_csv('placements dataset.csv')

# Check the class distribution
print("Class distribution in job_role column:")
print(data['Job Role'].value_counts())

# Preprocess the dataset
data = data.dropna()  # Remove missing values
X = data['Skills']    # Assuming 'skills' column has the features
y = data['Job Role']  # Assuming 'job_role' column is the target variable

# Convert skills to TF-IDF features instead of one-hot encoding
vectorizer = TfidfVectorizer(max_features=500)  # Limit to 500 features to avoid overfitting
X = vectorizer.fit_transform(X).toarray()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the Random Forest model with GridSearch for hyperparameter tuning
rf = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Grid search with cross-validation
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Use the best model from grid search
model = grid_search.best_estimator_

# Train the model with the best parameters
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy * 100:.2f}%')
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Save the trained model and vectorizer
joblib.dump(model, 'random_forest_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

print("Model and vectorizer have been saved successfully.")


Class distribution in job_role column:
Job Role
Full Stack Developer            70
UI/UX Designer                  66
System Analyst                  63
Cloud Architect                 62
IT Project Manager              61
Backend Developer               58
Data Scientist                  57
Frontend Developer              57
DevOps Engineer                 56
Cybersecurity Specialist        55
Network Engineer                55
Mobile App Developer            53
Database Administrator          52
Business Analyst                50
Game Developer                  50
Quality Assurance Engineer      49
Technical Support Specialist    45
Machine Learning Engineer       41
Name: count, dtype: int64
Model Accuracy: 94.33%

Classification Report:
                              precision    recall  f1-score   support

           Backend Developer       1.00      1.00      1.00        16
            Business Analyst       0.00      0.00      0.00        17
             Cloud Architect       1.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
