In [1]:
pip install flask scikit-learn xgboost pandas numpy joblib


Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pickle
from flask import Flask, request, jsonify

# Location of the cleaned resume dataset -- adjust this path for your machine.
DATA_PATH = r"C:\Users\anish\OneDrive\Desktop\D_S_PROJECT\cleaned_data.csv"

# Read the CSV into a DataFrame and display it as the cell result.
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,Categories,Resume_Details
0,0,anubhav kumar singh core competency scripting ...
1,0,ananda rayudu profile summary year experience ...
2,0,peoplesoft database administrator gangareddy p...
3,0,classification internal classification interna...
4,0,priyanka ramadoss mountpleasant coonoor nilgir...
...,...,...
74,3,workday integration consultant name sri krishn...
75,3,srikanth workday hcm consultant seeking suitab...
76,3,workday hcm fcm name kumar role workday consul...
77,3,venkateswarlu workday consultant professional ...


In [3]:
# List every column label present in the loaded frame.
column_labels = df.columns
print(column_labels)


Index(['Categories', 'Resume_Details'], dtype='object')


In [4]:

# Features: TF-IDF weights of the 'Resume_Details' text.
# Target: the 'Categories' column.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so its vocabulary and IDF weights also reflect the held-out resumes.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
resume_text, y = df['Resume_Details'], df['Categories']
X = tfidf.fit_transform(resume_text)

# Show the sparse feature matrix, then the label column.
for fitted_part in (X, y):
    print(fitted_part)

  (0, 182)	0.03136723387887679
  (0, 1896)	0.04261079090134825
  (0, 3191)	0.057309581492190065
  (0, 777)	0.026502338164750713
  (0, 655)	0.050475033080870806
  (0, 3080)	0.2271376488639186
  (0, 3150)	0.2271376488639186
  (0, 193)	0.22377127428567403
  (0, 2499)	0.3257953686482166
  (0, 1543)	0.07218894675237507
  (0, 1326)	0.039925934231947
  (0, 2362)	0.022000536212650496
  (0, 1967)	0.16099309120513047
  (0, 3827)	0.05123911626579028
  (0, 3804)	0.09571519762487549
  (0, 187)	0.06667377315213993
  (0, 3123)	0.3248502603856879
  (0, 3805)	0.14913776815471888
  (0, 3622)	0.14420395064457384
  (0, 2812)	0.04803567716691178
  (0, 2378)	0.11576677631483896
  (0, 2915)	0.04597323112877059
  (0, 1441)	0.06895984669315589
  (0, 3842)	0.009196297614892966
  (0, 1449)	0.028654790746095032
  :	:
  (78, 3845)	0.3722901561015264
  (78, 2405)	0.0292192126748219
  (78, 710)	0.1491454914460804
  (78, 1111)	0.20046393020851425
  (78, 3862)	0.0292192126748219
  (78, 669)	0.03730216571418394
  (78, 

In [5]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
holdout_split = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = holdout_split

# Display the four pieces as a tuple.
(X_train, X_test, y_train, y_test)

(<63x3893 sparse matrix of type '<class 'numpy.float64'>'
 	with 14432 stored elements in Compressed Sparse Row format>,
 <16x3893 sparse matrix of type '<class 'numpy.float64'>'
 	with 3896 stored elements in Compressed Sparse Row format>,
 73    3
 61    3
 55    2
 40    1
 9     0
      ..
 20    1
 60    3
 71    3
 14    0
 51    2
 Name: Categories, Length: 63, dtype: int64,
 30    1
 0     0
 22    1
 31    1
 18    0
 28    1
 10    0
 70    3
 4     0
 12    0
 49    2
 33    1
 67    3
 35    1
 68    3
 45    2
 Name: Categories, dtype: int64)

In [6]:
# Shapes of the train/test features and labels, in that order.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((63, 3893), (16, 3893), (63,), (16,))

In [7]:
import joblib

# Persist the TF-IDF vectorizer used to build the feature matrix.
# Save `tfidf` -- the instance that fit_transform() was called on -- rather
# than a newly constructed TfidfVectorizer: a fresh instance has no learned
# vocabulary and cannot transform text when it is loaded later.
vectorizer = tfidf

# Guard against running this cell before the vectorizer has been fitted.
assert hasattr(vectorizer, "vocabulary_"), "Fit the TF-IDF vectorizer before saving it"

# Write the vectorizer once (the file only needs a single dump).
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("Vectorizer saved successfully!")

Vectorizer saved successfully!


['tfidf_vectorizer.pkl']

In [8]:
# Hyper-parameter search over three tree-based classifiers.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Search space for the decision tree.
param_grid_dt = dict(
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Search space for the random forest.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Search space for gradient boosting.
param_grid_gb = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 10],
)

# Base estimators, all seeded identically.
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)
gb = GradientBoostingClassifier(random_state=42)

# Every search shares the same cross-validation settings:
# 3 folds, all available cores, progress lines enabled.
search_options = dict(cv=3, n_jobs=-1, verbose=1)
grid_dt = GridSearchCV(dt, param_grid_dt, **search_options)
grid_rf = GridSearchCV(rf, param_grid_rf, **search_options)
grid_gb = GridSearchCV(gb, param_grid_gb, **search_options)

# Run the searches on the training split, in the order tree -> forest -> boosting.
for search in (grid_dt, grid_rf, grid_gb):
    search.fit(X_train, y_train)

# Keep the refitted winner of each search.
best_dt, best_rf, best_gb = (
    search.best_estimator_ for search in (grid_dt, grid_rf, grid_gb)
)

# Write each winning model to disk; the last dump is left as the cell's
# final expression so its return value is displayed.
for fitted_model, target_file in (
    (best_dt, "best_decision_tree.pkl"),
    (best_rf, "best_random_forest.pkl"),
):
    joblib.dump(fitted_model, target_file)
joblib.dump(best_gb, "best_gradient_boosting.pkl")

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Fitting 3 folds for each of 27 candidates, totalling 81 fits


['best_gradient_boosting.pkl']

In [9]:
# Persist the tuned GradientBoosting model with pickle.
with open("gradient_boosting.pkl", "wb") as model_file:
    pickle.dump(grid_gb.best_estimator_, model_file)

# Persist the vectorizer. Pickle `tfidf`, the instance that fit_transform()
# was called on, so the saved object carries the learned vocabulary and can
# transform new text after it is loaded.
with open("tfidf.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

print("✅ Models and vectorizer saved successfully!")

✅ Models and vectorizer saved successfully!


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with watchdog (windowsapi)


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
