In [12]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [13]:
data=pd.read_csv("F:/PROJECT_PIPELINE_MLOPS/SMART_CARRER_ADVISOR/data/Resume.csv")

In [14]:
data.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [15]:
# Check the original category distribution
print("Original Category Distribution:")
print(data['Category'].value_counts())
# Get the largest category size (i.e., the category with the maximum number of entries)
max_size = data['Category'].value_counts().max()
# Perform oversampling
balanced_df = data.groupby('Category').apply(lambda x: x.sample(max_size, replace=True)).reset_index(drop=True)
# Shuffle the dataset to avoid any order bias
data = balanced_df.sample(frac=1).reset_index(drop=True)
# Check the balanced category distribution
print("\nBalanced Category Distribution (After Oversampling):")
print(data['Category'].value_counts())

Original Category Distribution:
Category
Java Developer               84
Testing                      70
DevOps Engineer              55
Python Developer             48
Web Designing                45
HR                           44
Hadoop                       42
Blockchain                   40
ETL Developer                40
Operations Manager           40
Data Science                 40
Sales                        40
Mechanical Engineer          40
Arts                         36
Database                     33
Electrical Engineering       30
Health and fitness           30
PMO                          30
Business Analyst             28
DotNet Developer             28
Automation Testing           26
Network Security Engineer    25
SAP Developer                24
Civil Engineer               24
Advocate                     20
Name: count, dtype: int64

Balanced Category Distribution (After Oversampling):
Category
Blockchain                   84
Mechanical Engineer          84
Electr

In [16]:
import re
def cleanResume(txt):
    cleanText = re.sub('http\S+\s', ' ', txt)
    cleanText = re.sub('RT|cc', ' ', cleanText)
    cleanText = re.sub('#\S+\s', ' ', cleanText)
    cleanText = re.sub('@\S+', '  ', cleanText)  
    cleanText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', cleanText)
    cleanText = re.sub(r'[^\x00-\x7f]', ' ', cleanText) 
    cleanText = re.sub('\s+', ' ', cleanText)
    return cleanText
data['Resume'] = data['Resume'].apply(lambda x: cleanResume(x))

In [17]:
data.head()

Unnamed: 0,Category,Resume
0,Blockchain,SOFTWARE SKILLS Languages C C java Operating S...
1,Civil Engineer,PERSONAL SKILLS Passionate towards learning ne...
2,Web Designing,Education Details January 2016 B Sc Informatio...
3,PMO,AREA OF EXPE ISE PROFILE Around 10 plus years ...
4,Automation Testing,SKILLS Agile Methodology Scrum Kanban Extreme ...


In [18]:
import pandas as pd
import re
import nltk
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(data['Category'])
data['Category'] = le.transform(data['Category'])

tfidf = TfidfVectorizer(stop_words='english')
tfidf.fit(data['Resume'])
requiredText = tfidf.transform(data['Resume'])

X_train, X_test, y_train, y_test = train_test_split(requiredText, data['Category'], test_size=0.2, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)

model_path = 'F:\\PROJECT_PIPELINE_MLOPS\\SMART_CARRER_ADVISOR\\models\\'


with open(model_path + 'model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open(model_path + 'tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

with open(model_path + 'label_encoders.pkl', 'wb') as f:
    pickle.dump(le, f)

print("✅ Model, TF-IDF Vectorizer, and Label Encoder saved successfully!")




✅ Model, TF-IDF Vectorizer, and Label Encoder saved successfully!


In [20]:
# text english
# 8. Accuracy বের করা ✅
from sklearn.metrics import accuracy_score
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

train_acc = accuracy_score(y_train, train_preds)
test_acc = accuracy_score(y_test, test_preds)

print(f"✅ train Accuracy: {train_acc:.4f}")
print(f"✅ test Accuracy: {test_acc:.4f}")

✅ train Accuracy: 1.0000
✅ test Accuracy: 1.0000


In [23]:
from sklearn.metrics import classification_report
test_preds = model.predict(X_test)
report = classification_report(y_test, test_preds, target_names=le.classes_)
print("✅ Classification Report:\n")
print(report)

✅ Classification Report:

                           precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00        16
                     Arts       1.00      1.00      1.00        15
       Automation Testing       1.00      1.00      1.00        17
               Blockchain       1.00      1.00      1.00        20
         Business Analyst       1.00      1.00      1.00        24
           Civil Engineer       1.00      1.00      1.00        22
             Data Science       1.00      1.00      1.00         9
                 Database       1.00      1.00      1.00        15
          DevOps Engineer       1.00      1.00      1.00        16
         DotNet Developer       1.00      1.00      1.00        14
            ETL Developer       1.00      1.00      1.00        18
   Electrical Engineering       1.00      1.00      1.00        20
                       HR       1.00      1.00      1.00        18
                   Hadoop       1.0

In [24]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, requiredText, data['Category'], cv=5)
print("Cross-validation scores:", scores)
print("Average CV Score:", scores.mean())


Cross-validation scores: [1.        1.        1.        1.        0.9952381]
Average CV Score: 0.9990476190476191


In [25]:
label_mapping = dict(enumerate(le.classes_))
print("Label Encoding Mapping:")
for label, category in label_mapping.items():
    print(f"{label}: {category}")

Label Encoding Mapping:
0: Advocate
1: Arts
2: Automation Testing
3: Blockchain
4: Business Analyst
5: Civil Engineer
6: Data Science
7: Database
8: DevOps Engineer
9: DotNet Developer
10: ETL Developer
11: Electrical Engineering
12: HR
13: Hadoop
14: Health and fitness
15: Java Developer
16: Mechanical Engineer
17: Network Security Engineer
18: Operations Manager
19: PMO
20: Python Developer
21: SAP Developer
22: Sales
23: Testing
24: Web Designing
