In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the dataset CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(r"./dataset/temp_df.csv")

In [4]:
# Preview the first 10 rows of the loaded frame.
df.head(10)

Unnamed: 0,interest_area,coding_interest_level,preferred_domain,current_degree,field_of_study,current_status,known_languages,frameworks_known,job_type_preference,job_experience,tech_inclination,recommended_job_role
0,Creativity,Advanced,Data Science,Masters,CS,Intern,"C,C++,Python,Java,SQL,JavaScript","Scikit-learn, PyTorch, Flask",Remote,,High,Data Analyst
1,Logic,Intermediate,AI/ML,Bachelors,Non-CS,Job Seeker,C,,Internship,3+,High,ML Engineer
2,Logic,,Backend,Diploma,CS,Intern,Java,Django,Internship,3+,Medium,Backend Developer
3,Logic,Advanced,Backend,Masters,Non-CS,Trainee,JavaScript,"PyTorch, Flask, Scikit-learn",Internship,3+,High,Developer
4,Both,Intermediate,AI/ML,Diploma,Non-CS,Job Seeker,"R, JavaScript, Java","TensorFlow, React",Internship,1+,Low,Frontend Developer
5,Both,Advanced,Backend,Masters,CS,Intern,Python,"Scikit-learn, TensorFlow",Internship,3+,Medium,Backend Developer
6,Creativity,Intermediate,Cybersecurity,Bachelors,Non-CS,Job Seeker,"SQL, C, R","React, PyTorch",Internship,3+,Low,Cybersecurity Analyst
7,Both,,Frontend,Bachelors,Non-CS,Pursuing Degree,Python,"Flask, Django",Remote,,High,Intern
8,Creativity,Intermediate,DevOps,Bachelors,Non-CS,Job Seeker,"SQL, JavaScript, C++",React,Internship,3+,Low,Frontend Developer
9,Both,Advanced,DevOps,Bachelors,CS,Job Seeker,"C, SQL, JavaScript","TensorFlow, Scikit-learn, PyTorch",Freelance,,High,DevOps


In [5]:
#data cleaning
"""
The data contains different NA values.
Some of the attributes may not be necessary for the result
"""


'\nThe data contains different NA values.\nSome of the attributes may not be necessary for the result\n'

In [6]:
# Remove the 'tech_inclination' column; it is not used as a feature.
df = df.drop(columns='tech_inclination')

In [7]:
# Count the missing (NaN) values in every column.
df.isna().sum()

interest_area             0
coding_interest_level    16
preferred_domain          0
current_degree            0
field_of_study            0
current_status            0
known_languages           2
frameworks_known          6
job_type_preference       0
job_experience           41
recommended_job_role      0
dtype: int64

In [8]:
# Missing experience becomes 0 and missing coding interest becomes "Beginner";
# any row that still has a NaN in another column is then discarded.
df = df.assign(
    job_experience=df['job_experience'].fillna(0),
    coding_interest_level=df['coding_interest_level'].fillna("Beginner"),
).dropna()

In [9]:
# NaN values handled: re-count missing values per column to confirm none remain.
df.isna().sum()

interest_area            0
coding_interest_level    0
preferred_domain         0
current_degree           0
field_of_study           0
current_status           0
known_languages          0
frameworks_known         0
job_type_preference      0
job_experience           0
recommended_job_role     0
dtype: int64

In [10]:
#Values of different attributes
# Each entry pairs the exact text printed before the values with the column to inspect.
attribute_prefixes = [
    ("interest_area :  ", 'interest_area'),
    ("coding_interest_level : ", 'coding_interest_level'),
    ("preferred_domain : ", 'preferred_domain'),
    ("current_degree : ", 'current_degree'),
    ("current_status : ", 'current_status'),
    ("job_type_preference : ", 'job_type_preference'),
    ("job_experience :", 'job_experience'),
]
report = "".join(
    f"{prefix}{df[column].unique()}\n\n" for prefix, column in attribute_prefixes
)
print(report)

interest_area :  ['Creativity' 'Logic' 'Both']

coding_interest_level : ['Advanced' 'Beginner' 'Intermediate']

preferred_domain : ['Data Science' 'Backend' 'AI/ML' 'Cybersecurity' 'Frontend' 'DevOps'
 'Full Stack']

current_degree : ['Masters' 'Diploma' 'Bachelors']

current_status : ['Intern' 'Trainee' 'Job Seeker' 'Pursuing Degree' 'Working Professional']

job_type_preference : ['Remote' 'Internship' 'Freelance' 'Full-time Job' 'Onsite']

job_experience :[0 '3+' '1+']




In [11]:
#Label Encoding attributes using sklearn's LabelEncoder class
# Every column in `label_columns` is overwritten in place with integer codes.
# The codes are plain identifiers, not a ranking: for coding_interest_level the
# printed values show Advanced -> 0, Beginner -> 1, Intermediate -> 2.
from sklearn.preprocessing import LabelEncoder

encoders = {}  # Store encoders for inverse_transform or future transform

# Categorical feature columns to encode (the target column is not in this list)
label_columns = [
    'interest_area',
    'coding_interest_level',
    'preferred_domain',
    'current_degree',
    'current_status',
    'job_type_preference',
    'field_of_study'
]

for col in label_columns:
    # A fresh encoder per column, so each column keeps its own class list
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

    # Store encoder object for each attribute
    encoders[col] = le

# NOTE: after the loop `le` is still bound to the last encoder created, i.e. the
# very object stored as encoders['field_of_study']. Refitting `le` anywhere else
# would silently change that stored encoder as well.


In [12]:
def convert_to_int(x):
    """Convert a job-experience value to an integer.

    Parameters:
        x: Either an integer (Python or numpy) or a string such as "3+" or
           "10+" whose leading digits carry the value.

    Returns:
        int or None: the integer itself, the leading run of digits of the
        string parsed as an int, or None when no number can be extracted
        (empty string, non-digit start, or any other type).
    """
    if isinstance(x, int):
        return x
    if isinstance(x, np.integer):
        # numpy integers are not instances of int; normalise them to plain int
        return int(x)
    if isinstance(x, str):
        # Collect the whole leading number, not just the first character,
        # so that "10+" yields 10 rather than 1.
        digits = ""
        for ch in x.strip():
            if not ch.isdecimal():
                break
            digits += ch
        return int(digits) if digits else None
    return None

#convert string attribute to int
# The function is passed directly; no wrapping lambda is needed.
df['job_experience'] = df['job_experience'].apply(convert_to_int)

In [13]:
#Values of different attributes after Label Encoding
# Each entry pairs the exact text printed before the values with the column to inspect.
attribute_prefixes = [
    ("interest_area :  ", 'interest_area'),
    ("coding_interest_level : ", 'coding_interest_level'),
    ("preferred_domain : ", 'preferred_domain'),
    ("current_degree : ", 'current_degree'),
    ("current_status : ", 'current_status'),
    ("job_type_preference : ", 'job_type_preference'),
    ("job_experience :", 'job_experience'),
]
report = "".join(
    f"{prefix}{df[column].unique()}\n\n" for prefix, column in attribute_prefixes
)
print(report)

interest_area :  [1 2 0]

coding_interest_level : [0 1 2]

preferred_domain : [3 1 0 2 5 4 6]

current_degree : [2 1 0]

current_status : [0 3 1 2 4]

job_type_preference : [4 2 0 1 3]

job_experience :[0 3 1]




In [14]:
def to_array_from_string(x):
    """Split a comma-separated string into a set of lower-cased, trimmed items.

    Empty or whitespace-only pieces are ignored. Despite the name, the result
    is a set, so duplicates collapse and order is not kept.
    """
    pieces = (part.strip() for part in x.split(','))
    return {piece.lower() for piece in pieces if piece}

#extracts elements from the string and stores as a set
for set_col in ('known_languages', 'frameworks_known'):
    df[set_col] = df[set_col].apply(to_array_from_string)

In [15]:
# Show the frame after encoding; the two skill columns now hold sets of names.
df

Unnamed: 0,interest_area,coding_interest_level,preferred_domain,current_degree,field_of_study,current_status,known_languages,frameworks_known,job_type_preference,job_experience,recommended_job_role
0,1,0,3,2,0,0,"{javascript, java, sql, python, c++, c}","{flask, pytorch, scikit-learn}",4,0,Data Analyst
2,2,1,1,1,0,0,{java},{django},2,3,Backend Developer
3,2,0,1,2,1,3,{javascript},"{flask, pytorch, scikit-learn}",2,3,Developer
4,0,2,0,1,1,1,"{javascript, java, r}","{react, tensorflow}",2,1,Frontend Developer
5,0,0,1,2,0,0,{python},"{tensorflow, scikit-learn}",2,3,Backend Developer
...,...,...,...,...,...,...,...,...,...,...,...
94,2,1,2,2,1,0,"{java, c, r}",{scikit-learn},2,3,ML Engineer
95,2,1,4,2,0,4,"{javascript, c++, sql, r}","{flask, node}",4,3,DevOps
96,2,0,5,1,0,2,{java},"{react, flask}",3,3,Data Analyst
97,1,2,0,1,1,2,{c},{flask},3,0,Cybersecurity Analyst


In [16]:
#attributes for known language
language_columns = ['python', 'c', 'c++', 'sql', 'java', 'javascript', 'r']

#attributes for frameworks
framework_columns = ['react', 'node', 'flask', 'django',
                     'tensorflow', 'pytorch', 'scikit-learn']

# Create every indicator column with a 0 default, languages first, then frameworks
for indicator in language_columns + framework_columns:
    df[indicator] = 0

In [17]:
def _membership_flags(set_series, token):
    """Return a 0/1 Series telling whether `token` is in each row's set."""
    return set_series.apply(lambda members: int(token in members))


#one hot encoding for all the languages known (manually)
all_languages = {"python", "javascript", "java", "c++", "c", "r", "sql", "none"}
for lang in all_languages:
    df[lang] = _membership_flags(df['known_languages'], lang)


#one hot encoding for all the frameworks known(manually)
all_frameworks = {"react" , "node" , "flask" , "django" , "tensorflow" , "pytorch" , "scikit-learn"}
for framework in all_frameworks:
    df[framework] = _membership_flags(df['frameworks_known'], framework)

In [18]:
# Show the frame again, now with one 0/1 column per language and framework.
df

Unnamed: 0,interest_area,coding_interest_level,preferred_domain,current_degree,field_of_study,current_status,known_languages,frameworks_known,job_type_preference,job_experience,...,javascript,r,react,node,flask,django,tensorflow,pytorch,scikit-learn,none
0,1,0,3,2,0,0,"{javascript, java, sql, python, c++, c}","{flask, pytorch, scikit-learn}",4,0,...,1,0,0,0,1,0,0,1,1,0
2,2,1,1,1,0,0,{java},{django},2,3,...,0,0,0,0,0,1,0,0,0,0
3,2,0,1,2,1,3,{javascript},"{flask, pytorch, scikit-learn}",2,3,...,1,0,0,0,1,0,0,1,1,0
4,0,2,0,1,1,1,"{javascript, java, r}","{react, tensorflow}",2,1,...,1,1,1,0,0,0,1,0,0,0
5,0,0,1,2,0,0,{python},"{tensorflow, scikit-learn}",2,3,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,2,1,2,2,1,0,"{java, c, r}",{scikit-learn},2,3,...,0,1,0,0,0,0,0,0,1,0
95,2,1,4,2,0,4,"{javascript, c++, sql, r}","{flask, node}",4,3,...,1,1,0,1,1,0,0,0,0,0
96,2,0,5,1,0,2,{java},"{react, flask}",3,3,...,0,0,1,0,1,0,0,0,0,0
97,1,2,0,1,1,2,{c},{flask},3,0,...,0,0,0,0,1,0,0,0,0,0


In [19]:
#reduction of attributes for precise output
#combining similar attributes to one

# Each combined flag is 1 when at least one of its source flags is 1:
# the flags are summed and the sum is capped at 1.
df['python_r'] = (df['python'] + df['r']).clip(upper=1)
df['java_cpp'] = (df['java'] + df['c++']).clip(upper=1)
df['tf_pt_sk'] = (df['tensorflow'] + df['pytorch'] + df['scikit-learn']).clip(upper=1)
df['django_node'] = (df['django'] + df['node']).clip(upper=1)

In [20]:
# Lower-case and trim the role names so spelling variants collapse into one label
df['recommended_job_role'] = df['recommended_job_role'].str.lower().str.strip()
df['recommended_job_role'].unique()

array(['data analyst', 'backend developer', 'developer',
       'frontend developer', 'cybersecurity analyst', 'intern', 'devops',
       'ml engineer', 'ml intern', 'frontend intern', 'software engineer',
       'developer intern', 'backend intern', 'full stack intern'],
      dtype=object)

In [21]:
# List all columns, including the one-hot and combined ones added above.
df.columns

Index(['interest_area', 'coding_interest_level', 'preferred_domain',
       'current_degree', 'field_of_study', 'current_status', 'known_languages',
       'frameworks_known', 'job_type_preference', 'job_experience',
       'recommended_job_role', 'python', 'c', 'c++', 'sql', 'java',
       'javascript', 'r', 'react', 'node', 'flask', 'django', 'tensorflow',
       'pytorch', 'scikit-learn', 'none', 'python_r', 'java_cpp', 'tf_pt_sk',
       'django_node'],
      dtype='object')

In [22]:
# Columns kept out of the feature matrix: the raw set-valued skill columns, the
# one-hot columns that were folded into combined features, the helper 'none'
# column, and the target itself.
drop_columns = ['known_languages',
       'frameworks_known', 'python','c++', 'java',
        'r', 'django', 'node', 'tensorflow',
       'pytorch', 'scikit-learn', 'none',
       'recommended_job_role']


X = df.drop(drop_columns, axis = 1)

# Encode the target with its own encoder. The leftover loop variable `le` is the
# same object as encoders['field_of_study'], so fitting it on the job roles
# would overwrite that feature encoder before it is saved.
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(df['recommended_job_role'])

# class index -> role name, for post-processing output. enumerate() yields
# plain int keys, so the mapping can be written to JSON as is.
target_mapping = {
    int(class_index): str(role)
    for class_index, role in enumerate(target_encoder.classes_)
}


In [23]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows for testing. random_state pins the shuffle so the
# split, and every score computed from it, is the same on each run.
x_train , x_test , y_train , y_test = train_test_split(X , y , test_size = 0.1 , random_state = 42)

In [24]:
# Feature columns, in the order the model is trained on.
x_train.columns

Index(['interest_area', 'coding_interest_level', 'preferred_domain',
       'current_degree', 'field_of_study', 'current_status',
       'job_type_preference', 'job_experience', 'c', 'sql', 'javascript',
       'react', 'flask', 'python_r', 'java_cpp', 'tf_pt_sk', 'django_node'],
      dtype='object')

In [25]:
#GridSearchCV to find the best model and parameters
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')


# Seed for every estimator that uses randomness, so the cross-validation
# scores are repeatable from run to run.
GRID_SEED = 42

# (display name, estimator, hyper-parameter grid) for each candidate model
models = [
    ("LogisticRegression", LogisticRegression(random_state=GRID_SEED), {
        "C": [0.01, 0.1, 1],
        "solver": ["liblinear"]
    }),
    ("RandomForest", RandomForestClassifier(random_state=GRID_SEED), {
        "n_estimators": [100, 200],
        "max_depth": [5, 10]
    }),
    ("SVM", SVC(), {
        "C": [1, 10],
        "kernel": ["rbf", "linear"]
    }),
    ("DecisionTree", DecisionTreeClassifier(random_state=GRID_SEED), {
        "criterion": ["gini", "entropy"],
        "max_depth": [5, 10, 15]
    }),
    ("NaiveBayes", GaussianNB(), {
        # No hyperparameters for basic NB
    }),
    ("KNN", KNeighborsClassifier(), {
        "n_neighbors": [3, 5, 7]
    }),
    ("GradientBoosting", GradientBoostingClassifier(random_state=GRID_SEED), {
        "n_estimators": [100, 200],
        "learning_rate": [0.01, 0.1]
    }),
    ("AdaBoost", AdaBoostClassifier(random_state=GRID_SEED), {
        "n_estimators": [50, 100],
        "learning_rate": [0.01, 0.1, 1]
    })
]

results = []

# The loop variable is called `estimator` so the search does not overwrite
# the notebook-level name `model`.
for name, estimator, params in models:
    # 5-fold cross-validated search over the grid, using all CPU cores
    grid = GridSearchCV(estimator, params, cv=5, n_jobs=-1)
    grid.fit(x_train, y_train)
    results.append({
        "model": name,
        "best_score": grid.best_score_,
        "best_params": grid.best_params_
    })

# One row per model: best mean CV score and the parameters that achieved it
df_results = pd.DataFrame(results)
print(df_results)


                model  best_score                                  best_params
0  LogisticRegression    0.196324              {'C': 1, 'solver': 'liblinear'}
1        RandomForest    0.243382        {'max_depth': 5, 'n_estimators': 100}
2                 SVM    0.231618                   {'C': 10, 'kernel': 'rbf'}
3        DecisionTree    0.388235    {'criterion': 'entropy', 'max_depth': 10}
4          NaiveBayes    0.158088                                           {}
5                 KNN    0.205882                           {'n_neighbors': 7}
6    GradientBoosting    0.254412  {'learning_rate': 0.1, 'n_estimators': 100}
7            AdaBoost    0.290441  {'learning_rate': 0.01, 'n_estimators': 50}


In [69]:
#selecting the best model for training
# NOTE(review): in the recorded grid-search output DecisionTree had the highest
# CV score and AdaBoost's best n_estimators was 50; the settings below match
# neither -- confirm this choice is intentional.
# random_state makes the fitted model and its test score repeatable.
model = AdaBoostClassifier(learning_rate = 0.01, n_estimators= 100, random_state = 42)
model.fit(x_train , y_train )
model.score(x_test , y_test)

0.5

In [27]:
# Inspect one held-out sample: the row at position 9 of x_test.
x_test.iloc[9]

interest_area            1
coding_interest_level    2
preferred_domain         6
current_degree           1
field_of_study           0
current_status           1
job_type_preference      2
job_experience           0
c                        0
sql                      1
javascript               0
react                    0
flask                    1
python_r                 1
java_cpp                 0
tf_pt_sk                 1
django_node              0
Name: 16, dtype: int64

In [77]:
#saving model
import pickle
from pathlib import Path

# load_model() reads model.pkl and encoders.pkl from the 'model' folder, so the
# artifacts are written there; the folder is created when it does not exist.
MODEL_DIR = Path('model')
MODEL_DIR.mkdir(parents=True, exist_ok=True)

with open(MODEL_DIR / 'model.pkl', 'wb') as f:
    pickle.dump(model, f)

#saving the labelencoder objects for each attributes used
with open(MODEL_DIR / 'encoders.pkl', 'wb') as f:
    pickle.dump(encoders, f)

#saving the dictionary for post-processing
# JSON turns the integer keys into strings; convert them back after loading.
import json
with open('target_mapping.json' , 'w') as f:
    json.dump(target_mapping, f, indent=4)

In [89]:
def load_model(model_dir='model'):
    """Load the pickled classifier and the feature label encoders.

    Parameters:
        model_dir (str or path-like): Folder containing 'model.pkl' and
            'encoders.pkl'. Defaults to 'model', the location that used to be
            hard-coded, so existing calls without arguments behave the same.

    Returns:
        tuple: (model, encoders) exactly as they were pickled.

    Raises:
        FileNotFoundError: if either file is missing from `model_dir`.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    base = Path(model_dir)
    # SECURITY: unpickling can run arbitrary code. Only load files that were
    # produced by this notebook or come from a trusted source.
    with open(base / 'model.pkl', 'rb') as f:
        model = pickle.load(f)
    with open(base / 'encoders.pkl', 'rb') as f:
        encoders = pickle.load(f)
    return model , encoders

In [91]:
def predict_job(input_data):
    """
    Predicts the job role for a single input sample.

    The classifier is loaded from disk with load_model() on every call, and the
    predicted class index is translated into a role name through the
    notebook-level `target_mapping` dictionary.

    Parameters:
        input_data (list or np.array): Already-encoded feature values for one
            sample, in the same column order that was used for training.

    Returns:
        str: Predicted job role name.

    Raises:
        ValueError: if the number of values differs from the number of
            features the loaded model reports.
    """
    # The encoders are not needed here because the input is already encoded
    model , _ = load_model()
    input_array = np.array(input_data).reshape(1, -1)

    # Fail early with a clear message when the sample has the wrong length
    expected_features = getattr(model, 'n_features_in_', None)
    if expected_features is not None and input_array.shape[1] != expected_features:
        raise ValueError(
            f"Expected {expected_features} feature values, got {input_array.shape[1]}"
        )

    prediction = model.predict(input_array)
    # Cast the numpy integer to a plain int to match the keys of target_mapping
    return target_mapping[int(prediction[0])]


In [93]:
# Example prediction. The 17 values follow the order of x_train.columns:
# interest_area, coding_interest_level, preferred_domain, current_degree,
# field_of_study, current_status, job_type_preference, job_experience, c, sql,
# javascript, react, flask, python_r, java_cpp, tf_pt_sk, django_node
predict_job([0, 2, 3, 0, 0, 2, 2, 3, 0, 0, 0, 1, 1, 1, 1, 1, 0])

'data analyst'