In [124]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [125]:
pip install rapidfuzz



In [126]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [127]:
import pandas as pd

# Load merged dataset
combined_data = pd.read_csv('/content/drive/MyDrive/diploma/merged_data.csv')

In [128]:
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37501 entries, 0 to 37500
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   projectDomain       37476 non-null  object 
 1   currentLocation     37500 non-null  object 
 2   mainSpecialization  25145 non-null  object 
 3   englishProficiency  37476 non-null  object 
 4   companySizeUA       37476 non-null  object 
 5   age                 37499 non-null  float64
 6   gender              37500 non-null  object 
 7   salary              37444 non-null  float64
 8   companyMainArea     37476 non-null  object 
 9   experience          37501 non-null  int64  
 10  mainPosition        37476 non-null  object 
 11  currentRegion       34777 non-null  object 
 12  employmentType      37500 non-null  object 
 13  educationLevel      37476 non-null  object 
 14  jobTitle            37476 non-null  object 
dtypes: float64(2), int64(1), object(12)
memory usage: 4.3

#Preprocessing

In [129]:
# Removes salary outliers within each group using an adjusted interquartile range (IQR) filter to clean the data
def remove_salary_outliers(data, salary_col, group_col):
    cleaned_data = []
    for group, subset in data.groupby(group_col):
        Q1 = subset[salary_col].quantile(0.25)
        Q3 = subset[salary_col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 0.8 * IQR
        upper_bound = Q3 + 0.8 * IQR
        cleaned_subset = subset[(subset[salary_col] >= lower_bound) & (subset[salary_col] <= upper_bound)]
        cleaned_data.append(cleaned_subset)
    return pd.concat(cleaned_data)


In [130]:
categorical_columns = ['projectDomain', 'currentLocation', 'mainSpecialization', 'englishProficiency',
                       'companySizeUA', 'gender', 'companyMainArea', 'experience', 'mainPosition',
                       'currentRegion', 'employmentType', 'educationLevel', 'jobTitle']

##projectDomain

In [131]:
combined_data['projectDomain'].nunique()

2674

In [132]:
# Replace 'Other' value in the 'projectDomain' column
combined_data['projectDomain'] = combined_data['projectDomain'].replace('Інша', 'Other')


In [133]:
domains = combined_data['projectDomain'].value_counts()
domains

Unnamed: 0_level_0,count
projectDomain,Unnamed: 1_level_1
Other,3991
E-commerce,3427
Fintech / Banking / Capital Management,3208
Medtech / Healthcare,1768
GameDev,1653
...,...
"Edtech / Education, Legal, Medtech / Healthcare",1
"Blockchain, Public services / Government",1
"Machine Learning, Mobile, Інша",1
"Edtech / Education, Insurance, Internet of Things",1


In [134]:
 combined_data['projectDomain'].value_counts().head(25)

Unnamed: 0_level_0,count
projectDomain,Unnamed: 1_level_1
Other,3991
E-commerce,3427
Fintech / Banking / Capital Management,3208
Medtech / Healthcare,1768
GameDev,1653
Gambling,1317
Edtech / Education,964
Telecom,678
Media,617
Logistics,605


In [135]:
#Filters out rows with missing project domain values and extracts the list of unique project domains

combined_data = combined_data.dropna(subset=['projectDomain'])
unique_domains = combined_data['projectDomain'].unique()

In [136]:
print(combined_data.isnull().sum())

projectDomain             0
currentLocation           0
mainSpecialization    12331
englishProficiency        0
companySizeUA             0
age                       1
gender                    0
salary                   32
companyMainArea           0
experience                0
mainPosition              0
currentRegion          2699
employmentType            0
educationLevel            0
jobTitle                  0
dtype: int64


In [137]:
# groups similar project domains based on string similarity using the rapidfuzz library
import pandas as pd
from rapidfuzz import process, fuzz

# Get unique domain names
unique_domains = combined_data['projectDomain'].dropna().unique()

# Set the similarity threshold
similarity_threshold = 40

# Create a dictionary to map each domain to its group
domain_groups = {}

# Dictionary for grouping
domain_mapping = {}

# Iterate through each domain and find the most similar ones
for domain in unique_domains:
    if domain_groups:  # Ensure there is something to compare with
        match = process.extractOne(domain, domain_groups.keys(), scorer=fuzz.ratio)
    else:
        match = None  # If there are no groups yet, leave it as None

    if match is not None and match[1] >= similarity_threshold:
        domain_groups[match[0]].append(domain)
        domain_mapping[domain] = match[0]  # Assign the group
    else:
        domain_groups[domain] = [domain]
        domain_mapping[domain] = domain  # Assign as a new group independently


combined_data['projectDomain'] = combined_data['projectDomain'].map(domain_mapping)

category_counts = combined_data['projectDomain'].value_counts()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_data['projectDomain'] = combined_data['projectDomain'].map(domain_mapping)


In [138]:
# create a list of categories with less than 200 values
small_categories = category_counts[category_counts < 200].index.tolist()
small_categories

['Legal, Travel', 'AR/VR']

In [139]:
# Replace these categories with "Other"
combined_data['projectDomain'] = combined_data['projectDomain'].apply(
    lambda x: 'Other' if x in small_categories else x
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_data['projectDomain'] = combined_data['projectDomain'].apply(


In [140]:
category_counts = combined_data['projectDomain'].value_counts()

In [141]:
category_counts

Unnamed: 0_level_0,count
projectDomain,Unnamed: 1_level_1
Fintech / Banking / Capital Management,5354
E-commerce,5194
Medtech / Healthcare,4620
Other,4164
"Big Data, Data Science, Gambling",2421
GameDev,2215
"Blockchain, Telecom",1531
"Internet of Things, Security",1418
"Geospatial, Mobile, Інша",1304
Retail,972


In [142]:
# Removal of anomalies
combined_data = remove_salary_outliers(combined_data, 'salary', 'projectDomain')

In [143]:
combined_data.columns

Index(['projectDomain', 'currentLocation', 'mainSpecialization',
       'englishProficiency', 'companySizeUA', 'age', 'gender', 'salary',
       'companyMainArea', 'experience', 'mainPosition', 'currentRegion',
       'employmentType', 'educationLevel', 'jobTitle'],
      dtype='object')

##mainSpecialization

In [144]:
print(combined_data.isnull().sum())

projectDomain             0
currentLocation           0
mainSpecialization    11996
englishProficiency        0
companySizeUA             0
age                       1
gender                    0
salary                    0
companyMainArea           0
experience                0
mainPosition              0
currentRegion          2387
employmentType            0
educationLevel            0
jobTitle                  0
dtype: int64


In [145]:
unique_specializations = combined_data['mainSpecialization'].unique()

print(unique_specializations)

['Full Stack розробка' nan 'Front-end  розробка' 'Back-end  розробка'
 'Робота з даними, аналіз даних' 'DevOps' 'Mobile  розробка' 'QA' 'Інше'
 'BI-розробка' 'Front-end розробка' 'Back-end розробка' 'Mobile розробка'
 'Robotics/hardware ' 'ML' 'розробка на платформі BAF, 1C' 'embedded'
 'Automotive' 'Embedded' 'С++' 'Atlassian Administrator / JIRA Admin'
 'Firmware ' 'Embedded / Automotive' 'middleware' 'Algorithm Engineer'
 'product manager' 'Support' 'Desktop' 'Gamedev' 'ProdOps, SRE'
 'Навчання та інтеграція моделей ШІ' 'ML/DL/MLOps' 'Системний аналіз'
 'Tech/integration support' 'MLOps' 'Hardware' 'Security Specialist'
 'Integration' 'Unreal Engine' 'AI/ML research' 'Data Science' 'AI'
 'Security' 'Embedded engineer' 'SysAdmin' 'Support L1' 'network engineer'
 'sysadmin' 'я просто ВА якому прийшлось трохи розбиратись з кодом' 'PM'
 'Low-code розробка' 'Azure Cloud Engineering'
 'Розробка потрібного продукту. Часом фронт, часом бек'
 'SysOps \\ SysAdmin, розробка інтеграцій' 'AI / B

In [146]:
len(unique_specializations)

185

In [147]:
# Normalize the values ​​in the 'mainSpecialization' column
combined_data['mainSpecialization'] = combined_data['mainSpecialization'].str.strip().str.lower()

# Combining similar values
specialization_mapping = {
    'back-end розробка': 'Back-end розробка',
    'back-end  розробка': 'Back-end розробка',
    'front-end розробка': 'Front-end розробка',
    'front-end  розробка': 'Front-end розробка',
    'mobile розробка': 'Mobile розробка',
    'mobile  розробка': 'Mobile розробка',
    'embedded': 'Embedded',
    'embedded ': 'Embedded',
    'embedded  ': 'Embedded',
    'інше':'Other'
}
combined_data['mainSpecialization'] = combined_data['mainSpecialization'].replace(specialization_mapping)

combined_data['mainSpecialization'].unique()


array(['full stack розробка', nan, 'Front-end розробка',
       'Back-end розробка', 'робота з даними, аналіз даних', 'devops',
       'Mobile розробка', 'qa', 'Other', 'bi-розробка',
       'robotics/hardware', 'ml', 'розробка на платформі baf, 1c',
       'Embedded', 'automotive', 'с++',
       'atlassian administrator / jira admin', 'firmware',
       'embedded / automotive', 'middleware', 'algorithm engineer',
       'product manager', 'support', 'desktop', 'gamedev', 'prodops, sre',
       'навчання та інтеграція моделей ші', 'ml/dl/mlops',
       'системний аналіз', 'tech/integration support', 'mlops',
       'hardware', 'security specialist', 'integration', 'unreal engine',
       'ai/ml research', 'data science', 'ai', 'security',
       'embedded engineer', 'sysadmin', 'support l1', 'network engineer',
       'я просто ва якому прийшлось трохи розбиратись з кодом', 'pm',
       'low-code розробка', 'azure cloud engineering',
       'розробка потрібного продукту. часом фронт, ч

In [148]:
#get the number of values ​​for each specialization and sort in descending order
specialization_counts = combined_data['mainSpecialization'].value_counts().sort_values(ascending=False)

specialization_counts


Unnamed: 0_level_0,count
mainSpecialization,Unnamed: 1_level_1
Back-end розробка,6803
Front-end розробка,4059
full stack розробка,3530
qa,3247
Mobile розробка,1677
...,...
industrial automation,1
"semiconductors, ate equipment",1
senior product manager,1
розробляю тестові фреймворки,1


In [149]:
specialization_counts.head(25)

Unnamed: 0_level_0,count
mainSpecialization,Unnamed: 1_level_1
Back-end розробка,6803
Front-end розробка,4059
full stack розробка,3530
qa,3247
Mobile розробка,1677
"робота з даними, аналіз даних",1497
devops,1105
Other,848
bi-розробка,153
Embedded,36


In [150]:
# updated_specialization_counts

In [151]:
# specialization names mapping
specialization_mapping_to_english = {
    'Back-end розробка': 'Back-end Development',
    'Front-end розробка': 'Front-end Development',
    'full stack розробка': 'Full Stack Development',
    'qa': 'QA',
    'Mobile розробка': 'Mobile Development',
    'робота з даними, аналіз даних': 'Data Engineering & Analytics',
    'devops': 'DevOps',
    'Other': 'Other'
}

combined_data['mainSpecialization'] = combined_data['mainSpecialization'].replace(specialization_mapping_to_english)

updated_specialization_counts = combined_data['mainSpecialization'].value_counts()
print(updated_specialization_counts)

mainSpecialization
Back-end Development             6803
Front-end Development            4059
Full Stack Development           3530
QA                               3247
Mobile Development               1677
                                 ... 
industrial automation               1
semiconductors, ate equipment       1
senior product manager              1
розробляю тестові фреймворки        1
написання скриптів                  1
Name: count, Length: 164, dtype: int64


In [152]:
updated_specialization_counts

Unnamed: 0_level_0,count
mainSpecialization,Unnamed: 1_level_1
Back-end Development,6803
Front-end Development,4059
Full Stack Development,3530
QA,3247
Mobile Development,1677
...,...
industrial automation,1
"semiconductors, ate equipment",1
senior product manager,1
розробляю тестові фреймворки,1


In [153]:
combined_data = remove_salary_outliers(combined_data, 'salary', 'mainSpecialization')

In [154]:
combined_data['mainSpecialization'].value_counts()

Unnamed: 0_level_0,count
mainSpecialization,Unnamed: 1_level_1
Back-end Development,6780
Front-end Development,3975
Full Stack Development,3500
QA,3097
Mobile Development,1656
...,...
розробляю тестові фреймворки,1
різні проекти та завдання,1
с++,1
системний аналіз,1


##mainPosition

In [155]:
 print(combined_data['mainPosition'].value_counts())

mainPosition
Software Engineer                                                                              10719
Software Engineer / Programmer                                                                  5446
QA / AQA Engineer (Junior, Middle, Senior, Team/Tech Lead, Manager)                             2153
QA / AQA / QC Engineer (Junior, Middle, Senior, Team/Tech Lead, Manager)                         999
Data Science, Machine Learning, AI, Big Data, Data Engineer                                      788
DevOps, SRE                                                                                      626
Analyst (Business, Data, System etc)                                                             618
DevOps, SRE, Operations                                                                          342
Project/Product/Program/Delivery/Engineering Manager, Product Owner, Producer, Scrum Master      286
SysAdmin                                                                      

In [156]:
len(combined_data['mainPosition'].unique())

22

In [157]:
# groups similar project domains based on string similarity using the rapidfuzz library

from rapidfuzz import process, fuzz
import pandas as pd

unique_specializations = combined_data['mainPosition'].dropna().unique()

similarity_threshold = 65

specialization_groups = {}

for specialization in unique_specializations:
    match = process.extractOne(specialization, specialization_groups.keys(), scorer=fuzz.ratio)

    if match:
        matched_value, score, _ = match
        if score >= similarity_threshold:
            specialization_groups[matched_value].append(specialization)
        else:
            specialization_groups[specialization] = [specialization]
    else:
        specialization_groups[specialization] = [specialization]

specialization_mapping = {}
for main_category, similar_specializations in specialization_groups.items():
    for spec in similar_specializations:
        specialization_mapping[spec] = main_category

combined_data['mainPosition'] = combined_data['mainPosition'].map(specialization_mapping)

category_counts = combined_data['mainPosition'].value_counts()
print(category_counts)

mainPosition
Hardware Engineer                                                                              10746
Software Engineer / Programmer                                                                  5446
QA / AQA / QC Engineer (Junior, Middle, Senior, Team/Tech Lead, Manager)                        3152
Data Science, Machine Learning, AI, Big Data, Data Engineer                                      788
DevOps, SRE                                                                                      626
Analyst (Business, Data, System etc)                                                             618
DevOps, SRE, Operations                                                                          342
Project/Product/Program/Delivery/Engineering Manager, Product Owner, Producer, Scrum Master      286
SysAdmin                                                                                         152
Support (Customer, Technical, Community)                                      

In [158]:
 print(combined_data['mainPosition'].value_counts())

mainPosition
Hardware Engineer                                                                              10746
Software Engineer / Programmer                                                                  5446
QA / AQA / QC Engineer (Junior, Middle, Senior, Team/Tech Lead, Manager)                        3152
Data Science, Machine Learning, AI, Big Data, Data Engineer                                      788
DevOps, SRE                                                                                      626
Analyst (Business, Data, System etc)                                                             618
DevOps, SRE, Operations                                                                          342
Project/Product/Program/Delivery/Engineering Manager, Product Owner, Producer, Scrum Master      286
SysAdmin                                                                                         152
Support (Customer, Technical, Community)                                      

In [159]:
position_counts = combined_data['mainPosition'].value_counts()
insignificant_positions = position_counts[position_counts < 200].index

# Replace minor values ​​with "other"
combined_data['mainPosition'] = combined_data['mainPosition'].apply(
    lambda x: 'Other' if x in insignificant_positions else x
)


updated_position_counts = combined_data['mainPosition'].value_counts()


In [160]:
updated_position_counts

Unnamed: 0_level_0,count
mainPosition,Unnamed: 1_level_1
Hardware Engineer,10746
Software Engineer / Programmer,5446
"QA / AQA / QC Engineer (Junior, Middle, Senior, Team/Tech Lead, Manager)",3152
"Data Science, Machine Learning, AI, Big Data, Data Engineer",788
Other,694
"DevOps, SRE",626
"Analyst (Business, Data, System etc)",618
"DevOps, SRE, Operations",342
"Project/Product/Program/Delivery/Engineering Manager, Product Owner, Producer, Scrum Master",286


In [161]:
combined_data = remove_salary_outliers(combined_data, 'salary', 'mainPosition')

##englishProficiency

In [162]:
# A dictionary for combining similar categories
proficiency_mapping = {
    'Upper-Intermediate': 'Upper-Intermediate',
    '5. Upper-Intermediate': 'Upper-Intermediate',
    'Intermediate': 'Intermediate',
    '4. Intermediate': 'Intermediate',
    'Advanced': 'Advanced',
    '6. Advanced': 'Advanced',
    'Pre-Intermediate': 'Pre-Intermediate',
    '3. Pre-Intermediate': 'Pre-Intermediate',
    'Elementary': 'Elementary',
    '2. Elementary': 'Elementary',
    'Не знаю взагалі': 'No Proficiency',
    '1. Не знаю взагалі': 'No Proficiency'
}


combined_data['englishProficiency'] = combined_data['englishProficiency'].map(proficiency_mapping)


##currentLocation

In [163]:
# A dictionary for combining similar categories
location_mapping = {
    'За кордоном (переїхали через війну, але планую повернутися)': 'Abroad (plan to return)',
    'За кордоном, але планую повернутися': 'Abroad (plan to return)',
    'Планують повернутися': 'Abroad (plan to return)',
    'В Україні': 'In Ukraine'
}

combined_data['currentLocation'] = combined_data['currentLocation'].map(location_mapping)

##company_size

In [164]:
# A dictionary for combining similar categories
company_size_mapping = {
    'до 200': 'Up to 200',
    '4. до 200': 'Up to 200',
    'понад 1000': 'Over 1000',
    '6. понад 1000': 'Over 1000',
    'до 50': 'Up to 50',
    '3. до 50': 'Up to 50',
    'до 1000': 'Up to 1000',
    '5. до 1000': 'Up to 1000',
    'до 10 спеціалістів': 'Up to 10 specialists',
    '2. до 10 спеціалістів': 'Up to 10 specialists',
    'Лише я / фріланс': 'Only me / Freelance',
    '1. Лише я / фріланс': 'Only me / Freelance'
}

combined_data['companySizeUA'] = combined_data['companySizeUA'].map(company_size_mapping)

##company_main_area

In [165]:
# A dictionary for combining similar categories
company_main_area_mapping = {
    'Продуктова': 'Product',
    'Аутсорсингова': 'Outsourcing',
    'Сервісна (аутсорсингова)': 'Outsourcing',
    'Аутстафінгова': 'Outstaffing',
    'Стартап': 'Startup',
    'Інша': 'Other',
    'Фріланс': 'Freelance',
    'Державне підприємство / установа': 'Government Entity'
}

combined_data['companyMainArea'] = combined_data['companyMainArea'].map(company_main_area_mapping)

##region_mapping

In [166]:
# A dictionary for combining similar categories
region_mapping = {
    # Western Ukraine
    'Львів чи область': 'Western Ukraine',
    'Івано-Франківськ чи область': 'Western Ukraine',
    'Тернопіль чи область': 'Western Ukraine',
    'Ужгород чи Закарпатська область': 'Western Ukraine',
    'Чернівці чи область': 'Western Ukraine',
    'Рівне чи область': 'Western Ukraine',
    'Луцьк чи Волинська область': 'Western Ukraine',

    # Central Ukraine
    'Київ чи область': 'Central Ukraine',
    'Вінниця чи область': 'Central Ukraine',
    'Черкаси чи область': 'Central Ukraine',
    'Полтава чи область': 'Central Ukraine',
    'Хмельницький чи область': 'Central Ukraine',
    'Житомир чи область': 'Central Ukraine',
    'Кропивницький чи область': 'Central Ukraine',

    # Eastern Ukraine
    'Харків чи область': 'Eastern Ukraine',
    'Дніпро чи область': 'Eastern Ukraine',
    'Запоріжжя чи область': 'Eastern Ukraine',
    'Суми чи область': 'Eastern Ukraine',
    'Чернігів чи область': 'Eastern Ukraine',

    # Southern Ukraine
    'Одеса чи область': 'Southern Ukraine',
    'Миколаїв чи область': 'Southern Ukraine',
    'Херсон чи область': 'Southern Ukraine',

    # Other
    'Інше': 'Other',
    'Донецька чи Луганська область': 'Other',
    'Не в Україні': 'Other',
}

combined_data['currentRegion'] = combined_data['currentRegion'].map(region_mapping)

##gender

In [167]:
# A dictionary for combining similar categories
gender_mapping = {
    'Чоловік': 'Male',
    'Жінка': 'Female'
}

combined_data['gender'] = combined_data['gender'].map(gender_mapping)

##employmentType

In [168]:
# A dictionary for combining similar categories
employment_type_mapping = {
    'Працюю full-time в ІТ-компанії чи ІТ-відділі': 'Full-time IT employee',
    'Втратив(-ла) роботу в ІТ і шукаю нову': 'Unemployed (looking for IT job)',
    'Працюю part-time в ІТ-компанії чи ІТ-відділі': 'Part-time IT employee',
    'Я фрилансер(-ка) в ІТ': 'Freelancer in IT',
    'Зараз на повністю оплачуваному бенчі в ІТ-компанії': 'On bench (paid/unpaid)',
    'Зараз в неоплачуваній відпустці / бенчі в ІТ-компанії': 'On bench (paid/unpaid)',
    'Зараз на частково оплачуваному бенчі в ІТ-компанії': 'On bench (paid/unpaid)',
    'Тимчасово не працюю і не шукаю роботу (sabbatical, декрет, волонтерство etc)': 'Not working (temporary)',
    'Працював(-ла) в IT, нині на військовій службі': 'Military service (ex-IT)'
}

combined_data['employmentType'] = combined_data['employmentType'].map(employment_type_mapping)

##educationLevel

In [169]:
# A dictionary for combining similar categories
education_level_mapping = {
    'Вища (бакалавр, спеціаліст, магістр) - одна або декілька': 'Higher Education (Bachelor, Specialist, Master)',
    '5. Вища (бакалавр, спеціаліст, магістр) - одна або декілька': 'Higher Education (Bachelor, Specialist, Master)',
    'Незакінчена вища': 'Incomplete Higher Education',
    '4. Незакінчена вища': 'Incomplete Higher Education',
    'Ще студент вишу': 'Current University Student',
    '3. Ще студент вишу': 'Current University Student',
    'Середня спеціальна': 'Secondary Specialized Education',
    '2. Середня спеціальна': 'Secondary Specialized Education',
    'Середня': 'Secondary Education',
    '1. Середня': 'Secondary Education',
    'Науковий ступінь (кандидат / доктор наук, PhD)': 'PhD or Doctorate',
    '6. Науковий ступінь (кандидат / доктор наук, PhD)': 'PhD or Doctorate'
}

combined_data['educationLevel'] = combined_data['educationLevel'].map(education_level_mapping)

In [170]:
combined_data = remove_salary_outliers(combined_data, 'salary', 'educationLevel')

print(combined_data['salary'].describe())

count    22097.000000
mean      2929.772346
std       1595.859407
min         50.000000
25%       1500.000000
50%       2900.000000
75%       4200.000000
max       6300.000000
Name: salary, dtype: float64


##jobTitle

In [171]:
print(combined_data['jobTitle'].value_counts())

jobTitle
Middle            8545
Senior            6502
Junior            3894
Team Lead         1371
Tech Lead          786
Немає тайтлу       399
Intern/Trainee     220
Architect          142
Manager            127
Head               111
Name: count, dtype: int64


In [172]:
# A dictionary for combining similar categories
job_title_mapping = {
    "Junior": "Junior",
    "Intern/Trainee": "Junior",
    "Middle": "Middle",
    "Senior": "Senior",
    "Tech Lead": "Senior",
    "Architect": "Senior",
    "Team Lead": "Lead/Manager",
    "Manager": "Lead/Manager",
    "Head": "Lead/Manager",
    "Немає тайтлу": "No Title"
}

combined_data['jobTitle'] = combined_data['jobTitle'].map(job_title_mapping)

In [173]:
print(combined_data['jobTitle'].value_counts())

jobTitle
Middle          8545
Senior          7430
Junior          4114
Lead/Manager    1609
No Title         399
Name: count, dtype: int64


In [174]:
combined_data = remove_salary_outliers(combined_data, 'salary', 'jobTitle')

print(combined_data['salary'].describe())

count    20754.000000
mean      2928.475205
std       1584.074012
min        100.000000
25%       1500.000000
50%       2937.500000
75%       4100.000000
max       6300.000000
Name: salary, dtype: float64


##Experience

In [175]:
print(combined_data['experience'].describe())
print(combined_data['experience'].value_counts())

count    20754.000000
mean         4.720006
std          3.498869
min          0.000000
25%          2.000000
50%          4.000000
75%          6.000000
max         15.000000
Name: experience, dtype: float64
experience
3     3155
2     2918
5     2389
4     2356
1     2115
6     1688
7     1273
0     1122
8      910
10     772
15     629
9      500
12     337
11     309
13     177
14     104
Name: count, dtype: int64


In [176]:
#removing outliers
combined_data = remove_salary_outliers(combined_data, 'salary', 'experience')

In [177]:
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19079 entries, 4180 to 24983
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   projectDomain       19079 non-null  object 
 1   currentLocation     19079 non-null  object 
 2   mainSpecialization  19079 non-null  object 
 3   englishProficiency  19079 non-null  object 
 4   companySizeUA       19079 non-null  object 
 5   age                 19078 non-null  float64
 6   gender              19079 non-null  object 
 7   salary              19079 non-null  float64
 8   companyMainArea     19079 non-null  object 
 9   experience          19079 non-null  int64  
 10  mainPosition        19079 non-null  object 
 11  currentRegion       17916 non-null  object 
 12  employmentType      19079 non-null  object 
 13  educationLevel      19079 non-null  object 
 14  jobTitle            19079 non-null  object 
dtypes: float64(2), int64(1), object(12)
memory usage: 2.3+ 

##age

In [178]:
print(combined_data['age'].value_counts())

age
27.0    1140
25.0    1130
28.0    1093
26.0    1092
29.0    1065
23.0    1061
30.0    1047
24.0     970
33.0     952
32.0     932
22.0     919
31.0     889
35.0     819
34.0     818
21.0     729
36.0     688
37.0     571
20.0     511
38.0     496
39.0     361
40.0     353
19.0     224
41.0     220
42.0     218
43.0     170
44.0     116
45.0     111
18.0      68
46.0      61
47.0      56
48.0      46
49.0      32
50.0      30
53.0      15
52.0      14
17.0      12
54.0      11
51.0      10
56.0       7
16.0       4
55.0       4
59.0       2
57.0       2
61.0       2
58.0       2
60.0       2
62.0       2
63.0       1
Name: count, dtype: int64


In [179]:
combined_data = remove_salary_outliers(combined_data, 'salary', 'age')

## mainSpecialization other

In [180]:
combined_data_copy = combined_data.copy()

In [181]:
print(combined_data_copy['mainSpecialization'].value_counts())

mainSpecialization
Back-end Development      5354
Front-end Development     3443
Full Stack Development    2821
QA                        2715
Mobile Development        1391
                          ... 
business analyst             1
packaging                    1
prodops, sre                 1
embedded developer           1
aqa services                 1
Name: count, Length: 127, dtype: int64


In [182]:
print(combined_data.isnull().sum())

projectDomain            0
currentLocation          0
mainSpecialization       0
englishProficiency       0
companySizeUA            0
age                      0
gender                   0
salary                   0
companyMainArea          0
experience               0
mainPosition             0
currentRegion         1137
employmentType           0
educationLevel           0
jobTitle                 0
dtype: int64


In [183]:
unique_specializations = combined_data['mainSpecialization'].unique()

len(unique_specializations)

127

In [184]:
!pip install fuzzywuzzy python-Levenshtein



In [185]:
from fuzzywuzzy import fuzz
import pandas as pd

# Function to group similar category names
def group_similar_categories(categories, threshold=40):
    grouped = []
    used = set()

    for cat in categories:
        if cat not in used:
            similar = [cat]
            used.add(cat)
            for other in categories:
                if other not in used:
                    # Calculate similarity between names
                    if fuzz.ratio(cat.lower(), other.lower()) > threshold:
                        similar.append(other)
                        used.add(other)
            grouped.append((cat, similar))
    return grouped

# Get unique category values
categories = combined_data['mainSpecialization'].unique()
grouped_categories = group_similar_categories(categories)

# Create mapping dictionary
mapping = {}
for main_cat, similar_cats in grouped_categories:
    for cat in similar_cats:
        mapping[cat] = main_cat

# Apply the mapping
combined_data['mainSpecialization'] = combined_data['mainSpecialization'].map(mapping)

# Check the result
unique_specializations = combined_data['mainSpecialization'].unique()
print(f"New number of unique values: {len(unique_specializations)}")
print(unique_specializations)

New number of unique values: 48
['Front-end Development' 'Other' 'DevOps' 'bi-розробка' 'system admin'
 'Data Engineering & Analytics' 'QA' 'gameplay, back-end' 'vr' 'hardware'
 'salesforce' 'автоматизація процесів' 'Embedded' 'robotics'
 'розробка потрібного продукту. часом фронт, часом бек' 'security'
 'робота з даними, налагодження роботи системи, моніторинг та ескалація, скриптування для спрощення роботи'
 'everything related to ml, starting with the web and ending with a research'
 'product manager' 'atlassian administrator / jira admin'
 'render/engine programmer' 'не web' 'геймдизайн це шо з переліку?'
 'low-code/no-code' "в desktop'і це і фронт і бек." 'rpa' 'важко сказати'
 'gd' 'ml' 'все підряд' 'game' 'thechnical support'
 'software development for embedded linux on networking devices'
 'системний аналіз' 'ui логіка та шейдери в геймдеві, мабуть front-end'
 'smart tv' 'it ops (support and maintenance operation system - backend)'
 'генерація синтетичних даних для нейронок' 'a

In [186]:
combined_data = remove_salary_outliers(combined_data, 'salary', 'mainSpecialization')

print(combined_data['salary'].describe())

count    18458.000000
mean      2873.745533
std       1535.401611
min        150.000000
25%       1500.000000
50%       2850.000000
75%       4000.000000
max       6300.000000
Name: salary, dtype: float64


In [187]:
#Final mainSpecialization mapping
mapping_cleaned = {
    'Front-end Development': 'Front-end Development',
    'Other': 'Other',
    'DevOps': 'DevOps',
    'bi-розробка': 'BI Development',
    'system admin': 'System Administration',
    'Data Engineering & Analytics': 'Data Engineering & Analytics',
    'QA': 'Quality Assurance',
    'gameplay, back-end': 'Game Development',
    'vr': 'Game Development',
    'hardware': 'Hardware Engineering',
    'salesforce': 'Salesforce Development',
    'автоматизація процесів': 'Process Automation',
    'Embedded': 'Embedded Systems',
    'robotics': 'Robotics',
    'розробка потрібного продукту. часом фронт, часом бек': 'Full-stack Development',
    'security': 'Cybersecurity',
    'робота з даними, налагодження роботи системи, моніторинг та ескалація, скриптування для спрощення роботи': 'IT Operations',
    'everything related to ml, starting with the web and ending with a research': 'Machine Learning',
    'product manager': 'Product Management',
    'atlassian administrator / jira admin': 'Atlassian Admin',
    'render/engine programmer': 'Game Engine Programming',
    'не web': 'Non-Web Development',
    'геймдизайн це шо з переліку?': 'Game Design',
    'low-code/no-code': 'Low-code/No-code',
    "в desktop'і це і фронт і бек.": 'Desktop Development',
    'rpa': 'Process Automation',
    'важко сказати': 'Other',
    'gd': 'Game Development',
    'ml': 'Machine Learning',
    'все підряд': 'Full-stack Development',
    'game': 'Game Development',
    'thechnical support': 'Technical Support',
    'software development for embedded linux on networking devices': 'Embedded Linux Development',
    'системний аналіз': 'Systems Analysis',
    'ui логіка та шейдери в геймдеві, мабуть front-end': 'Game Front-end',
    'smart tv': 'Smart TV Development',
    'it ops (support and maintenance operation system - backend)': 'IT Operations',
    'генерація синтетичних даних для нейронок': 'Synthetic Data for ML',
    'algorithms': 'Algorithms & Data Structures',
    'десктоп аплікації': 'Desktop Development',
    'computer vision': 'Computer Vision',
    'я просто ва якому прийшлось трохи розбиратись з кодом': 'Other',
    'sysops \\ sysadmin, розробка інтеграцій': 'System Administration',
    'с++': 'C++ Development',
    'розробляю тестові фреймворки': 'QA / Test Frameworks',
    'pentest': 'Penetration Testing',
    'конструювання, 3d моделювання': '3D Design & Engineering',
    'бібліотеки? десктоп?': 'Desktop Development'
}

combined_data['mainSpecialization'] = combined_data['mainSpecialization'].map(mapping_cleaned)

# Check unique values after mapping
unique_specializations = combined_data['mainSpecialization'].unique()
print(f"Final number of unique specializations: {len(unique_specializations)}")
print(unique_specializations)

Final number of unique specializations: 36
['Data Engineering & Analytics' 'DevOps' 'Embedded Systems'
 'Front-end Development' 'Other' 'Quality Assurance'
 'Algorithms & Data Structures' 'Atlassian Admin' 'BI Development'
 'Computer Vision' 'Machine Learning' 'Game Development'
 'Hardware Engineering' 'IT Operations' 'Low-code/No-code'
 'Penetration Testing' 'Product Management' 'Game Engine Programming'
 'Robotics' 'Process Automation' 'Salesforce Development' 'Cybersecurity'
 'Smart TV Development' 'Embedded Linux Development'
 'System Administration' 'Technical Support' 'Game Front-end'
 'Desktop Development' 'Full-stack Development' 'Game Design'
 'Synthetic Data for ML' '3D Design & Engineering' 'Non-Web Development'
 'QA / Test Frameworks' 'C++ Development' 'Systems Analysis']


In [188]:
#Final optimization
mapping_optimized = {
    'Front-end Development': 'Front-end Development',
    'Game Front-end': 'Front-end Development',

    'Quality Assurance': 'Quality Assurance',
    'QA / Test Frameworks': 'Quality Assurance',

    'Data Engineering & Analytics': 'Data & BI',
    'BI Development': 'Data & BI',

    'DevOps': 'DevOps & IT Ops',
    'System Administration': 'DevOps & IT Ops',
    'IT Operations': 'DevOps & IT Ops',
    'Atlassian Admin': 'DevOps & IT Ops',

    'Embedded Systems': 'Embedded & Hardware',
    'Embedded Linux Development': 'Embedded & Hardware',
    'Hardware Engineering': 'Embedded & Hardware',

    'Game Development': 'Game Development',
    'Game Engine Programming': 'Game Development',
    'Game Design': 'Game Development',

    'Machine Learning': 'AI / ML',
    'Computer Vision': 'AI / ML',
    'Synthetic Data for ML': 'AI / ML',
    'Algorithms & Data Structures': 'AI / ML',

    'Cybersecurity': 'Cybersecurity',
    'Penetration Testing': 'Cybersecurity',

    'Process Automation': 'Automation & RPA',
    'Low-code/No-code': 'Automation & RPA',

    'Product Management': 'Product & Project Management',
    'Systems Analysis': 'Product & Project Management',

    'Desktop Development': 'Desktop Development',
    'Smart TV Development': 'Desktop Development',
    'Non-Web Development': 'Desktop Development',

    'Technical Support': 'Support & TechOps',

    'Robotics': 'Robotics & 3D',
    '3D Design & Engineering': 'Robotics & 3D',

    'C++ Development': 'Other',
    'Full-stack Development': 'Other',
    'Salesforce Development': 'Other',
    'Computer Vision': 'AI / ML',
    'Other': 'Other'
}
combined_data['mainSpecialization'] = combined_data['mainSpecialization'].map(mapping_optimized)


In [189]:
print(combined_data['mainSpecialization'].value_counts())

mainSpecialization
Front-end Development           13022
Quality Assurance                2716
Data & BI                        1215
DevOps & IT Ops                   816
Other                             596
Embedded & Hardware                38
Game Development                   17
Cybersecurity                       9
AI / ML                             6
Product & Project Management        6
Robotics & 3D                       6
Automation & RPA                    5
Desktop Development                 5
Support & TechOps                   1
Name: count, dtype: int64


# Pre function

In [190]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def preprocess_data(df, training=True, scaler=None, reference_columns=None, target_encoder=None, global_mean_salary=None):
    """
    Preprocess input DataFrame with increased target encoding smoothing and numerical binning.

    Parameters:
    - df (pd.DataFrame): Input DataFrame.
    - training (bool): True for training mode, False for inference.
    - scaler (StandardScaler): Pre-fitted scaler for inference (default: None).
    - reference_columns (list): List of columns from training data for consistency (default: None).
    - target_encoder (dict): Pre-fitted target encoding mappings for inference (default: None).
    - global_mean_salary (float): Precomputed global mean salary for inference (default: None).

    Returns:
    - df_processed (pd.DataFrame): Processed DataFrame.
    - scaler (StandardScaler): Fitted or provided scaler.
    - target_encoder (dict): Fitted target encoding mappings.
    """
    # Define feature lists
    categorical_onehot = [
        'companyMainArea', 'currentRegion', 'employmentType',
        'mainPosition', 'projectDomain', 'mainSpecialization',
        'englishProficiency', 'companySizeUA', 'educationLevel', 'gender'
    ]
    categorical_target = ['jobTitle', 'currentLocation']  # High-cardinality features for target encoding
    numerical_features = ['salary', 'age', 'experience']  # Numerical features

    df = df.copy()

    # Debug: Log unique values in categorical columns
    for col in categorical_onehot + categorical_target:
        if col in df.columns:
            unique_vals = df[col].unique()
        else:
            logging.warning(f"Column {col} not found in DataFrame.")

    # Handle high-cardinality categorical features
    for col in categorical_onehot + categorical_target:
        if col in df.columns:
            # Keep top 10 most frequent categories, group others as 'Other'
            top_categories = df[col].value_counts().index[:10]
            df[col] = df[col].apply(lambda x: x if x in top_categories else 'Other')
        else:
            logging.warning(f"Column {col} not found in DataFrame.")

    # Handle missing values and add missing indicators
    for col in categorical_onehot + categorical_target:
        if col in df.columns:
            df[f'{col}_is_missing'] = df[col].isna().astype(int)
            df[col] = df[col].fillna(df[col].mode()[0] if not df[col].mode().empty else 'Unknown')
    for col in numerical_features:
        if col in df.columns:
            df[f'{col}_is_missing'] = df[col].isna().astype(int)
            df[col] = df[col].fillna(df[col].median())

    # Add missing indicator columns to one-hot encoding list
    categorical_onehot.extend([f'{col}_is_missing' for col in categorical_onehot + categorical_target + numerical_features])

    # Apply log transformation to numerical features
    for col in numerical_features:
        if col in df.columns:
            df[col] = np.log1p(df[col].clip(lower=0))

    # Bin numerical features (age, experience)
    for col in ['age', 'experience']:
        if col in df.columns:
            # Create 5 bins based on quantiles
            df[f'{col}_binned'] = pd.qcut(df[col], q=5, labels=False, duplicates='drop')
            categorical_onehot.append(f'{col}_binned')

    # Target encoding for high-cardinality features with increased smoothing
    if training:
        target_encoder = {}
        for col in categorical_target:
            if col in df.columns:
                # Compute mean salary per category with stronger smoothing (70% category mean + 30% global mean)
                target_means = df.groupby(col)['salary'].mean()
                global_mean = df['salary'].mean()
                target_encoder[col] = target_means.to_dict()
                df[col + '_target_encoded'] = df[col].map(target_encoder[col]) * 0.7 + global_mean * 0.3
            else:
                logging.warning(f"Column {col} not found for target encoding.")
    else:
        if target_encoder is None:
            raise ValueError("Target encoder must be provided for inference.")
        for col in categorical_target:
            if col in df.columns:
                # Use global_mean_salary for inference if provided, else default to 0
                fill_value = global_mean_salary if global_mean_salary is not None else 0
                df[col + '_target_encoded'] = df[col].map(target_encoder[col]).fillna(fill_value)
            else:
                logging.warning(f"Column {col} not found for target encoding.")

    # Update numerical features with target-encoded columns
    numerical_features.extend([col + '_target_encoded' for col in categorical_target if col in df.columns])

    # Drop original categorical columns after target encoding
    columns_to_drop = [col for col in categorical_target if col in df.columns]
    df = df.drop(columns=columns_to_drop)

    # One-Hot Encoding for remaining categorical features
    valid_onehot_cols = [col for col in categorical_onehot if col in df.columns]
    df_encoded = pd.get_dummies(df, columns=valid_onehot_cols)

    # Ensure column consistency for inference
    if reference_columns:
        missing_cols = set(reference_columns) - set(df_encoded.columns)
        for col in missing_cols:
            df_encoded[col] = 0
        df_encoded = df_encoded[reference_columns]

    # Standard Scaling
    valid_numerical_cols = [col for col in numerical_features if col in df_encoded.columns]
    if training:
        scaler = StandardScaler()
        df_encoded[valid_numerical_cols] = scaler.fit_transform(df_encoded[valid_numerical_cols])
    else:
        if scaler is None:
            raise ValueError("Scaler must be provided for inference.")
        df_encoded[valid_numerical_cols] = scaler.transform(df_encoded[valid_numerical_cols])

    # Debug: Check for non-numeric columns
    non_numeric_cols = df_encoded.select_dtypes(include=['object']).columns
    if len(non_numeric_cols) > 0:
        raise ValueError(f"Non-numeric columns found in final DataFrame: {non_numeric_cols.tolist()}")

    return df_encoded, scaler, target_encoder

In [191]:
df_encoded, scale, target_encoders= preprocess_data(combined_data,training=True)

In [192]:
import joblib

In [193]:
joblib.dump(target_encoders, "target_encoder.joblib")

['target_encoder.joblib']

In [199]:
import pandas as pd

# Список стовпців для аналізу
columns = [
    'projectDomain', 'mainSpecialization', 'mainPosition', 'currentLocation',
    'englishProficiency', 'companySizeUA', 'gender', 'companyMainArea',
    'currentRegion', 'employmentType', 'educationLevel', 'jobTitle','age', 'experience'
]

# Унікальний список стовпців (оскільки mainPosition повторюється)
unique_columns = list(dict.fromkeys(columns))

# Виведення таблиць для кожного стовпця
for col in unique_columns:
    # Отримуємо value_counts як DataFrame
    counts = combined_data[col].value_counts().reset_index()
    counts.columns = ['Value', 'Count']

    # Додаємо заголовок
    print(f"\n=== {col} ===")

    # Виводимо таблицю
    display(counts)  # display() створює гарну таблицю в Colab
    print("\n" + "-"*60 + "\n")


=== projectDomain ===


Unnamed: 0,Value,Count
0,E-commerce,2860
1,Fintech / Banking / Capital Management,2658
2,Medtech / Healthcare,2350
3,Other,1848
4,"Big Data, Data Science, Gambling",1127
5,"Blockchain, Telecom",772
6,"Internet of Things, Security",708
7,GameDev,640
8,"Geospatial, Mobile, Інша",611
9,Retail,527



------------------------------------------------------------


=== mainSpecialization ===


Unnamed: 0,Value,Count
0,Front-end Development,13022
1,Quality Assurance,2716
2,Data & BI,1215
3,DevOps & IT Ops,816
4,Other,596
5,Embedded & Hardware,38
6,Game Development,17
7,Cybersecurity,9
8,AI / ML,6
9,Product & Project Management,6



------------------------------------------------------------


=== mainPosition ===


Unnamed: 0,Value,Count
0,Hardware Engineer,8819
1,Software Engineer / Programmer,4512
2,"QA / AQA / QC Engineer (Junior, Middle, Senior...",2762
3,"Data Science, Machine Learning, AI, Big Data, ...",593
4,"Analyst (Business, Data, System etc)",492
5,"DevOps, SRE",446
6,Other,387
7,"DevOps, SRE, Operations",264
8,Project/Product/Program/Delivery/Engineering M...,183



------------------------------------------------------------


=== currentLocation ===


Unnamed: 0,Value,Count
0,In Ukraine,17325
1,Abroad (plan to return),1133



------------------------------------------------------------


=== englishProficiency ===


Unnamed: 0,Value,Count
0,Upper-Intermediate,8103
1,Intermediate,6651
2,Pre-Intermediate,1871
3,Advanced,1542
4,Elementary,278
5,No Proficiency,13



------------------------------------------------------------


=== companySizeUA ===


Unnamed: 0,Value,Count
0,Up to 200,4785
1,Over 1000,4525
2,Up to 50,3820
3,Up to 1000,3326
4,Up to 10 specialists,1776
5,Only me / Freelance,226



------------------------------------------------------------


=== gender ===


Unnamed: 0,Value,Count
0,Male,15917
1,Female,2541



------------------------------------------------------------


=== companyMainArea ===


Unnamed: 0,Value,Count
0,Outsourcing,7732
1,Product,6992
2,Outstaffing,2595
3,Startup,697
4,Other,205
5,Government Entity,126
6,Freelance,111



------------------------------------------------------------


=== currentRegion ===


Unnamed: 0,Value,Count
0,Central Ukraine,8897
1,Western Ukraine,5342
2,Eastern Ukraine,2177
3,Southern Ukraine,813
4,Other,96



------------------------------------------------------------


=== employmentType ===


Unnamed: 0,Value,Count
0,Full-time IT employee,17096
1,Unemployed (looking for IT job),589
2,On bench (paid/unpaid),325
3,Part-time IT employee,190
4,Freelancer in IT,143
5,Not working (temporary),81
6,Military service (ex-IT),34



------------------------------------------------------------


=== educationLevel ===


Unnamed: 0,Value,Count
0,"Higher Education (Bachelor, Specialist, Master)",15755
1,Current University Student,946
2,Incomplete Higher Education,832
3,Secondary Specialized Education,355
4,Secondary Education,302
5,PhD or Doctorate,268



------------------------------------------------------------


=== jobTitle ===


Unnamed: 0,Value,Count
0,Middle,7135
1,Senior,6239
2,Junior,3563
3,Lead/Manager,1232
4,No Title,289



------------------------------------------------------------


=== age ===


Unnamed: 0,Value,Count
0,27.0,1111
1,25.0,1106
2,26.0,1060
3,28.0,1058
4,29.0,1054
5,30.0,1041
6,23.0,1019
7,33.0,947
8,24.0,946
9,32.0,926



------------------------------------------------------------


=== experience ===


Unnamed: 0,Value,Count
0,3,2832
1,2,2471
2,5,2117
3,4,2103
4,1,1790
5,6,1559
6,7,1179
7,0,927
8,8,849
9,10,727



------------------------------------------------------------



In [198]:
for col in ['age', 'experience']:
    print(f"\n=== {col} (описова статистика) ===")
    display(combined_data[col].describe())
    print("\n" + "-"*60 + "\n")


=== age (описова статистика) ===


Unnamed: 0,age
count,18458.0
mean,29.850742
std,6.425495
min,16.0
25%,25.0
50%,29.0
75%,34.0
max,63.0



------------------------------------------------------------


=== experience (описова статистика) ===


Unnamed: 0,experience
count,18458.0
mean,4.83281
std,3.522511
min,0.0
25%,2.0
50%,4.0
75%,7.0
max,15.0



------------------------------------------------------------



#Saving

In [195]:
# Save the DataFrame to a CSV file
df_encoded.to_csv('df_encoded.csv', index=False)

In [196]:
import pickle
with open('scale.pkl', 'wb') as f:
    pickle.dump(scale, f)

In [197]:
joblib.dump(target_encoders, '/content/drive/MyDrive/diploma/target_encoder.joblib')

['/content/drive/MyDrive/diploma/target_encoder.joblib']