In [1]:
import argparse
import json
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd
import re
import ast

import os


  from tqdm.autonotebook import tqdm, trange


In [2]:


vrf_data_path = "TOY_DATA/TOY_vrf_data2024.xlsx"
seva_data_path = "TOY_DATA/TOY_seva_modified20apr.xlsx"
linkedin_data_path = "TOY_DATA/TOY_linkedin_data.xlsx"
degree_to_skill_mapping_path = "TOY_DATA/degree_profession_mapping.json"
job_clusters_latest = "TOY_DATA/JOB_CLUSTERS_LATEST.txt"


vrf_data_df = pd.read_excel(vrf_data_path)
og_seva_df = pd.read_excel(seva_data_path)
linkedin_df = pd.read_excel(linkedin_data_path)

# Optional degree to skill mapping
degree_profession_mapping = {}
if degree_to_skill_mapping_path:
    with open(degree_to_skill_mapping_path, "r") as json_file: # "degree_profession_mapping.json"
        degree_profession_mapping = json.load(json_file)


# Function to read and convert JOB_CLUSTERS_LATEST from a text file
def read_job_clusters_latest(file_path):
    with open(file_path, 'r') as file:
        content = file.read()
    return content

# Reading and processing the JOB_CLUSTERS_LATEST file
JOB_CLUSTERS_LATEST = read_job_clusters_latest(job_clusters_latest)

# Combine and process data
master_sheet_df = og_seva_df.copy()
master_sheet_df['SKILLS_ALL'] = og_seva_df['Any Additional Skills'].fillna(' ') + ' ' + og_seva_df['Computer Skills'].fillna(' ') + ' ' + og_seva_df['Skills'].fillna(' ') + ' ' + og_seva_df['Skills.1'].fillna(' ')
master_sheet_df['WORK_EXPERIENCE_ALL'] = np.NaN
for work_col in ['Work Experience/Company', 'Work Experience/Designation', 'Work Experience/Tasks', 'Work Experience/Industry', 'Work Experience/From Date', 'Work Experience/To Date']:
    master_sheet_df['WORK_EXPERIENCE_ALL'] = master_sheet_df['WORK_EXPERIENCE_ALL'].fillna(' ') + ' ' + master_sheet_df[work_col].fillna(' ')
master_sheet_df['WORK_EXPERIENCE_ALL'] = master_sheet_df['WORK_EXPERIENCE_ALL'].apply(lambda s: s.strip())


master_sheet_df['HOBBIES_ALL'] = np.NaN
for hobbies_col in ['Any Hobbies/Interests', 'Hobbies/Interests/Type', 'Hobbies/Interests/Name']:
    master_sheet_df['HOBBIES_ALL'] = master_sheet_df['HOBBIES_ALL'].fillna(' ') + ' ' + master_sheet_df[hobbies_col].fillna(' ')

master_sheet_df['CONCERNS_ALL'] = master_sheet_df['Concerns'].fillna(' ') + ' ' + master_sheet_df['Please enter any concerns here'].fillna(' ')

master_sheet_df['LANGUAGES_ALL'] = np.NaN

for lang_col in ['Languages',
'Languages/Can read',
'Languages/Can speak',
'Languages/Can type',
'Languages/Can write']:
    master_sheet_df['LANGUAGES_ALL'] = master_sheet_df['LANGUAGES_ALL'].fillna(' ') + ' ' + master_sheet_df[lang_col].fillna(' ')

master_sheet_df['LANGUAGES_ALL'] = master_sheet_df['LANGUAGES_ALL'].apply(lambda s : s.strip())

master_sheet_df.rename(columns={"Concerns":"CONCERNS_ALL", "Gender":"GENDER_ALL", "Age":"AGE_ALL"}, inplace=True)


linkedin_df.rename(columns={'SP ID':'SP_ID'}, inplace=True)

linkedin_df = linkedin_df.add_suffix('_LINKEDIN')
master_sheet_df = master_sheet_df.add_suffix('_APPLICATIONFORM')

linkedin_df.rename(columns={'SP_ID_LINKEDIN':'SP_ID'}, inplace=True)
master_sheet_df.rename(columns={'SP ID_APPLICATIONFORM':'SP_ID'}, inplace=True)

# new_df = pd.read_excel('../data/skill_comparison_output_ALL_ROWS_phase2_v1.xlsx')


# List of unwanted skills
unwanted_skills = [
    'Basic Computer Skills / Basic Computer (MS Office and Email) Skills',
    'Soft Skills / Fit for Physical Seva',
    'Soft Skills / Soft-spoken and cordial',
    'Soft Skills / Enthusiastic',
    'Soft Skills / Articulate in communication'
]

# Split skills on '\n' and reset index to preserve 'SP_ID'
skills_df = og_seva_df[['SP ID', 'Skills']].assign(Skills=og_seva_df['Skills'].str.split('\n')).explode('Skills').reset_index(drop=True)

# Filter unwanted skills
filtered_skills = skills_df[~skills_df['Skills'].isin(unwanted_skills)].reset_index(drop=True)

# Concatenate remaining skills with comma
cleaned_skills = filtered_skills.groupby('SP ID')['Skills'].agg(', '.join).reset_index(name='Cleaned_Skills')

# Merge with master_sheet_df on 'SP_ID'
result_df = master_sheet_df.merge(cleaned_skills, left_on='SP_ID', right_on='SP ID', how='left')

result_df.drop(columns=['SP ID'], inplace=True)

master_sheet_df = result_df.rename(columns={'Cleaned_Skills':'INTERVIEWER_SKILLS'})

In [3]:
master_sheet_df

Unnamed: 0,SP_ID,Registration Batch _APPLICATIONFORM,GENDER_ALL_APPLICATIONFORM,AGE_ALL_APPLICATIONFORM,Seva Dept_APPLICATIONFORM,City_APPLICATIONFORM,State_APPLICATIONFORM,Nationality_APPLICATIONFORM,Country_APPLICATIONFORM,Work Experience/Company_APPLICATIONFORM,...,Any Hobbies/Interests_APPLICATIONFORM,Hobbies/Interests/Type_APPLICATIONFORM,Hobbies/Interests/Name_APPLICATIONFORM,Isha Connect/Name_APPLICATIONFORM,SKILLS_ALL_APPLICATIONFORM,WORK_EXPERIENCE_ALL_APPLICATIONFORM,HOBBIES_ALL_APPLICATIONFORM,CONCERNS_ALL_APPLICATIONFORM,LANGUAGES_ALL_APPLICATIONFORM,INTERVIEWER_SKILLS
0,577769,2024-2025,male,34,,Bengaluru,Karnataka,India,India,Ripplr Storeking,...,,,,IE & some Isha volunteering,"MS-word,Excel, powerpoint Soft Skills / Arti...",Ripplr Storeking Logistic Mananger Senior exec...,,none,Kannada (R W S T) English (UK) (R W S T) Hindi...,Soft Skills / Articulate in communication Basi...
1,579349,2024-2026,female,27,,Panchkula,Haryana,India,India,Teleperformance,...,,,,IE & some Isha volunteering,Na Soft Skills / Articulate in communication...,Teleperformance SME BPO and KPO 2021-10-25 N...,,none,English (US) (R W S T) Medium Medium Medium In...,Soft Skills / Articulate in communication Basi...
2,576830,2024-2027,male,23,,Bellary District,Karnataka,India,India,Ashirwad Construction Company Ashirwad Constru...,...,,,,IE & some Isha volunteering IE ++ Prospective FTV,Has also learnt music production but not worke...,Ashirwad Construction Company Ashirwad Constru...,,none,Kannada (R W S ) English (US) (R W S T) Fluent...,Soft Skills / Articulate in communication Basi...
3,577412,2024-2028,female,26,,Warangal,Telangana,India,India,"ICF International, Global Consultancy Services...",...,"She has her own insta channel, expressed her i...",,,IE ++,https://www.instagram.com/yogic.musings?utm_so...,"ICF International, Global Consultancy Services...","She has her own insta channel, expressed her...",none,Telugu / తెలుగు (R W S T) Hindi / हिंदी (R W S...,Engineering / Electrical Engineer
4,577678,2024-2029,male,35,,Aurangabad,Maharashtra,India,India,CSMSS CHH. SHAHU COLLEGE OF ENGINEERING,...,,,,IE ++,"data science, microsoft excel, artificial in...",CSMSS CHH. SHAHU COLLEGE OF ENGINEERING Assist...,,none,Marathi (R W S ) Hindi / हिंदी (R W S ) Englis...,Education / Teacher - Others
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
669,574736,2024-2694,male,25,,Bengaluru,Karnataka,India,India,Gostol TST Limited Ganesh Power Technologies,...,compose kannada music and poetry.,Communication,Poetry,IE & some Isha volunteering,I Have Basic Knowledge of Computer Soft Skil...,Gostol TST Limited Ganesh Power Technologies S...,compose kannada music and poetry. Communicat...,minor smoking and alcohol stopped a year befor...,Kannada (R W S ) English (UK) (R W S T) Hindi ...,Soft Skills / Articulate in communication Basi...
670,574706,2024-2695,male,27,,Bengaluru,Karnataka,India,India,upgrad phablecare,...,,,,IE & some Isha volunteering,"Tools - leadsquare, amio, salesforce (Sales To...",upgrad phablecare BDM Key account manager Reve...,,none,English (US) (R W S T) Kannada (R W S T) Hindi...,Soft Skills / Articulate in communication Basi...
671,574685,2024-2696,male,20,,Karad,Maharashtra,India,India,,...,Trekking\nFootball\nCricket,,,Only IE IE & some Isha volunteering Fresher,No Environment / Agriculture/ Farmer Basic C...,NaT NaT NaT NaT NaT NaT NaT NaT NaT NaT NaT Na...,Trekking\nFootball\nCricket,none,Marathi (R W S T) English (UK) (R W S T) Fluen...,Environment / Agriculture/ Farmer Basic Comput...
672,574641,2024-2697,male,31,,Vasco Da Gama,Goa,India,India,Indusind Bank Ltd ICICI BANK LTD,...,,,,Only IE No Isha volunteering,MS office Environment / Agriculture/ Farmer ...,Indusind Bank Ltd ICICI BANK LTD Relationship ...,,minor He used to smoke and drink for 7-8 years...,English (UK) (R W S T) Fluent Fluent Fast Fluent,Environment / Agriculture/ Farmer Soft Skills ...


In [4]:
# Function to map degrees to professions
degrees_not_mapped = [ ]
def map_degrees_to_professions(degrees):
    skills_list = []
    for degree in degrees:
        if degree in degree_profession_mapping:
            skills_list.extend(degree_profession_mapping[degree])
        else:
            degrees_not_mapped.append('Degree Not Found: ' + degree)
    return ', '.join(skills_list)


# Apply the mapping function to create the 'EDUCATION_SKILLS' column
og_seva_df['EDUCATION_SKILLS'] = og_seva_df['Education/Qualifications'].apply(
    lambda x: map_degrees_to_professions(str(x).split('\n')) if not pd.isna(x) else ''
)


merged_df = pd.merge(master_sheet_df, og_seva_df[['SP ID', 'EDUCATION_SKILLS']], left_on='SP_ID', right_on='SP ID', how='inner')


master_sheet_df = merged_df.drop(columns='SP ID')


# ### Add DESIGNATION_SKILLS


# Step 0: Identify and handle duplicates and NaN values in 'Work Experience/Designation' column
og_seva_df['Work Experience/Designation'] = og_seva_df['Work Experience/Designation'].apply(lambda x: x.split('\n') if isinstance(x, str) else x)
og_seva_df_exploded = og_seva_df.explode('Work Experience/Designation')
og_seva_df_exploded.dropna(subset=['Work Experience/Designation'], inplace=True)
og_seva_df_exploded.drop_duplicates(subset=['SP ID', 'Work Experience/Designation'], inplace=True)

# Step 1: Group by 'SP_ID' and join the designations using commas
og_seva_df_grouped = og_seva_df_exploded.groupby('SP ID')['Work Experience/Designation'].agg(lambda x: ', '.join(str(v) for v in x)).reset_index()

# Check for duplicate 'SP_ID' values in og_seva_df_grouped
duplicates_og_seva = og_seva_df_grouped[og_seva_df_grouped.duplicated('SP ID')]
if not duplicates_og_seva.empty:
    raise ValueError(f'Duplicate SP_ID values found in og_seva_df_grouped: {duplicates_og_seva}')

# Step 2: Reset index in 'master_sheet_df' if needed
master_sheet_df_reset = master_sheet_df.reset_index()

# Check for duplicate 'SP_ID' values in master_sheet_df_reset
duplicates_master_sheet = master_sheet_df_reset[master_sheet_df_reset.duplicated('SP_ID')]
if not duplicates_master_sheet.empty:
    raise ValueError(f'Duplicate SP ID values found in master_sheet_df_reset: {duplicates_master_sheet}')

# Step 3: Merge with 'master_sheet_df_reset' on 'SP_ID'
merged_df = pd.merge(master_sheet_df_reset, og_seva_df_grouped, left_on='SP_ID', right_on='SP ID', how='left')

# Now 'merged_df' contains the desired result with designations separated by commas


merged_df.rename(columns={'Work Experience/Designation':'DESIGNATION_SKILLS'}, inplace=True)


merged_df.drop('SP ID', axis=1, inplace=True)


master_sheet_df = merged_df

In [5]:

## RE-RUNNING SKILL COMPARISON

df_vrf = vrf_data_df

df_vrf = df_vrf[['Department', '/']].rename(columns = {'/':'Job Title'})


In [6]:
def concatenate_skills(row):
    skills_list = []

    # INTERVIEWER_SKILLS
    if not pd.isna(row['INTERVIEWER_SKILLS']) and str(row['INTERVIEWER_SKILLS']).strip() != '':
        skills_list.append(str(row['INTERVIEWER_SKILLS']).strip(', '))

    # EDUCATION_SKILLS
    if not pd.isna(row['EDUCATION_SKILLS']) and str(row['EDUCATION_SKILLS']).strip() != '':
        skills_list.append(str(row['EDUCATION_SKILLS']).strip(', '))

    # DESIGNATION_SKILLS
    if not pd.isna(row['DESIGNATION_SKILLS']) and str(row['DESIGNATION_SKILLS']).strip() != '':
        skills_list.append(str(row['DESIGNATION_SKILLS']).strip(', '))

    # Combine non-empty skills using ', '
    combined_skills = ', '.join(skills_list)

    return combined_skills


# Apply the function to each row
master_sheet_df['COMBINED_SKILLS'] = master_sheet_df.apply(concatenate_skills, axis=1)

# Preprocess the data
VRF_job_titles = df_vrf['Job Title'].tolist()

# Preprocess the data
df_vrf_cleaned = df_vrf.dropna(subset=['Job Title'])
VRF_job_titles = df_vrf_cleaned['Job Title'].tolist()

# Replace NaN values with empty strings
updated_VRF_job_titles = ["" if pd.isna(title) else title for title in VRF_job_titles]
VRF_job_titles = updated_VRF_job_titles

In [7]:
VRF_job_titles

['Arabic Translator',
 'Translators',
 'French Translator',
 'Dining Support',
 'Administrative Activities (Back Office)',
 'Recruitment Officer',
 'Site Supervisor',
 'Coordinating Activities',
 'German Translator',
 'Translators',
 'Translators',
 'Indonesian Translator',
 'Maintenance Support',
 'Italian Translator',
 'Japanese Translator',
 'Korean Video Editor',
 'Supervising Activities',
 'Front Office Activities',
 'Call Centre',
 'Mandarin Chinese Translator',
 'Persian Translator',
 'Polish Translator',
 'Portuguese Translator',
 'Romanian Video Editor',
 'Russian Video Editor',
 'Accountant',
 'Maintenance Support',
 'Translators',
 'Spanish Translator',
 'Thai Translator',
 'Guiding Visitors',
 'Customer Support Representatives',
 'Front Office Activities',
 'Sales Manager/ Executive',
 'Content Manager',
 'Content Support Executive (CSE)',
 'Content Writer',
 'Social media Manager',
 'Marketing Manager/ Executive',
 'Administrative Activities (Back Office)',
 'On ground Sup

In [8]:
master_sheet_df['COMBINED_SKILLS'].iloc[22]


'Basic Computer Skills / Basic Computer (MS Office and Email) Skills Others / Cab Driver General / Retail, Accountant, Financial Analyst, Driver Assistant branch manager'

In [9]:

model = SentenceTransformer('sentence-transformers/all-roberta-large-v1', device='cpu')

print('EMBEDDING SKILLS...')

job_title_corr_mat = np.zeros((len(master_sheet_df), len(VRF_job_titles)))
embeddings_master = model.encode(master_sheet_df['COMBINED_SKILLS'], convert_to_tensor=True, show_progress_bar=True, device='cpu')

print('EMBEDDING JOB TITLES...')


embeddings_vrf = model.encode(VRF_job_titles, convert_to_tensor=True, show_progress_bar=True, device='cpu')

print('COMPUTING COSINE SIMILARITY SCORES...')

# # Compute cosine similarity scores
cosine_scores = util.cos_sim(embeddings_master, embeddings_vrf)

# Create a list to store the results

print('SCORING AND MATCHING SKILLS TO JOB TITLES...')

results = []

# Iterate through SP_IDs and df_vrf rows to calculate skill scores
for i, sp_id in enumerate(master_sheet_df['SP_ID']):
    for (j, row) in df_vrf.dropna(subset=['Job Title']).reset_index(drop=True).iterrows():
        department = row['Department']
        job_title = row['Job Title']

        # Ensure that the indices are within bounds
        if i < len(cosine_scores) and j < len(cosine_scores[i]):
            # Extract skills for the SP_ID (as per your previous code)
            participant_skills = master_sheet_df[master_sheet_df['SP_ID'] == sp_id]['COMBINED_SKILLS'].iloc[0]

            # Calculate the skill score
            raw_skill_score = cosine_scores[i, j]
            skill_score = 0 if pd.isna(raw_skill_score) else raw_skill_score

            # Store the results including SP_ID, row number, department, job title, and skill score
            results.append({
                'SP_ID': sp_id,
                'Row Number': j,
                'Department': department,
                'Job Title': job_title,
                'Skill Score': skill_score
            })
        else:
            print(f"Warning: Index out of bounds - i={i}, j={j}")

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

results_df['Skill Score'] = results_df['Skill Score'].apply(lambda x: x.numpy())

results_df_department_not_null = results_df[~results_df['Department'].isna()]

results_df_department_not_null = results_df_department_not_null.groupby(['SP_ID', 'Department', 'Job Title', 'Skill Score']).size().reset_index(name="Count")

results_df_department_not_null.drop(columns=['Count'], inplace=True)

# Grouping by 'SP_ID' and sorting each group by 'Skill Score' in descending order
results_df_department_not_null = results_df_department_not_null.groupby('SP_ID', group_keys=False).apply(lambda group: group.sort_values('Skill Score', ascending=False))

# Resetting the index after sorting
results_df_department_not_null.reset_index(drop=True, inplace=True)

def invert_job_clusters(job_clusters):
    optimized_dict = {}
    for main_category, subcategories in job_clusters.items():
        for subcategory, titles in subcategories.items():
            for title in titles:
                optimized_dict[title] = (main_category, subcategory)
    return optimized_dict

def get_category_optimized(job_title, optimized_dict):
    return optimized_dict.get(job_title, ("Not Found", "Not Found"))

# Function to convert a pretty-printed tree to a regular dictionary
def pretty_tree_to_regular(pretty_str):
    tree = {}
    lines = pretty_str.strip().split('\n')

    # Regular expressions to identify levels based on indentation
    pattern = re.compile(r'^( *)(- )?(.*)')

    # Helper function to insert an item into the tree based on indentation levels
    def insert_into_tree(tree, levels, item):
        for level in levels[:-1]:
            tree = tree.setdefault(level, {})
        tree.setdefault(levels[-1], []).append(item)

    # Stack to keep track of the current hierarchy based on indentation
    hierarchy_stack = []
    for line in lines:
        indent, _, item = pattern.match(line).groups()
        level = len(indent) // 2  # Assuming two spaces per indent level

        # Adjust the current item by removing colon if we're at the first or second level
        if level <= 1 and item.endswith(':'):
            item = item[:-1]

        # If we're at a deeper level, just append to the stack
        if level > len(hierarchy_stack):
            hierarchy_stack.append(item)
        else:
            # If we're at a shallower level, reset the stack to that level
            hierarchy_stack = hierarchy_stack[:level]
            hierarchy_stack.append(item)

        # If it's a job title (prefixed with '- '), insert it into the tree
        if _:
            insert_into_tree(tree, hierarchy_stack[:-1], hierarchy_stack[-1])
            hierarchy_stack.pop()  # Remove the job title from the hierarchy stack

    return tree

# # Example usage:
# pretty_str = """
# Creative and Media:
#   Music and Audio:
#     - Music Producer / Arranger
#     - Mixing Engineer
#   Content Creation and Writing:
#     - Content Writer
# Information Technology and Software:
#   Development and Programming:
#     - Software Developer
# """

TREE_FORMAT_JOB_CLUSTERS_LATEST = pretty_tree_to_regular(JOB_CLUSTERS_LATEST)
OG_FORMAT_JOB_CLUSTERS_LATEST = invert_job_clusters(TREE_FORMAT_JOB_CLUSTERS_LATEST)

results_df_department_not_null['Predicted Cluster'] = results_df_department_not_null['Job Title'].apply(lambda x: ' - '.join(OG_FORMAT_JOB_CLUSTERS_LATEST.get(x, ("Not Found", "Not Found"))))

SKILLS_COLS = ['SP_ID', 'INTERVIEWER_SKILLS', 'EDUCATION_SKILLS', 'COMBINED_SKILLS']

SUMMARY_COLS = ['SP ID',
    'Education/Qualifications','Education/Institution\'s Name', 'Education/City', 'Education/Specialization', 'Education/Year of Passing/Graduation', 
    'Work Experience/Designation', 'Work Experience/Tasks', 'Work Experience/Industry', 'Work Experience/From Date', 'Work Experience/To Date',
    'Interviewer Feedback/Answer']

SUMMARY_TABLE = og_seva_df[SUMMARY_COLS]

predictions_data_path = "./TOY_DATA/predictions_data.xlsx"
extracted_skills_data_path = "./TOY_DATA/extracted_skills_data.xlsx"
summary_data_path = "./TOY_DATA/summary_data.xlsx"

# Save output data
# (Save your processed data frames to files as specified in the function arguments)
# Example:
print('SAVING OUTPUT...')
results_df_department_not_null.to_excel(predictions_data_path)
master_sheet_df[SKILLS_COLS].to_excel(extracted_skills_data_path)
SUMMARY_TABLE.to_excel(summary_data_path)



EMBEDDING SKILLS...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

EMBEDDING JOB TITLES...


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

COMPUTING COSINE SIMILARITY SCORES...
SCORING AND MATCHING SKILLS TO JOB TITLES...


  results_df_department_not_null = results_df_department_not_null.groupby('SP_ID', group_keys=False).apply(lambda group: group.sort_values('Skill Score', ascending=False))


SAVING OUTPUT...


In [10]:

TREE_FORMAT_JOB_CLUSTERS_LATEST


{'\ufeffCreative and Media': {'Music and Audio': ['Music Producer / Arranger',
   'Mixing Engineer',
   'Singers / Instrumentalists / Musicians',
   'Musician ( Singer and Drummer )',
   'Music/Mridhamgam/violin/Tabla/Dance',
   'Carnatic Music Teacher',
   'Mridangam Teacher',
   'Piano teacher',
   'Tabla teacher',
   'Dubbing artists',
   'German Dubbing Artist',
   'Russian Dubbing Artist',
   'Mandarin Chinese Dubbing Artist',
   'Japanese Dubbing Artist',
   'Arabic Dubbing Artist',
   'Vietnamese Dubbing Artist',
   'Persian Dubbing Artist',
   'Thai Dubbing Artist',
   'Portuguese dubbing artist',
   'Dubbing Artist',
   'MSR - Audio, Cottage',
   'Male Dubbing Voice'],
  'Visual Arts': ['Photographer',
   'Photo Editor / Photographer',
   'Artist/ Illustrator',
   'Artificial Intelligence (AI) Art Enthusiast',
   'Art Teacher',
   'Aesthetic/Artist/Illustrator',
   'Product/Furniturer/Interior Designer',
   'Crafts/origami teacher',
   'Crafts Teacher',
   'Teach fashion desig

In [11]:
OG_FORMAT_JOB_CLUSTERS_LATEST

{'Music Producer / Arranger': ('\ufeffCreative and Media', 'Music and Audio'),
 'Mixing Engineer': ('\ufeffCreative and Media', 'Music and Audio'),
 'Singers / Instrumentalists / Musicians': ('\ufeffCreative and Media',
  'Music and Audio'),
 'Musician ( Singer and Drummer )': ('\ufeffCreative and Media',
  'Music and Audio'),
 'Music/Mridhamgam/violin/Tabla/Dance': ('\ufeffCreative and Media',
  'Music and Audio'),
 'Carnatic Music Teacher': ('\ufeffCreative and Media', 'Music and Audio'),
 'Mridangam Teacher': ('\ufeffCreative and Media', 'Music and Audio'),
 'Piano teacher': ('\ufeffCreative and Media', 'Music and Audio'),
 'Tabla teacher': ('\ufeffCreative and Media', 'Music and Audio'),
 'Dubbing artists': ('\ufeffCreative and Media', 'Music and Audio'),
 'German Dubbing Artist': ('\ufeffCreative and Media', 'Music and Audio'),
 'Russian Dubbing Artist': ('\ufeffCreative and Media', 'Music and Audio'),
 'Mandarin Chinese Dubbing Artist': ('\ufeffCreative and Media',
  'Music and A

In [12]:
def process_file(file_path):
    # Load the Excel file
    df = pd.read_excel(file_path)
    
    # Sort the DataFrame by SP_ID if it's not already sorted
    df = df.sort_values(by='SP_ID')
    
    # Add a 'Row Number' column, which is the row number within each SP_ID group
    df['Row Number'] = df.groupby('SP_ID').cumcount() + 1


    ### Only extra added code is below ###
    ######################################
    # Create rank
    # Apply ranking within each customer_id based on order_score in descending order
    df['rank'] = df.groupby('SP_ID')['Skill Score'].rank(method='first', ascending=False)
    # Sort the DataFrame by customer_id and rank in ascending order
    df = df.sort_values(by=['SP_ID', 'rank'])
    ######################################


    # Split 'Predicted Cluster' into 'General Cluster' and 'Specific Cluster'
    df[['General Cluster', 'Specific Cluster']] = df['Predicted Cluster'].str.split(' - ', expand=True)

    # Generate the output file path by adding _PROCESSED before the file extension
    file_name, file_extension = os.path.splitext(file_path)
    output_file_path = f"{file_name}_PROCESSED{file_extension}"

    # Save the updated DataFrame back to a new Excel file with _PROCESSED suffix
    df.to_excel(output_file_path, index=False)
    print(f'The Excel file has been updated and saved as "{output_file_path}".')


process_file(predictions_data_path)

The Excel file has been updated and saved as "./TOY_DATA/predictions_data_PROCESSED.xlsx".
