In [2]:
import pandas as pd

In [2]:
file_path_jsonl = r'data\combined_data\profiles_combined.json'

# Read the JSONL file
df = pd.read_json(file_path_jsonl)

# Normalise dataframe

In [3]:
import numpy as np

df['experience'] = df['experience'].apply(lambda x: json.loads(x) if isinstance(x, str) else x)

# Create empty lists to store extracted data
profile_names = []
organization_profiles = []
positions = []
organisation =[]
start_times = []
end_times = []
durations = []

for idx, row in df.iterrows():
    name = row['profile']
    experience_data = row['experience']
    
    for exp in experience_data:
        profile_names.append(name)
        organization_profiles.append(exp.get("organisation_profile", ""))
        positions.append(exp.get("position", ""))
        organisation.append(exp.get("organisation", ""))
        start_times.append(exp.get("start_time", ""))
        end_times.append(exp.get("end_time", ""))
        durations.append(exp.get("duration", ""))

extracted_data_df = pd.DataFrame({
    'profile_name': profile_names,
    'organization_profile': organization_profiles,
    'position': positions,
    'organisation':organisation,
    'start_time': start_times,
    'end_time': end_times,
    'duration': durations
})
extracted_data_df

Unnamed: 0,profile_name,organization_profile,position,organisation,start_time,end_time,duration
0,sagara-lakmal-3b5634103,https://www.linkedin.com/company/mas-holdings,Manager - Data Analytics & Governance,MAS Holdings,Jan 2023,present,10 months
1,sagara-lakmal-3b5634103,https://www.linkedin.com/company/ndbbank,Associate Manager - Data Modelling Lead - Corp...,National Development Bank PLC (NDB),Jun 2021,Jan 2023,1 year 8 months
2,sagara-lakmal-3b5634103,https://www.linkedin.com/company/dialog-axiata...,Senior Executive - Dashboarding and Visualizat...,Dialog Axiata PLC,Oct 2019,Jun 2021,1 year 9 months
3,sagara-lakmal-3b5634103,https://www.linkedin.com/company/mas-holdings,Executive - Data Analytics,MAS Holdings,Apr 2018,Sep 2019,1 year 6 months
4,ganguli-wijewardana,https://www.linkedin.com/company/mas-kreeda,Assistant Manager - Operations and Systems Ana...,MAS KREEDA,Aug 2023,present,3 months
...,...,...,...,...,...,...,...
1692,muditha-ediriweera-mba-cim-mabe-mos-23811284,https://lk.linkedin.com/company/hela-apparel-h...,Senior Merchandiser,Hela Apparel Holdings,Mar 2013,Mar 2014,1 year 1 month
1693,oshadha-dammearachchi-32aa49160,https://lk.linkedin.com/company/gaia-greenener...,Senior Executive - Business Analyst,Gaia Greenenergy Holdings,May 2023,present,8 months
1694,oshadha-dammearachchi-32aa49160,https://lk.linkedin.com/company/brandix,Executive - Finance Business Partner for ICT,Brandix,Aug 2022,Apr 2023,9 months
1695,oshadha-dammearachchi-32aa49160,https://lk.linkedin.com/company/tnlrn,News Anchor,TNL Radio Network (Pvt) Limited,Mar 2019,Aug 2022,3 years 6 months


# Merge companies

In [7]:
# Convert all MAS organisations to MAS Holdings
extracted_data_df.iloc[:, 3] = extracted_data_df.iloc[:, 3].str.lower()
mask = extracted_data_df['organisation'].str[:3] == 'mas'

# Update the 'organisation' column for the rows where the condition is true
extracted_data_df.loc[mask, 'organisation'] = 'mas holdings'

# List of child organizations and the parent organization
child_organizations = ['mas intimates', 'mas kreeda', 'mas active', 'linea aqua', 'mas Linea Aqua', 'bodyline', 'mas legato',
                       'silueta - technologies by mas', 'twinery - innovations by mas', 'noyon lanka pvt ltd', 'mas matrix',
                       'hellmann mas supply chain', 'silueta', 'twinery', 'noyon',
                      ]

parent_organization = 'mas holdings'

for child_org in child_organizations:
    extracted_data_df.loc[extracted_data_df['organisation'].str.contains(child_org, case=False, na=False), 'organisation'] = parent_organization

extracted_data_df.head(50)

Unnamed: 0,profile_name,organization_profile,position,organisation,start_time,end_time,duration
0,sagara-lakmal-3b5634103,https://www.linkedin.com/company/mas-holdings,Manager - Data Analytics & Governance,mas holdings,Jan 2023,present,10 months
1,sagara-lakmal-3b5634103,https://www.linkedin.com/company/ndbbank,Associate Manager - Data Modelling Lead - Corp...,national development bank plc (ndb),Jun 2021,Jan 2023,1 year 8 months
2,sagara-lakmal-3b5634103,https://www.linkedin.com/company/dialog-axiata...,Senior Executive - Dashboarding and Visualizat...,dialog axiata plc,Oct 2019,Jun 2021,1 year 9 months
3,sagara-lakmal-3b5634103,https://www.linkedin.com/company/mas-holdings,Executive - Data Analytics,mas holdings,Apr 2018,Sep 2019,1 year 6 months
4,ganguli-wijewardana,https://www.linkedin.com/company/mas-kreeda,Assistant Manager - Operations and Systems Ana...,mas holdings,Aug 2023,present,3 months
5,ganguli-wijewardana,https://www.linkedin.com/company/mas-kreeda,Senior Executive - Operations & Systems Analyst,mas holdings,Apr 2022,Aug 2023,1 year 5 months
6,ganguli-wijewardana,https://www.linkedin.com/company/mas-kreeda,Executive - Operations & Systems Analyst,mas holdings,Jan 2019,Mar 2022,3 years 3 months
7,ganguli-wijewardana,https://www.linkedin.com/company/mas-intimates,Intern,mas holdings,Sep 2017,Feb 2018,6 months
8,harsha-karunanayake-875636a8,,Process Improvement Consultant,mas holdings,Apr 2022,present,1 year 7 months
9,harsha-karunanayake-875636a8,,Senior Business Analyst,mas holdings,Apr 2021,Apr 2022,1 year 1 month


In [8]:
# Read company details data
file_path = r'data\filtered_companies_combined.json'

df2 = pd.read_json(file_path)
df2

Unnamed: 0,name,url,industry,size,headquaters,type,founded
0,Marks and Spencer,https://www.linkedin.com/company/marks-and-spe...,Retail,"10,001+ employees","London, London",Public Company,1884
1,Virtusa,https://www.linkedin.com/company/virtusa,IT Services and IT Consulting,"10,001+ employees","Southborough, MA",Privately Held,1996
2,KPMG US,https://www.linkedin.com/company/kpmg-us,Financial Services,"10,001+ employees","New York, NY",Partnership,
3,Parexel,https://www.linkedin.com/company/parexel,Pharmaceutical Manufacturing,"10,001+ employees","Durham, North Carolina",Privately Held,
4,"attune, a Rizing Company",https://www.linkedin.com/company/attune-rizing...,IT Services and IT Consulting,"501-1,000 employees","Burlington, MA",Privately Held,
...,...,...,...,...,...,...,...
266,Seeds Intimate Apparel India Pvt Ltd,https://in.linkedin.com/company/seeds-intimate...,,"1,001-5,000 employees","Visakhapatnam, Andhra Pradesh",Partnership,2009
267,Bogawantalawa Tea,https://lk.linkedin.com/company/bogawantalawa-tea,Food and Beverage Manufacturing,51-200 employees,"Ragama, Western",Public Company,1869
268,Sri Lanka Customs,https://lk.linkedin.com/company/srilankacustoms,International Trade and Development,"1,001-5,000 employees","Colombo, Western Province",Government Agency,1809
269,University of Jaffna,https://lk.linkedin.com/school/university-of-j...,Higher Education,"1,001-5,000 employees","Jaffna, Northern",Educational,1979


In [9]:
# Convert 'organisation' column to lowercase 
df2.iloc[:, 0] = df2.iloc[:, 0].str.lower()

# Merge the DataFrames based on the lowercase 'organisation' column 
merged_df = pd.merge(extracted_data_df, df2, left_on=extracted_data_df.columns[3], right_on=df2.columns[0], how='left')

merged_df

Unnamed: 0,profile_name,organization_profile,position,organisation,start_time,end_time,duration,name,url,industry,size,headquaters,type,founded
0,sagara-lakmal-3b5634103,https://www.linkedin.com/company/mas-holdings,Manager - Data Analytics & Governance,mas holdings,Jan 2023,present,10 months,mas holdings,https://www.linkedin.com/company/mas-holdings,Apparel & Fashion,"10,001+ employees","Colombo, Colombo",Privately Held,1987
1,sagara-lakmal-3b5634103,https://www.linkedin.com/company/ndbbank,Associate Manager - Data Modelling Lead - Corp...,national development bank plc (ndb),Jun 2021,Jan 2023,1 year 8 months,national development bank plc (ndb),https://www.linkedin.com/company/ndbbank,Financial Services,"1,001-5,000 employees",Colombo 02,Public Company,1979
2,sagara-lakmal-3b5634103,https://www.linkedin.com/company/dialog-axiata...,Senior Executive - Dashboarding and Visualizat...,dialog axiata plc,Oct 2019,Jun 2021,1 year 9 months,dialog axiata plc,https://www.linkedin.com/company/dialog-axiata...,Telecommunications,"1,001-5,000 employees","Colombo 2, Western Province",Public Company,1993
3,sagara-lakmal-3b5634103,https://www.linkedin.com/company/mas-holdings,Executive - Data Analytics,mas holdings,Apr 2018,Sep 2019,1 year 6 months,mas holdings,https://www.linkedin.com/company/mas-holdings,Apparel & Fashion,"10,001+ employees","Colombo, Colombo",Privately Held,1987
4,ganguli-wijewardana,https://www.linkedin.com/company/mas-kreeda,Assistant Manager - Operations and Systems Ana...,mas holdings,Aug 2023,present,3 months,mas holdings,https://www.linkedin.com/company/mas-holdings,Apparel & Fashion,"10,001+ employees","Colombo, Colombo",Privately Held,1987
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1692,muditha-ediriweera-mba-cim-mabe-mos-23811284,https://lk.linkedin.com/company/hela-apparel-h...,Senior Merchandiser,hela apparel holdings,Mar 2013,Mar 2014,1 year 1 month,hela apparel holdings,https://www.linkedin.com/company/hela-apparel-...,Retail Apparel and Fashion,"10,001+ employees",Colombo,Public Company,1991
1693,oshadha-dammearachchi-32aa49160,https://lk.linkedin.com/company/gaia-greenener...,Senior Executive - Business Analyst,gaia greenenergy holdings,May 2023,present,8 months,gaia greenenergy holdings,https://lk.linkedin.com/company/gaia-greenener...,Solar Electric Power Generation,11-50 employees,Colombo,Privately Held,
1694,oshadha-dammearachchi-32aa49160,https://lk.linkedin.com/company/brandix,Executive - Finance Business Partner for ICT,brandix,Aug 2022,Apr 2023,9 months,brandix,https://www.linkedin.com/company/brandix,Retail Apparel and Fashion,"10,001+ employees",,Privately Held,2002
1695,oshadha-dammearachchi-32aa49160,https://lk.linkedin.com/company/tnlrn,News Anchor,tnl radio network (pvt) limited,Mar 2019,Aug 2022,3 years 6 months,tnl radio network (pvt) limited,https://lk.linkedin.com/company/tnlrn,Broadcast Media Production and Distribution,51-200 employees,"Colombo 03, Western Province",Privately Held,1993


In [22]:
# # Assign 0 if organisation is empty
# new_df['organisation'] = new_df['organisation'].apply(lambda x: 'unmapped' if x == '' else x)
# new_df

In [10]:
# Select necessary columns
selected_columns = ['profile_name', 'position', 'start_time', 'end_time', 'organisation','duration', 'industry', 'size', 'founded','headquaters']

# Create a new dataFrame with selected columns 
new_df = merged_df[selected_columns]
new_df

Unnamed: 0,profile_name,position,start_time,end_time,organisation,duration,industry,size,founded,headquaters
0,sagara-lakmal-3b5634103,Manager - Data Analytics & Governance,Jan 2023,present,mas holdings,10 months,Apparel & Fashion,"10,001+ employees",1987,"Colombo, Colombo"
1,sagara-lakmal-3b5634103,Associate Manager - Data Modelling Lead - Corp...,Jun 2021,Jan 2023,national development bank plc (ndb),1 year 8 months,Financial Services,"1,001-5,000 employees",1979,Colombo 02
2,sagara-lakmal-3b5634103,Senior Executive - Dashboarding and Visualizat...,Oct 2019,Jun 2021,dialog axiata plc,1 year 9 months,Telecommunications,"1,001-5,000 employees",1993,"Colombo 2, Western Province"
3,sagara-lakmal-3b5634103,Executive - Data Analytics,Apr 2018,Sep 2019,mas holdings,1 year 6 months,Apparel & Fashion,"10,001+ employees",1987,"Colombo, Colombo"
4,ganguli-wijewardana,Assistant Manager - Operations and Systems Ana...,Aug 2023,present,mas holdings,3 months,Apparel & Fashion,"10,001+ employees",1987,"Colombo, Colombo"
...,...,...,...,...,...,...,...,...,...,...
1692,muditha-ediriweera-mba-cim-mabe-mos-23811284,Senior Merchandiser,Mar 2013,Mar 2014,hela apparel holdings,1 year 1 month,Retail Apparel and Fashion,"10,001+ employees",1991,Colombo
1693,oshadha-dammearachchi-32aa49160,Senior Executive - Business Analyst,May 2023,present,gaia greenenergy holdings,8 months,Solar Electric Power Generation,11-50 employees,,Colombo
1694,oshadha-dammearachchi-32aa49160,Executive - Finance Business Partner for ICT,Aug 2022,Apr 2023,brandix,9 months,Retail Apparel and Fashion,"10,001+ employees",2002,
1695,oshadha-dammearachchi-32aa49160,News Anchor,Mar 2019,Aug 2022,tnl radio network (pvt) limited,3 years 6 months,Broadcast Media Production and Distribution,51-200 employees,1993,"Colombo 03, Western Province"


# Map Headquaters

In [11]:
# Cleaning headquaters (Removing numbers and strings after comma).
def clean_headquarters(value, part):
    if pd.notna(value):
        parts = value.split(',')
        if len(parts) > part:
            return ''.join(filter(str.isalpha, parts[part]))
    return value

new_df['headquaters_part1'] = new_df['headquaters'].apply(lambda x: clean_headquarters(x, 0))
new_df['headquaters_part2'] = new_df['headquaters'].apply(lambda x: clean_headquarters(x, 1))
new_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['headquaters_part1'] = new_df['headquaters'].apply(lambda x: clean_headquarters(x, 0))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['headquaters_part2'] = new_df['headquaters'].apply(lambda x: clean_headquarters(x, 1))


Unnamed: 0,profile_name,position,start_time,end_time,organisation,duration,industry,size,founded,headquaters,headquaters_part1,headquaters_part2
0,sagara-lakmal-3b5634103,Manager - Data Analytics & Governance,Jan 2023,present,mas holdings,10 months,Apparel & Fashion,"10,001+ employees",1987,"Colombo, Colombo",Colombo,Colombo
1,sagara-lakmal-3b5634103,Associate Manager - Data Modelling Lead - Corp...,Jun 2021,Jan 2023,national development bank plc (ndb),1 year 8 months,Financial Services,"1,001-5,000 employees",1979,Colombo 02,Colombo,Colombo 02
2,sagara-lakmal-3b5634103,Senior Executive - Dashboarding and Visualizat...,Oct 2019,Jun 2021,dialog axiata plc,1 year 9 months,Telecommunications,"1,001-5,000 employees",1993,"Colombo 2, Western Province",Colombo,WesternProvince
3,sagara-lakmal-3b5634103,Executive - Data Analytics,Apr 2018,Sep 2019,mas holdings,1 year 6 months,Apparel & Fashion,"10,001+ employees",1987,"Colombo, Colombo",Colombo,Colombo
4,ganguli-wijewardana,Assistant Manager - Operations and Systems Ana...,Aug 2023,present,mas holdings,3 months,Apparel & Fashion,"10,001+ employees",1987,"Colombo, Colombo",Colombo,Colombo
...,...,...,...,...,...,...,...,...,...,...,...,...
1692,muditha-ediriweera-mba-cim-mabe-mos-23811284,Senior Merchandiser,Mar 2013,Mar 2014,hela apparel holdings,1 year 1 month,Retail Apparel and Fashion,"10,001+ employees",1991,Colombo,Colombo,Colombo
1693,oshadha-dammearachchi-32aa49160,Senior Executive - Business Analyst,May 2023,present,gaia greenenergy holdings,8 months,Solar Electric Power Generation,11-50 employees,,Colombo,Colombo,Colombo
1694,oshadha-dammearachchi-32aa49160,Executive - Finance Business Partner for ICT,Aug 2022,Apr 2023,brandix,9 months,Retail Apparel and Fashion,"10,001+ employees",2002,,,
1695,oshadha-dammearachchi-32aa49160,News Anchor,Mar 2019,Aug 2022,tnl radio network (pvt) limited,3 years 6 months,Broadcast Media Production and Distribution,51-200 employees,1993,"Colombo 03, Western Province",Colombo,WesternProvince


In [12]:
# # Defining country mapping function
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut

def get_country_from_city(city_name):
    geolocator = Nominatim(user_agent="city-to-country")
    
    try:
        location = geolocator.geocode(city_name, timeout=10)  # Adjust the timeout value as needed
        if location:
            country_name = location.address.split(",")[-1].strip()
            if country_name == "ශ්‍රී ලංකාව இலங்கை":
                country_name = "Sri Lanka"
            elif country_name == "Italia":
                country_name = "Unknown"
            return country_name
    except GeocoderTimedOut:
        print("Geocoding service timed out. Retrying...")
        return get_country_from_city(city_name)

    return 'unidentified'

In [14]:
# Assuming df is your original DataFrame
df_list = []

# Define the maximum number of rows per smaller DataFrame
max_rows_per_df = 1000

# Calculate the number of smaller DataFrames needed
num_dfs = len(new_df) // max_rows_per_df + 1

# Split the DataFrame into smaller DataFrames
for i in range(num_dfs):
    start_index = i * max_rows_per_df
    end_index = (i + 1) * max_rows_per_df
    smaller_df = new_df.iloc[start_index:end_index]
    df_list.append(smaller_df)
    
num_dfs

2

In [16]:
df_list[1]['headquaters_part1'] = df_list[1]['headquaters_part1'].apply(lambda city: get_country_from_city(city))
df_list[1]['headquaters_part2'] = df_list[1]['headquaters_part2'].apply(lambda city: get_country_from_city(city))
df_list[1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_list[1]['headquaters_part1'] = df_list[1]['headquaters_part1'].apply(lambda city: get_country_from_city(city))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_list[1]['headquaters_part2'] = df_list[1]['headquaters_part2'].apply(lambda city: get_country_from_city(city))


Unnamed: 0,profile_name,position,start_time,end_time,organisation,duration,industry,size,founded,headquaters,headquaters_part1,headquaters_part2
1000,uvini-athukorala-581b64226,Assistant Manager- Circularity Business Partn...,Oct 2022,present,mas holdings,1 year 3 months,Apparel & Fashion,"10,001+ employees",1987,"Colombo, Colombo",Sri Lanka,Sri Lanka
1001,uvini-athukorala-581b64226,Senior Executive - Environmental Sustainability,Apr 2021,Oct 2022,mas holdings,1 year 7 months,Apparel & Fashion,"10,001+ employees",1987,"Colombo, Colombo",Sri Lanka,Sri Lanka
1002,uvini-athukorala-581b64226,Executive - Environmental Sustainability,Dec 2017,Apr 2021,mas holdings,3 years 5 months,Apparel & Fashion,"10,001+ employees",1987,"Colombo, Colombo",Sri Lanka,Sri Lanka
1003,uvini-athukorala-581b64226,Trainee- Environmental Sustainability,May 2017,Nov 2017,mas holdings,7 months,Apparel & Fashion,"10,001+ employees",1987,"Colombo, Colombo",Sri Lanka,Sri Lanka
1004,kaanjan-selvarajah-404a02a9,Manager Merchandising,Jun 2015,Oct 2016,brandix,1 year 5 months,Retail Apparel and Fashion,"10,001+ employees",2002,,unidentified,unidentified
...,...,...,...,...,...,...,...,...,...,...,...,...
1692,muditha-ediriweera-mba-cim-mabe-mos-23811284,Senior Merchandiser,Mar 2013,Mar 2014,hela apparel holdings,1 year 1 month,Retail Apparel and Fashion,"10,001+ employees",1991,Colombo,Sri Lanka,Sri Lanka
1693,oshadha-dammearachchi-32aa49160,Senior Executive - Business Analyst,May 2023,present,gaia greenenergy holdings,8 months,Solar Electric Power Generation,11-50 employees,,Colombo,Sri Lanka,Sri Lanka
1694,oshadha-dammearachchi-32aa49160,Executive - Finance Business Partner for ICT,Aug 2022,Apr 2023,brandix,9 months,Retail Apparel and Fashion,"10,001+ employees",2002,,unidentified,unidentified
1695,oshadha-dammearachchi-32aa49160,News Anchor,Mar 2019,Aug 2022,tnl radio network (pvt) limited,3 years 6 months,Broadcast Media Production and Distribution,51-200 employees,1993,"Colombo 03, Western Province",Sri Lanka,unidentified


In [17]:
result_df = pd.concat(df_list, ignore_index=True)
result_df

Unnamed: 0,profile_name,position,start_time,end_time,organisation,duration,industry,size,founded,headquaters,headquaters_part1,headquaters_part2
0,sagara-lakmal-3b5634103,Manager - Data Analytics & Governance,Jan 2023,present,mas holdings,10 months,Apparel & Fashion,"10,001+ employees",1987,"Colombo, Colombo",Sri Lanka,Sri Lanka
1,sagara-lakmal-3b5634103,Associate Manager - Data Modelling Lead - Corp...,Jun 2021,Jan 2023,national development bank plc (ndb),1 year 8 months,Financial Services,"1,001-5,000 employees",1979,Colombo 02,Sri Lanka,Unknown
2,sagara-lakmal-3b5634103,Senior Executive - Dashboarding and Visualizat...,Oct 2019,Jun 2021,dialog axiata plc,1 year 9 months,Telecommunications,"1,001-5,000 employees",1993,"Colombo 2, Western Province",Sri Lanka,unidentified
3,sagara-lakmal-3b5634103,Executive - Data Analytics,Apr 2018,Sep 2019,mas holdings,1 year 6 months,Apparel & Fashion,"10,001+ employees",1987,"Colombo, Colombo",Sri Lanka,Sri Lanka
4,ganguli-wijewardana,Assistant Manager - Operations and Systems Ana...,Aug 2023,present,mas holdings,3 months,Apparel & Fashion,"10,001+ employees",1987,"Colombo, Colombo",Sri Lanka,Sri Lanka
...,...,...,...,...,...,...,...,...,...,...,...,...
1692,muditha-ediriweera-mba-cim-mabe-mos-23811284,Senior Merchandiser,Mar 2013,Mar 2014,hela apparel holdings,1 year 1 month,Retail Apparel and Fashion,"10,001+ employees",1991,Colombo,Sri Lanka,Sri Lanka
1693,oshadha-dammearachchi-32aa49160,Senior Executive - Business Analyst,May 2023,present,gaia greenenergy holdings,8 months,Solar Electric Power Generation,11-50 employees,,Colombo,Sri Lanka,Sri Lanka
1694,oshadha-dammearachchi-32aa49160,Executive - Finance Business Partner for ICT,Aug 2022,Apr 2023,brandix,9 months,Retail Apparel and Fashion,"10,001+ employees",2002,,unidentified,unidentified
1695,oshadha-dammearachchi-32aa49160,News Anchor,Mar 2019,Aug 2022,tnl radio network (pvt) limited,3 years 6 months,Broadcast Media Production and Distribution,51-200 employees,1993,"Colombo 03, Western Province",Sri Lanka,unidentified


In [18]:
# # Encoding sri_lankan
# new_df['headquaters'] = new_df['headquaters'].apply(lambda x: 1 if 'Sri Lanka' in x else 0)
# new_df

def is_in_sri_lanka(row):
    if 'Sri Lanka' in row['headquaters_part1'] or 'Sri Lanka' in row['headquaters_part2']:
        return 1
    else:
        return 0

result_df['headquaters'] = result_df.apply(is_in_sri_lanka, axis=1)

In [19]:
result_df.to_json('head.json', orient='records')

In [20]:
head_df = pd.read_json('head.json')
head_df

Unnamed: 0,profile_name,position,start_time,end_time,organisation,duration,industry,size,founded,headquaters,headquaters_part1,headquaters_part2
0,sagara-lakmal-3b5634103,Manager - Data Analytics & Governance,2023-01-01,present,mas holdings,10 months,Apparel & Fashion,"10,001+ employees",1987,1,Sri Lanka,Sri Lanka
1,sagara-lakmal-3b5634103,Associate Manager - Data Modelling Lead - Corp...,2021-06-01,Jan 2023,national development bank plc (ndb),1 year 8 months,Financial Services,"1,001-5,000 employees",1979,1,Sri Lanka,Unknown
2,sagara-lakmal-3b5634103,Senior Executive - Dashboarding and Visualizat...,2019-10-01,Jun 2021,dialog axiata plc,1 year 9 months,Telecommunications,"1,001-5,000 employees",1993,1,Sri Lanka,unidentified
3,sagara-lakmal-3b5634103,Executive - Data Analytics,2018-04-01,Sep 2019,mas holdings,1 year 6 months,Apparel & Fashion,"10,001+ employees",1987,1,Sri Lanka,Sri Lanka
4,ganguli-wijewardana,Assistant Manager - Operations and Systems Ana...,2023-08-01,present,mas holdings,3 months,Apparel & Fashion,"10,001+ employees",1987,1,Sri Lanka,Sri Lanka
...,...,...,...,...,...,...,...,...,...,...,...,...
1692,muditha-ediriweera-mba-cim-mabe-mos-23811284,Senior Merchandiser,2013-03-01,Mar 2014,hela apparel holdings,1 year 1 month,Retail Apparel and Fashion,"10,001+ employees",1991,1,Sri Lanka,Sri Lanka
1693,oshadha-dammearachchi-32aa49160,Senior Executive - Business Analyst,2023-05-01,present,gaia greenenergy holdings,8 months,Solar Electric Power Generation,11-50 employees,,1,Sri Lanka,Sri Lanka
1694,oshadha-dammearachchi-32aa49160,Executive - Finance Business Partner for ICT,2022-08-01,Apr 2023,brandix,9 months,Retail Apparel and Fashion,"10,001+ employees",2002,0,unidentified,unidentified
1695,oshadha-dammearachchi-32aa49160,News Anchor,2019-03-01,Aug 2022,tnl radio network (pvt) limited,3 years 6 months,Broadcast Media Production and Distribution,51-200 employees,1993,1,Sri Lanka,unidentified


In [21]:
# # Define a function to convert duration to months
# def duration_to_months(duration):
#     # Check if the duration value is entirely a string
#     if not duration.replace(' ', '').isnumeric():
#         if "less than a year" in duration.lower():
#             return 0  
        
#     # Seperate a string such as '2 years' to parts such as 2 and years
#     total_months = 0
#     parts = duration.split()
    
#     # Loop through every item. Since one item has two parts. Loop jumps every two item
#     for i in range(0, len(parts), 2):
#         # Numeric part ix converted to int and stored in variable value.
#         value = int(parts[i])
#         # Second part is converted to lowercase and store in variable unit
#         unit = parts[i + 1].lower()
        
#         # Check the unit and add relevant number of months according to years or months
#         if 'year' in unit:
#             total_months += value * 12
#         elif 'month' in unit:
#             total_months += value

#     return total_months

def duration_to_months(duration):
    # Check if the duration value is a string
    if isinstance(duration, str):
        if "less than a year" in duration.lower():
            return 0

        # Seperate a string such as '2 years' to parts such as 2 and years
        total_months = 0
        parts = duration.split()

        # Loop through every item. Since one item has two parts. Loop jumps every two item
        for i in range(0, len(parts), 2):
            # Numeric part ix converted to int and stored in variable value.
            value = int(parts[i])
            # Second part is converted to lowercase and store in variable unit
            unit = parts[i + 1].lower()

            # Check the unit and add relevant number of months according to years or months
            if 'year' in unit:
                total_months += value * 12
            elif 'month' in unit:
                total_months += value

        return total_months
    else:
        # If it's already an integer, assume it's in months
        return duration

# Convert durations to month by calling the function on Duration column 
head_df['duration'] = head_df['duration'].apply(duration_to_months)
head_df

Unnamed: 0,profile_name,position,start_time,end_time,organisation,duration,industry,size,founded,headquaters,headquaters_part1,headquaters_part2
0,sagara-lakmal-3b5634103,Manager - Data Analytics & Governance,2023-01-01,present,mas holdings,10,Apparel & Fashion,"10,001+ employees",1987,1,Sri Lanka,Sri Lanka
1,sagara-lakmal-3b5634103,Associate Manager - Data Modelling Lead - Corp...,2021-06-01,Jan 2023,national development bank plc (ndb),20,Financial Services,"1,001-5,000 employees",1979,1,Sri Lanka,Unknown
2,sagara-lakmal-3b5634103,Senior Executive - Dashboarding and Visualizat...,2019-10-01,Jun 2021,dialog axiata plc,21,Telecommunications,"1,001-5,000 employees",1993,1,Sri Lanka,unidentified
3,sagara-lakmal-3b5634103,Executive - Data Analytics,2018-04-01,Sep 2019,mas holdings,18,Apparel & Fashion,"10,001+ employees",1987,1,Sri Lanka,Sri Lanka
4,ganguli-wijewardana,Assistant Manager - Operations and Systems Ana...,2023-08-01,present,mas holdings,3,Apparel & Fashion,"10,001+ employees",1987,1,Sri Lanka,Sri Lanka
...,...,...,...,...,...,...,...,...,...,...,...,...
1692,muditha-ediriweera-mba-cim-mabe-mos-23811284,Senior Merchandiser,2013-03-01,Mar 2014,hela apparel holdings,13,Retail Apparel and Fashion,"10,001+ employees",1991,1,Sri Lanka,Sri Lanka
1693,oshadha-dammearachchi-32aa49160,Senior Executive - Business Analyst,2023-05-01,present,gaia greenenergy holdings,8,Solar Electric Power Generation,11-50 employees,,1,Sri Lanka,Sri Lanka
1694,oshadha-dammearachchi-32aa49160,Executive - Finance Business Partner for ICT,2022-08-01,Apr 2023,brandix,9,Retail Apparel and Fashion,"10,001+ employees",2002,0,unidentified,unidentified
1695,oshadha-dammearachchi-32aa49160,News Anchor,2019-03-01,Aug 2022,tnl radio network (pvt) limited,42,Broadcast Media Production and Distribution,51-200 employees,1993,1,Sri Lanka,unidentified


# Transorming dates and adding recency columns

In [22]:
import datetime

# Define the reference date
reference_date = pd.to_datetime('2023-01-01')

# Function to convert month-year strings to datetime objects (omitting the day)
# def convert_month_year_to_date(value):
#     try:
#         date = pd.to_datetime(value, format='%b %Y')
#         date = date.replace(day=1, month=1)
#         return date
#     except ValueError:
#         return None
    
# def convert_month_year_to_date(value):
#     try:
#         if len(value) == 4:
#             # Assume it's in "yyyy" format
#             date = pd.to_datetime(value, format='%Y')
#         elif len(value) > 4:
#             # Assume it's in "Mon yyyy" format
#             date = pd.to_datetime(value, format='%b %Y')
#         else:
#             # If the length doesn't match either format, return None
#             date = None
#         return date
#     except ValueError:
#         return None

def convert_month_year_to_date(value):
    try:
        if isinstance(value, str):
            if len(value) == 4:
                # Assume it's in "yyyy" format
                date = pd.to_datetime(value, format='%Y')
            elif len(value) > 4:
                # Assume it's in "Mon yyyy" format
                date = pd.to_datetime(value, format='%b %Y')
            else:
                # If the length doesn't match either format, return None
                date = None
        else:
            # If it's not a string, assume it's already a datetime object
            date = value
        
        return date
    except ValueError:
        return None

# Convet start time
head_df['start_time'] = head_df['start_time'].apply(convert_month_year_to_date)

# Set 'End Time' to January 1st, 2023, when marked as "present"
# new_df['end_time'] = new_df.apply(lambda row: pd.to_datetime('2023-01-01') if row['end_time'] == 'present' else convert_month_year_to_date(row['end_time']), axis=1)
head_df['end_time'] = head_df.apply(
    lambda row: pd.to_datetime('2023-01-01') if (row['end_time'] == 'present' or pd.to_datetime(row['end_time']) > pd.to_datetime('2023-01-01')) else convert_month_year_to_date(row['end_time']),
    axis=1
)

head_df

Unnamed: 0,profile_name,position,start_time,end_time,organisation,duration,industry,size,founded,headquaters,headquaters_part1,headquaters_part2
0,sagara-lakmal-3b5634103,Manager - Data Analytics & Governance,2023-01-01,2023-01-01,mas holdings,10,Apparel & Fashion,"10,001+ employees",1987,1,Sri Lanka,Sri Lanka
1,sagara-lakmal-3b5634103,Associate Manager - Data Modelling Lead - Corp...,2021-06-01,2023-01-01,national development bank plc (ndb),20,Financial Services,"1,001-5,000 employees",1979,1,Sri Lanka,Unknown
2,sagara-lakmal-3b5634103,Senior Executive - Dashboarding and Visualizat...,2019-10-01,2021-06-01,dialog axiata plc,21,Telecommunications,"1,001-5,000 employees",1993,1,Sri Lanka,unidentified
3,sagara-lakmal-3b5634103,Executive - Data Analytics,2018-04-01,2019-09-01,mas holdings,18,Apparel & Fashion,"10,001+ employees",1987,1,Sri Lanka,Sri Lanka
4,ganguli-wijewardana,Assistant Manager - Operations and Systems Ana...,2023-08-01,2023-01-01,mas holdings,3,Apparel & Fashion,"10,001+ employees",1987,1,Sri Lanka,Sri Lanka
...,...,...,...,...,...,...,...,...,...,...,...,...
1692,muditha-ediriweera-mba-cim-mabe-mos-23811284,Senior Merchandiser,2013-03-01,2014-03-01,hela apparel holdings,13,Retail Apparel and Fashion,"10,001+ employees",1991,1,Sri Lanka,Sri Lanka
1693,oshadha-dammearachchi-32aa49160,Senior Executive - Business Analyst,2023-05-01,2023-01-01,gaia greenenergy holdings,8,Solar Electric Power Generation,11-50 employees,,1,Sri Lanka,Sri Lanka
1694,oshadha-dammearachchi-32aa49160,Executive - Finance Business Partner for ICT,2022-08-01,2023-01-01,brandix,9,Retail Apparel and Fashion,"10,001+ employees",2002,0,unidentified,unidentified
1695,oshadha-dammearachchi-32aa49160,News Anchor,2019-03-01,2022-08-01,tnl radio network (pvt) limited,42,Broadcast Media Production and Distribution,51-200 employees,1993,1,Sri Lanka,unidentified


In [23]:
# Filter out rows that started in January 2023
head_df = head_df[head_df['start_time'].dt.year != 2023]

# Calculate the 'start_recency_months'
head_df['start_recency_months'] = (reference_date - head_df['start_time']).dt.days // 30

# Calculate the 'end_recency_months'
head_df['end_recency_months'] = (reference_date - head_df['end_time']).dt.days // 30

head_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  head_df['start_recency_months'] = (reference_date - head_df['start_time']).dt.days // 30
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  head_df['end_recency_months'] = (reference_date - head_df['end_time']).dt.days // 30


Unnamed: 0,profile_name,position,start_time,end_time,organisation,duration,industry,size,founded,headquaters,headquaters_part1,headquaters_part2,start_recency_months,end_recency_months
1,sagara-lakmal-3b5634103,Associate Manager - Data Modelling Lead - Corp...,2021-06-01,2023-01-01,national development bank plc (ndb),20,Financial Services,"1,001-5,000 employees",1979,1,Sri Lanka,Unknown,19,0
2,sagara-lakmal-3b5634103,Senior Executive - Dashboarding and Visualizat...,2019-10-01,2021-06-01,dialog axiata plc,21,Telecommunications,"1,001-5,000 employees",1993,1,Sri Lanka,unidentified,39,19
3,sagara-lakmal-3b5634103,Executive - Data Analytics,2018-04-01,2019-09-01,mas holdings,18,Apparel & Fashion,"10,001+ employees",1987,1,Sri Lanka,Sri Lanka,57,40
5,ganguli-wijewardana,Senior Executive - Operations & Systems Analyst,2022-04-01,2023-01-01,mas holdings,17,Apparel & Fashion,"10,001+ employees",1987,1,Sri Lanka,Sri Lanka,9,0
6,ganguli-wijewardana,Executive - Operations & Systems Analyst,2019-01-01,2022-03-01,mas holdings,39,Apparel & Fashion,"10,001+ employees",1987,1,Sri Lanka,Sri Lanka,48,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1691,muditha-ediriweera-mba-cim-mabe-mos-23811284,Business Manager,2014-03-01,2018-04-01,hela apparel holdings,50,Retail Apparel and Fashion,"10,001+ employees",1991,1,Sri Lanka,Sri Lanka,107,57
1692,muditha-ediriweera-mba-cim-mabe-mos-23811284,Senior Merchandiser,2013-03-01,2014-03-01,hela apparel holdings,13,Retail Apparel and Fashion,"10,001+ employees",1991,1,Sri Lanka,Sri Lanka,119,107
1694,oshadha-dammearachchi-32aa49160,Executive - Finance Business Partner for ICT,2022-08-01,2023-01-01,brandix,9,Retail Apparel and Fashion,"10,001+ employees",2002,0,unidentified,unidentified,5,0
1695,oshadha-dammearachchi-32aa49160,News Anchor,2019-03-01,2022-08-01,tnl radio network (pvt) limited,42,Broadcast Media Production and Distribution,51-200 employees,1993,1,Sri Lanka,unidentified,46,5


# Creating apparel_ industry column

In [24]:
# Create the 'apparel_industry' column
head_df['apparel_industry'] = head_df['industry'].apply(lambda x: 1 if 'apparel' in str(x).lower() else 0)
head_df[['profile_name', 'organisation', 'industry', 'apparel_industry']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  head_df['apparel_industry'] = head_df['industry'].apply(lambda x: 1 if 'apparel' in str(x).lower() else 0)


Unnamed: 0,profile_name,organisation,industry,apparel_industry
1,sagara-lakmal-3b5634103,national development bank plc (ndb),Financial Services,0
2,sagara-lakmal-3b5634103,dialog axiata plc,Telecommunications,0
3,sagara-lakmal-3b5634103,mas holdings,Apparel & Fashion,1
5,ganguli-wijewardana,mas holdings,Apparel & Fashion,1
6,ganguli-wijewardana,mas holdings,Apparel & Fashion,1
...,...,...,...,...
1691,muditha-ediriweera-mba-cim-mabe-mos-23811284,hela apparel holdings,Retail Apparel and Fashion,1
1692,muditha-ediriweera-mba-cim-mabe-mos-23811284,hela apparel holdings,Retail Apparel and Fashion,1
1694,oshadha-dammearachchi-32aa49160,brandix,Retail Apparel and Fashion,1
1695,oshadha-dammearachchi-32aa49160,tnl radio network (pvt) limited,Broadcast Media Production and Distribution,0


# Creating company_size column

In [25]:
# Create the 'company_size' column based on the 'size' 
size_mapping = {
    '1,000 - employees': 1,
    '1,001-5,000 employees': 2,
    '5,001-10,000 employees': 3,
    '10,001+ employees': 4
}

head_df['company_size'] = head_df['size'].map(size_mapping)
head_df[['profile_name', 'organisation', 'size', 'company_size']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  head_df['company_size'] = head_df['size'].map(size_mapping)


Unnamed: 0,profile_name,organisation,size,company_size
1,sagara-lakmal-3b5634103,national development bank plc (ndb),"1,001-5,000 employees",2.0
2,sagara-lakmal-3b5634103,dialog axiata plc,"1,001-5,000 employees",2.0
3,sagara-lakmal-3b5634103,mas holdings,"10,001+ employees",4.0
5,ganguli-wijewardana,mas holdings,"10,001+ employees",4.0
6,ganguli-wijewardana,mas holdings,"10,001+ employees",4.0
...,...,...,...,...
1691,muditha-ediriweera-mba-cim-mabe-mos-23811284,hela apparel holdings,"10,001+ employees",4.0
1692,muditha-ediriweera-mba-cim-mabe-mos-23811284,hela apparel holdings,"10,001+ employees",4.0
1694,oshadha-dammearachchi-32aa49160,brandix,"10,001+ employees",4.0
1695,oshadha-dammearachchi-32aa49160,tnl radio network (pvt) limited,51-200 employees,


# Creating company_age_years column

In [26]:
# Calculate the 'company_age_years' column,
head_df['founded'] = pd.to_numeric(head_df['founded'])
head_df['company_age_years'] = head_df['founded'].apply(lambda x: 0 if x == 0 else 2023 - x)

head_df[['profile_name', 'organisation', 'founded', 'company_age_years']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  head_df['founded'] = pd.to_numeric(head_df['founded'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  head_df['company_age_years'] = head_df['founded'].apply(lambda x: 0 if x == 0 else 2023 - x)


Unnamed: 0,profile_name,organisation,founded,company_age_years
1,sagara-lakmal-3b5634103,national development bank plc (ndb),1979.0,44.0
2,sagara-lakmal-3b5634103,dialog axiata plc,1993.0,30.0
3,sagara-lakmal-3b5634103,mas holdings,1987.0,36.0
5,ganguli-wijewardana,mas holdings,1987.0,36.0
6,ganguli-wijewardana,mas holdings,1987.0,36.0
...,...,...,...,...
1691,muditha-ediriweera-mba-cim-mabe-mos-23811284,hela apparel holdings,1991.0,32.0
1692,muditha-ediriweera-mba-cim-mabe-mos-23811284,hela apparel holdings,1991.0,32.0
1694,oshadha-dammearachchi-32aa49160,brandix,2002.0,21.0
1695,oshadha-dammearachchi-32aa49160,tnl radio network (pvt) limited,1993.0,30.0


In [27]:
# Create a new DataFrame with selected columns
selected_columns = ['profile_name', 'position', 'start_time', 'organisation' ,'duration', 'start_recency_months', 'end_recency_months', 'apparel_industry', 'company_size','headquaters', 'company_age_years']
new_selected_df = head_df[selected_columns]

new_selected_df

Unnamed: 0,profile_name,position,start_time,organisation,duration,start_recency_months,end_recency_months,apparel_industry,company_size,headquaters,company_age_years
1,sagara-lakmal-3b5634103,Associate Manager - Data Modelling Lead - Corp...,2021-06-01,national development bank plc (ndb),20,19,0,0,2.0,1,44.0
2,sagara-lakmal-3b5634103,Senior Executive - Dashboarding and Visualizat...,2019-10-01,dialog axiata plc,21,39,19,0,2.0,1,30.0
3,sagara-lakmal-3b5634103,Executive - Data Analytics,2018-04-01,mas holdings,18,57,40,1,4.0,1,36.0
5,ganguli-wijewardana,Senior Executive - Operations & Systems Analyst,2022-04-01,mas holdings,17,9,0,1,4.0,1,36.0
6,ganguli-wijewardana,Executive - Operations & Systems Analyst,2019-01-01,mas holdings,39,48,10,1,4.0,1,36.0
...,...,...,...,...,...,...,...,...,...,...,...
1691,muditha-ediriweera-mba-cim-mabe-mos-23811284,Business Manager,2014-03-01,hela apparel holdings,50,107,57,1,4.0,1,32.0
1692,muditha-ediriweera-mba-cim-mabe-mos-23811284,Senior Merchandiser,2013-03-01,hela apparel holdings,13,119,107,1,4.0,1,32.0
1694,oshadha-dammearachchi-32aa49160,Executive - Finance Business Partner for ICT,2022-08-01,brandix,9,5,0,1,4.0,0,21.0
1695,oshadha-dammearachchi-32aa49160,News Anchor,2019-03-01,tnl radio network (pvt) limited,42,46,5,0,,1,30.0


# Get company change

In [28]:
# Sort the dataframe vy profile and start time
new_selected_df.sort_values(by=['profile_name', 'start_time'], ascending=[False, False], inplace=True)

# Reset indexes
new_selected_df = new_selected_df.reset_index(drop=True)

new_selected_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_selected_df.sort_values(by=['profile_name', 'start_time'], ascending=[False, False], inplace=True)


Unnamed: 0,profile_name,position,start_time,organisation,duration,start_recency_months,end_recency_months,apparel_industry,company_size,headquaters,company_age_years
0,zeenath-ahamed-3678a4b8,Manager - Organization Structure and Design,2021-04-01,mas holdings,32,21,0,1,4.0,1,36.0
1,zeenath-ahamed-3678a4b8,Assistant Manager - Organizational Strategy De...,2018-07-01,mas holdings,34,54,21,1,4.0,1,36.0
2,zeenath-ahamed-3678a4b8,Assistant Manager - Strategic Planning,2017-01-01,mas holdings,18,73,55,1,4.0,1,36.0
3,zeenath-ahamed-3678a4b8,Assistant Manager - Business Analysis and Plan...,2010-06-01,mas holdings,79,153,74,1,4.0,1,36.0
4,zeenath-ahamed-3678a4b8,Business Analyst / Senior Business Analyst,2005-06-01,mas holdings,55,214,159,1,4.0,1,36.0
...,...,...,...,...,...,...,...,...,...,...,...
1532,aaliya-mohamed-57aa79128,Senior Designer,2017-09-01,mas holdings,43,64,22,1,4.0,1,36.0
1533,aaliya-mohamed-57aa79128,Founder,2017-01-01,tofro,25,73,48,0,,0,
1534,aaliya-mohamed-57aa79128,Technical Design advisor,2016-08-01,the rib,20,78,58,0,,0,
1535,aaliya-mohamed-57aa79128,Designer,2013-01-01,mas holdings,46,121,76,1,4.0,1,36.0


In [29]:
# Initialize an empty list to store the values for the new "company_change" column
company_change = []

# Initialize a variable to keep track of the current profile
current_profile = None
previous_organization = None

# Iterate over the rows of the DataFrame in reverse order
for index in reversed(new_selected_df.index):
    row = new_selected_df.loc[index]
    if row['profile_name'] != current_profile:
        # If the profile has changed, set the company_change value to -1
        company_change.insert(0, -1)
        current_profile = row['profile_name']
        previous_organization = row['organisation']
#         previous_mas_status = row['is_mas_company']
    else:
        # Check if the organization has changed compared to the next row
        # next_row = new_selected_df.loc[new_selected_df.index[new_selected_df.index.get_loc(index) - 1]]
        if row['organisation'] != previous_organization: # and not (row['is_mas_company'] == 1 and previous_mas_status == 1):
            company_change.insert(0, 1)  # Organization changed
        else:
            if previous_organization is None:
                company_change.insert(0, -1)  # First organization in profile
            else:
                company_change.insert(0, 0)  # Organization did not change
        previous_organization = row['organisation']
#         previous_mas_status = row['is_mas_company']

# Add the "company_change" column to the DataFrame
new_selected_df['company_change'] = company_change

new_selected_df[['profile_name', 'organisation', 'company_change']].head(50)

Unnamed: 0,profile_name,organisation,company_change
0,zeenath-ahamed-3678a4b8,mas holdings,0
1,zeenath-ahamed-3678a4b8,mas holdings,0
2,zeenath-ahamed-3678a4b8,mas holdings,0
3,zeenath-ahamed-3678a4b8,mas holdings,0
4,zeenath-ahamed-3678a4b8,mas holdings,-1
5,yuwin-mestrige-9a15361a9,mas holdings,0
6,yuwin-mestrige-9a15361a9,mas holdings,0
7,yuwin-mestrige-9a15361a9,mas holdings,0
8,yuwin-mestrige-9a15361a9,mas holdings,-1
9,yoshan-de-zoysa,mas holdings,0


In [30]:
def calculate_cumulative_company_changes(df):
    # Initialize a dictionary to store the cumulative counts for each profile
    cumulative_counts = {}
    
    # Initialize a dictionary to store the set of previous organizations for each profile
    previous_organizations = {}
    
    # Initialize a variable to keep track of the current profile
    current_profile = None
    
    # Initialize a variable to store the cumulative count
    cumulative_count = 0
    
    # Iterate over the rows of the DataFrame in reverse order
    for index in reversed(df.index):
        row = df.loc[index]
        if row['company_change'] == -1:
            # If the company change is -1, reset the cumulative count to 1
            cumulative_count = 1
            current_profile = row['profile_name']
            previous_organizations[current_profile] = set()
        else:
            if row['profile_name'] != current_profile:
                # If the profile has changed, reset the cumulative count to 1
                cumulative_count = 1
                current_profile = row['profile_name']
                previous_organizations[current_profile] = set()
            
            # Check if the current organization has not occurred in previous rows of the current profile
            if row['company_change'] == 1 and row['organisation'] not in previous_organizations[current_profile]:
                cumulative_count += 1
                
        # Store the cumulative count for the current row in the dictionary
        cumulative_counts[index] = cumulative_count
        
        # Add the current organization to the set of previous organizations for the current profile
        previous_organizations[current_profile].add(row['organisation'])
    
    # Create a list of cumulative counts based on the DataFrame rows
    cumulative_count_list = [cumulative_counts[index] for index in df.index]
    
    # Add the cumulative count as a new column in the DataFrame
    df['cum_no_of_companies'] = cumulative_count_list

    return df

# Call the function to calculate cumulative company changes
new_selected_cum_df = calculate_cumulative_company_changes(new_selected_df)
new_selected_cum_df[['profile_name', 'organisation', 'company_change', 'cum_no_of_companies']].head(50)

Unnamed: 0,profile_name,organisation,company_change,cum_no_of_companies
0,zeenath-ahamed-3678a4b8,mas holdings,0,1
1,zeenath-ahamed-3678a4b8,mas holdings,0,1
2,zeenath-ahamed-3678a4b8,mas holdings,0,1
3,zeenath-ahamed-3678a4b8,mas holdings,0,1
4,zeenath-ahamed-3678a4b8,mas holdings,-1,1
5,yuwin-mestrige-9a15361a9,mas holdings,0,1
6,yuwin-mestrige-9a15361a9,mas holdings,0,1
7,yuwin-mestrige-9a15361a9,mas holdings,0,1
8,yuwin-mestrige-9a15361a9,mas holdings,-1,1
9,yoshan-de-zoysa,mas holdings,0,3


In [66]:
grade_mapping_df = pd.read_excel("grade_mapping.xlsx")
designation_mapping_df = pd.read_excel("designation mapping.xlsx")

# Convert the relevant columns to lowercase for case-insensitive matching
grade_mapping_df['Grade'] = grade_mapping_df['Grade'].str.lower()
designation_mapping_df['Designation'] = designation_mapping_df['Designation'].str.lower()
new_selected_cum_df['position'] = new_selected_cum_df['position'].str.lower()

# Sort data frames by the length of strings in descending order
grade_mapping_df = grade_mapping_df.sort_values(by='Grade', key=lambda x: x.str.len(), ascending=False)
designation_mapping_df = designation_mapping_df.sort_values(by='Designation', key=lambda x: x.str.len(), ascending=False)

# Define a function to map the job position to the corresponding job level
def get_job_level(position):
    
    if "senior" in position and "manager" not in position and "director" not in position:
        return 4 
    if "assistant" in position and "manager" not in position and "executive" not in position:
        return 1 
# Check if the position is in grade_mapping_df
    for grade in grade_mapping_df['Grade']:
        if grade in position:
            return grade_mapping_df.loc[grade_mapping_df['Grade'] == grade, 'Level'].values[0]

    # Check if the position is in designation_mapping_df
    for designation in designation_mapping_df['Designation']:
        if designation in position:
            return designation_mapping_df.loc[designation_mapping_df['Designation'] == designation, 'Level'].values[0]

    # If no match is found, return 0
    return 0

# # Define a function to map the job position to the corresponding job level
# def get_job_level(position):
#     if "assistant" in position and "manager" not in position and "director" not in position:
#         return 2
#     if "associate" in position and "manager" not in position and "director" not in position:
#         return 2
#     if "engineer" in position and "senior" not in position and "manager" not in position:
#         return 3 
#     if "senior" in position and "manager" not in position and "director" not in position:
#         return 4 
#     if "deputy general manager" in position:
#         return 7
#     if "general manager" in position:
#         return 8 
#     if "chief executive officer" in position:
#         return 10


#     # Check if the position contains the whole string of a grade from grade_mapping_df
#     for grade in grade_mapping_df['Grade']:
#         if grade in position:
#             return grade_mapping_df.loc[grade_mapping_df['Grade'] == grade, 'Level'].values[0]

#     # Check if the position is in designation_mapping_df
#     for designation in designation_mapping_df['Designation']:
#         if designation in position:           
# #             if "senior" in position and "manager" not in position and "director" not in position:
# #                 return 4
#             return designation_mapping_df.loc[designation_mapping_df['Designation'] == designation, 'Level'].values[0]
        
#     # If no match is found, return 0
#     return 0

# Apply the get_job_level function and create a new "Job Level" column
new_selected_cum_df['job_level'] = new_selected_cum_df['position'].apply(get_job_level)

pd.set_option('display.max_rows', None)
new_selected_cum_df[['profile_name', 'organisation', 'position', 'job_level']]

Unnamed: 0,profile_name,organisation,position,job_level
0,zeenath-ahamed-3678a4b8,mas holdings,manager - organization structure and design,6.0
1,zeenath-ahamed-3678a4b8,mas holdings,assistant manager - organizational strategy de...,5.0
2,zeenath-ahamed-3678a4b8,mas holdings,assistant manager - strategic planning,5.0
3,zeenath-ahamed-3678a4b8,mas holdings,assistant manager - business analysis and plan...,5.0
4,zeenath-ahamed-3678a4b8,mas holdings,business analyst / senior business analyst,4.0
5,yuwin-mestrige-9a15361a9,mas holdings,senior autonomation engineer,4.0
6,yuwin-mestrige-9a15361a9,mas holdings,autonomation engineer,3.0
7,yuwin-mestrige-9a15361a9,mas holdings,machine engineer,3.0
8,yuwin-mestrige-9a15361a9,mas holdings,executive - central manufacturing support,3.0
9,yoshan-de-zoysa,mas holdings,assistant manager - sourcing & supply chain,5.0


In [68]:
new_selected_cum_df[new_selected_cum_df['job_level'] == 0][['profile_name', 'organisation', 'position', 'job_level']]

Unnamed: 0,profile_name,organisation,position,job_level
67,veranja-gunawardena-67851993,gala coral group,part time treasurer,0.0
139,thisum-mendis-06107a3a,mas holdings,lean enterprise – master black belt,0.0
179,thilanka-ekanayake-597459178,intercollegiate sri lanka education (isle center),learning partner and translator,0.0
286,sithari-perera-8859a9a6,current residence,freelance teacher,0.0
479,ravija-hewage-339854263,unicef,writer,0.0
547,peshala-randima-withanachchi-7a1030129,"bionics laboratory, department of mechanical e...",undergraduate student researcher,0.0
574,oshadha-dammearachchi-32aa49160,tnl radio network (pvt) limited,news anchor,0.0
630,natalia-n-1809a3103,the mentor,freelance tutor/ educator,0.0
635,natalia-n-1809a3103,"lyceum international school, nugegoda.",coach of model united nations,0.0
744,malintha-thisal-628608206,university of kelaniya sri lanka,vice captain of university soccer team,0.0


In [69]:
# print(len(new_selected_cum_df[new_selected_cum_df['job_level'] == 0][['profile_name', 'organisation', 'position', 'job_level']]))

# print(new_selected_cum_df[new_selected_cum_df['job_level'] == 0][['profile_name', 'organisation', 'position', 'job_level']])

# new_designations_df = extracted_data_df[extracted_data_df['job_level'] == 0]['position']
# new_designations_df = new_designations_df.drop_duplicates()
# new_designations_df.to_csv('positions_job_level_0.csv', index=False
# new_designations_df
                           
# Removing unmapped records
new_selected_cum_df = new_selected_cum_df[new_selected_cum_df['job_level'] != 0]
new_selected_cum_df

Unnamed: 0,profile_name,position,start_time,organisation,duration,start_recency_months,end_recency_months,apparel_industry,company_size,headquaters,company_age_years,company_change,cum_no_of_companies,job_level
0,zeenath-ahamed-3678a4b8,manager - organization structure and design,2021-04-01,mas holdings,32,21,0,1,4.0,1,36.0,0,1,6.0
1,zeenath-ahamed-3678a4b8,assistant manager - organizational strategy de...,2018-07-01,mas holdings,34,54,21,1,4.0,1,36.0,0,1,5.0
2,zeenath-ahamed-3678a4b8,assistant manager - strategic planning,2017-01-01,mas holdings,18,73,55,1,4.0,1,36.0,0,1,5.0
3,zeenath-ahamed-3678a4b8,assistant manager - business analysis and plan...,2010-06-01,mas holdings,79,153,74,1,4.0,1,36.0,0,1,5.0
4,zeenath-ahamed-3678a4b8,business analyst / senior business analyst,2005-06-01,mas holdings,55,214,159,1,4.0,1,36.0,-1,1,4.0
5,yuwin-mestrige-9a15361a9,senior autonomation engineer,2022-04-01,mas holdings,10,9,0,1,4.0,1,36.0,0,1,4.0
6,yuwin-mestrige-9a15361a9,autonomation engineer,2020-06-01,mas holdings,23,31,9,1,4.0,1,36.0,0,1,3.0
7,yuwin-mestrige-9a15361a9,machine engineer,2019-03-01,mas holdings,16,46,31,1,4.0,1,36.0,0,1,3.0
8,yuwin-mestrige-9a15361a9,executive - central manufacturing support,2017-02-01,mas holdings,26,72,46,1,4.0,1,36.0,-1,1,3.0
9,yoshan-de-zoysa,assistant manager - sourcing & supply chain,2021-04-01,mas holdings,33,21,0,1,4.0,1,36.0,0,3,5.0


In [70]:
def calculate_level_up(df):
    # Initialize an empty list to store the values for the new "company_change" column
    level_ups = []

    # Initialize a variable to keep track of the current profile
    current_profile = None
    previous_job_level = None

    # Iterate over the rows of the DataFrame in reverse order
    for index in reversed(df.index):
        row = df.loc[index]
        if row['profile_name'] != current_profile:
            # If the profile has changed, set the company_change value to -1
            level_ups.insert(0, -1)
            current_profile = row['profile_name']
            previous_job_level = row['job_level']  # Reset the previous_organization
        else:
            # Check if the organization has changed compared to the next row
            # next_row = new_selected_df.loc[new_selected_df.index[new_selected_df.index.get_loc(index) - 1]]
            if row['job_level'] != 0:
                level_up = row['job_level'] - previous_job_level
                level_ups.insert(0, level_up)  # Organization changed
                previous_job_level = row['job_level']
            else:
                level_up = -5
                level_ups.insert(0, level_up)
                continue 
    
    df['level_up'] = level_ups

    return df

# Call the function to calculate the level up column
new_selected_lvl_df = calculate_level_up(new_selected_cum_df)
new_selected_lvl_df[['profile_name', 'organisation','company_change', 'cum_no_of_companies', 'job_level', 'level_up']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['level_up'] = level_ups


Unnamed: 0,profile_name,organisation,company_change,cum_no_of_companies,job_level,level_up
0,zeenath-ahamed-3678a4b8,mas holdings,0,1,6.0,1.0
1,zeenath-ahamed-3678a4b8,mas holdings,0,1,5.0,0.0
2,zeenath-ahamed-3678a4b8,mas holdings,0,1,5.0,0.0
3,zeenath-ahamed-3678a4b8,mas holdings,0,1,5.0,1.0
4,zeenath-ahamed-3678a4b8,mas holdings,-1,1,4.0,-1.0
5,yuwin-mestrige-9a15361a9,mas holdings,0,1,4.0,1.0
6,yuwin-mestrige-9a15361a9,mas holdings,0,1,3.0,0.0
7,yuwin-mestrige-9a15361a9,mas holdings,0,1,3.0,0.0
8,yuwin-mestrige-9a15361a9,mas holdings,-1,1,3.0,-1.0
9,yoshan-de-zoysa,mas holdings,0,3,5.0,1.0


In [71]:
def calculate_lateral_movements(df):
    # Initialize an empty list to store the values for the new "company_change" column
    lateral_movements = []

    # Initialize a variable to keep track of the current profile
    current_profile = None

    # Iterate over the rows of the DataFrame in reverse order
    for index in reversed(df.index):
        row = df.loc[index]
        if row['profile_name'] != current_profile:
            # If the profile has changed, set the company_change value to -1
            lateral_movements.insert(0, -1)
            current_profile = row['profile_name']
        else:
            # Check if the organization has changed compared to the next row
            # next_row = new_selected_df.loc[new_selected_df.index[new_selected_df.index.get_loc(index) - 1]]
            if row['company_change'] == 0 and row['level_up'] == 0:
                lateral_movements.insert(0, 1)  # Organization changed
            else:
                lateral_movements.insert(0, 0)
    
    df['lateral_movements'] = lateral_movements

    return df

new_selected_lm_df = calculate_lateral_movements(new_selected_lvl_df)
new_selected_lm_df[['profile_name', 'organisation','job_level', 'company_change', 'level_up', 'lateral_movements']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lateral_movements'] = lateral_movements


Unnamed: 0,profile_name,organisation,job_level,company_change,level_up,lateral_movements
0,zeenath-ahamed-3678a4b8,mas holdings,6.0,0,1.0,0
1,zeenath-ahamed-3678a4b8,mas holdings,5.0,0,0.0,1
2,zeenath-ahamed-3678a4b8,mas holdings,5.0,0,0.0,1
3,zeenath-ahamed-3678a4b8,mas holdings,5.0,0,1.0,0
4,zeenath-ahamed-3678a4b8,mas holdings,4.0,-1,-1.0,-1
5,yuwin-mestrige-9a15361a9,mas holdings,4.0,0,1.0,0
6,yuwin-mestrige-9a15361a9,mas holdings,3.0,0,0.0,1
7,yuwin-mestrige-9a15361a9,mas holdings,3.0,0,0.0,1
8,yuwin-mestrige-9a15361a9,mas holdings,3.0,-1,-1.0,-1
9,yoshan-de-zoysa,mas holdings,5.0,0,1.0,0


In [72]:
columns_to_drop = ['start_time', 'position']
new_selected_lm_df.drop(columns=columns_to_drop, inplace=True)

# Reorder the columns in the DataFrame
desired_order = ['profile_name', 'organisation', 'duration',
       'start_recency_months', 'end_recency_months', 'job_level', 'company_change',
       'cum_no_of_companies', 'level_up', 'lateral_movements', 'apparel_industry',
       'company_size', 'company_age_years', 'headquaters', ]

new_selected_lm_df = new_selected_lm_df[desired_order]
new_selected_lm_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_selected_lm_df.drop(columns=columns_to_drop, inplace=True)


Unnamed: 0,profile_name,organisation,duration,start_recency_months,end_recency_months,job_level,company_change,cum_no_of_companies,level_up,lateral_movements,apparel_industry,company_size,company_age_years,headquaters
0,zeenath-ahamed-3678a4b8,mas holdings,32,21,0,6.0,0,1,1.0,0,1,4.0,36.0,1
1,zeenath-ahamed-3678a4b8,mas holdings,34,54,21,5.0,0,1,0.0,1,1,4.0,36.0,1
2,zeenath-ahamed-3678a4b8,mas holdings,18,73,55,5.0,0,1,0.0,1,1,4.0,36.0,1
3,zeenath-ahamed-3678a4b8,mas holdings,79,153,74,5.0,0,1,1.0,0,1,4.0,36.0,1
4,zeenath-ahamed-3678a4b8,mas holdings,55,214,159,4.0,-1,1,-1.0,-1,1,4.0,36.0,1
5,yuwin-mestrige-9a15361a9,mas holdings,10,9,0,4.0,0,1,1.0,0,1,4.0,36.0,1
6,yuwin-mestrige-9a15361a9,mas holdings,23,31,9,3.0,0,1,0.0,1,1,4.0,36.0,1
7,yuwin-mestrige-9a15361a9,mas holdings,16,46,31,3.0,0,1,0.0,1,1,4.0,36.0,1
8,yuwin-mestrige-9a15361a9,mas holdings,26,72,46,3.0,-1,1,-1.0,-1,1,4.0,36.0,1
9,yoshan-de-zoysa,mas holdings,33,21,0,5.0,0,3,1.0,0,1,4.0,36.0,1


In [73]:
new_selected_lm_df.to_csv(r'encoded_data\experience_encoded.csv', index=False)

final_df = pd.read_csv('final_company_mapping_unlimited.csv')
final_df

Unnamed: 0,profile_name,organisation,duration,start_recency_months,end_recency_months,job_level,company_change,cum_no_of_companies,level_up,lateral_movements,apparel_industry,company_size,company_age_years,headquaters
0,zeenath-ahamed-3678a4b8,mas holdings,32,21,0,6.0,0,1,1.0,0,1,4.0,36.0,1
1,zeenath-ahamed-3678a4b8,mas holdings,34,54,21,5.0,0,1,0.0,1,1,4.0,36.0,1
2,zeenath-ahamed-3678a4b8,mas holdings,18,73,55,5.0,0,1,0.0,1,1,4.0,36.0,1
3,zeenath-ahamed-3678a4b8,mas holdings,79,153,74,5.0,0,1,1.0,0,1,4.0,36.0,1
4,zeenath-ahamed-3678a4b8,mas holdings,55,214,159,4.0,-1,1,-1.0,-1,1,4.0,36.0,1
5,yuwin-mestrige-9a15361a9,mas holdings,10,9,0,4.0,0,1,1.0,0,1,4.0,36.0,1
6,yuwin-mestrige-9a15361a9,mas holdings,23,31,9,3.0,0,1,0.0,1,1,4.0,36.0,1
7,yuwin-mestrige-9a15361a9,mas holdings,16,46,31,3.0,0,1,0.0,1,1,4.0,36.0,1
8,yuwin-mestrige-9a15361a9,mas holdings,26,72,46,3.0,-1,1,-1.0,-1,1,4.0,36.0,1
9,yoshan-de-zoysa,mas holdings,33,21,0,5.0,0,3,1.0,0,1,4.0,36.0,1


In [9]:
# Load the original data file
dfc = pd.read_csv('final_all_ml_file.csv')
dfc.head(2)

Unnamed: 0,profile_name,number_of_jobs,first_job_level,first_job_recency_months,first_job_duration_months,first_company_size,first_company_age_years,first_company_apparel,first_company_sri_lankan,last_job_level,...,last_institute_age_years,last_institute_sri_lankan,minimum_qual_duration_months,maximum_qual_duration_months,average_qual_duration_months,total_qual_duration_months,number_of_institutes,duration ratio in sri lanka_y,job_change,job_change_horizon
0,zeenath-ahamed-3678a4b8,5.0,4.0,214.0,55.0,4.0,36.0,1.0,1.0,6.0,...,0.0,0.0,48.0,168.0,108.0,216.0,2.0,0.0,0,0
1,yuwin-mestrige-9a15361a9,4.0,3.0,72.0,26.0,4.0,36.0,1.0,1.0,4.0,...,0.0,0.0,12.0,156.0,72.0,216.0,3.0,0.222222,1,1


In [11]:
dfc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 321 entries, 0 to 320
Data columns (total 51 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   profile_name                   321 non-null    object 
 1   number_of_jobs                 321 non-null    float64
 2   first_job_level                321 non-null    float64
 3   first_job_recency_months       321 non-null    float64
 4   first_job_duration_months      321 non-null    float64
 5   first_company_size             321 non-null    float64
 6   first_company_age_years        321 non-null    float64
 7   first_company_apparel          321 non-null    float64
 8   first_company_sri_lankan       321 non-null    float64
 9   last_job_level                 321 non-null    float64
 10  last_job_recency_months        321 non-null    float64
 11  last_job_duration_months       321 non-null    float64
 12  last_company_size              321 non-null    flo

In [13]:
# Load the original data file
dfcs = pd.read_csv(r'C:\Users\dishans\OneDrive - MAS Holdings (Pvt) Ltd\HR Project\Model Final\final_all_ml_file_321_1000.csv')
dfcs.head(2)

Unnamed: 0,number_of_jobs,first_job_level,first_job_recency_months,first_job_duration_months,first_company_size,first_company_age_years,first_company_apparel,first_company_sri_lankan,last_job_level,last_job_recency_months,...,last_institute_age_years,last_institute_sri_lankan,minimum_qual_duration_months,maximum_qual_duration_months,average_qual_duration_months,total_qual_duration_months,number_of_institutes,duration ratio in sri lanka_y,job_change,job_change_horizon
0,5,4,214,55,4,36,1,1,6,21,...,0,0,48,168,108.0,216,2,0.0,0,0
1,4,3,72,26,4,36,1,1,4,9,...,0,0,12,156,72.0,216,3,0.222222,1,1


In [14]:
dfcs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1321 entries, 0 to 1320
Data columns (total 50 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   number_of_jobs                 1321 non-null   int64  
 1   first_job_level                1321 non-null   int64  
 2   first_job_recency_months       1321 non-null   int64  
 3   first_job_duration_months      1321 non-null   int64  
 4   first_company_size             1321 non-null   int64  
 5   first_company_age_years        1321 non-null   int64  
 6   first_company_apparel          1321 non-null   int64  
 7   first_company_sri_lankan       1321 non-null   int64  
 8   last_job_level                 1321 non-null   int64  
 9   last_job_recency_months        1321 non-null   int64  
 10  last_job_duration_months       1321 non-null   int64  
 11  last_company_size              1321 non-null   int64  
 12  last_company_age_years         1321 non-null   i

In [15]:
dff = pd.read_csv(r'C:\Users\dishans\OneDrive - MAS Holdings (Pvt) Ltd\HR Project\basic-scrapy-project\encoded_data\final_dataset.csv')
dff.head(2)

Unnamed: 0,profile_name,number_of_jobs,first_job_level,first_job_recency_months,first_job_duration_months,first_company_size,first_company_age_years,first_company_apparel,first_company_sri_lankan,last_job_level,...,last_qual_duration_months,last_institute_size,last_institute_age_years,last_institute_sri_lankan,minimum_qual_duration_months,maximum_qual_duration_months,average_qual_duration_months,total_qual_duration_months,number_of_institutes,duration ratio in sri lanka_y
0,dulari-hansika,12,1,75,4,,,0,0,1,...,24,,,0,24,144,84.0,168,2,0.0


In [16]:
dff.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 49 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   profile_name                   1 non-null      object 
 1   number_of_jobs                 1 non-null      int64  
 2   first_job_level                1 non-null      int64  
 3   first_job_recency_months       1 non-null      int64  
 4   first_job_duration_months      1 non-null      int64  
 5   first_company_size             0 non-null      float64
 6   first_company_age_years        0 non-null      float64
 7   first_company_apparel          1 non-null      int64  
 8   first_company_sri_lankan       1 non-null      int64  
 9   last_job_level                 1 non-null      int64  
 10  last_job_recency_months        1 non-null      int64  
 11  last_job_duration_months       1 non-null      int64  
 12  last_company_size              0 non-null      float64

In [17]:
unique_columns_dfcs = dfcs.columns.difference(dff.columns)
unique_columns_dff = dff.columns.difference(dfcs.columns)


In [18]:
unique_columns_dfcs

Index(['job_change', 'job_change_horizon'], dtype='object')

In [19]:
unique_columns_dff

Index(['profile_name'], dtype='object')

In [20]:
unique_columns_1 = dfcs.columns.difference(dfc.columns)
unique_columns_2 = dfc.columns.difference(dfcs.columns)

In [21]:
print(unique_columns_1)
print(unique_columns_2)

Index([], dtype='object')
Index(['profile_name'], dtype='object')
