In [1]:
import pandas as pd

file_path_jsonl = r'data\profiles_sagara_harsha.jsonl'

# Read the JSONL file
df = pd.read_json(file_path_jsonl, lines=True)
df

Unnamed: 0,profile,name,experience,education
0,sagara-lakmal-3b5634103,Sagara Lakmal,[{'position': 'Manager - Data Analytics & Gove...,[{'organisation': 'University of Colombo Schoo...
1,harsha-karunanayake-875636a8,Harsha Karunanayake,[{'position': 'Process Improvement Consultant'...,[{'organisation': 'University of Kelaniya Sri ...


In [2]:
print(df.columns)

Index(['profile', 'name', 'experience', 'education'], dtype='object')


In [3]:
import ast
import numpy as np

df['experience'] = df['experience'].apply(lambda x: json.loads(x) if isinstance(x, str) else x)

# Use ast.literal_eval to safely parse the single-quoted JSON strings
# df['experience'] = df['experience'].apply(ast.literal_eval)

# Create empty lists to store extracted data
profile_names = []
organization_profiles = []
positions = []
organisation =[]
start_times = []
end_times = []
durations = []

for idx, row in df.iterrows():
    name = row['profile']  # Get the 'name' 
    experience_data = row['experience']
    
    for exp in experience_data:
        profile_names.append(name)
        organization_profiles.append(exp.get("organisation_profile", ""))
        positions.append(exp.get("position", ""))
        organisation.append(exp.get("organisation", ""))
        start_times.append(exp.get("start_time", ""))
        end_times.append(exp.get("end_time", ""))
        durations.append(exp.get("duration", ""))

extracted_data_df = pd.DataFrame({
    'Profile Name': profile_names,
    'Organization Profile': organization_profiles,
    'Position': positions,
    'organisation':organisation,
    'Start Time': start_times,
    'End Time': end_times,
    'Duration': durations
})

extracted_data_df

Unnamed: 0,Profile Name,Organization Profile,Position,organisation,Start Time,End Time,Duration
0,sagara-lakmal-3b5634103,https://www.linkedin.com/company/mas-holdings,Manager - Data Analytics & Governance,MAS Holdings,Jan 2023,present,10 months
1,sagara-lakmal-3b5634103,https://www.linkedin.com/company/ndbbank,Associate Manager - Data Modelling Lead - Corp...,National Development Bank PLC (NDB),Jun 2021,Jan 2023,1 year 8 months
2,sagara-lakmal-3b5634103,https://www.linkedin.com/company/dialog-axiata...,Senior Executive - Dashboarding and Visualizat...,Dialog Axiata PLC,Oct 2019,Jun 2021,1 year 9 months
3,sagara-lakmal-3b5634103,https://www.linkedin.com/company/mas-holdings,Executive - Data Analytics,MAS Holdings,Apr 2018,Sep 2019,1 year 6 months
4,harsha-karunanayake-875636a8,https://www.linkedin.com/company/mas-holdings,Process Improvement Consultant,MAS Holdings,Apr 2022,present,1 year 8 months
5,harsha-karunanayake-875636a8,https://www.linkedin.com/company/mas-holdings,Senior Business Analyst,MAS Holdings,Apr 2021,Apr 2022,1 year 1 month
6,harsha-karunanayake-875636a8,https://www.linkedin.com/company/mas-holdings,Business Analyst,MAS Holdings,Apr 2019,Apr 2021,2 years 1 month
7,harsha-karunanayake-875636a8,https://www.linkedin.com/company/mas-holdings,Merchandiser,MAS Holdings,May 2018,May 2019,1 year 1 month
8,harsha-karunanayake-875636a8,,Co-Founder,Kwavz,Aug 2016,Jan 2021,4 years 6 months
9,harsha-karunanayake-875636a8,,Academic Content Writer,Freelance,Apr 2019,2021,2 years


In [4]:
# Define a function to convert duration to months
def duration_to_months(duration):
    # Check if the duration value is entirely a string
    if not duration.replace(' ', '').isnumeric():
        if "less than a year" in duration.lower():
            return 0  
        
    # Seperate a string such as '2 years' to parts such as 2 and years
    total_months = 0
    parts = duration.split()
    
    # Loop through every item. Since one item has two parts. Loop jumps every two item
    for i in range(0, len(parts), 2):
        # Numeric part ix converted to int and stored in variable value.
        value = int(parts[i])
        # Second part is converted to lowercase and store in variable unit
        unit = parts[i + 1].lower()
        
        # Check the unit and add relevant number of months according to years or months
        if 'year' in unit:
            total_months += value * 12
        elif 'month' in unit:
            total_months += value

    return total_months

In [137]:
# Convert durations to month by calling the function on Duration column 
extracted_data_df['Duration'] = extracted_data_df['Duration'].apply(duration_to_months)
extracted_data_df

Unnamed: 0,Profile Name,Organization Profile,Position,organisation,Start Time,End Time,Duration
0,sagara-lakmal-3b5634103,https://www.linkedin.com/company/mas-holdings,Manager - Data Analytics & Governance,MAS Holdings,Jan 2023,present,10
1,sagara-lakmal-3b5634103,https://www.linkedin.com/company/ndbbank,Associate Manager - Data Modelling Lead - Corp...,National Development Bank PLC (NDB),Jun 2021,Jan 2023,20
2,sagara-lakmal-3b5634103,https://www.linkedin.com/company/dialog-axiata...,Senior Executive - Dashboarding and Visualizat...,Dialog Axiata PLC,Oct 2019,Jun 2021,21
3,sagara-lakmal-3b5634103,https://www.linkedin.com/company/mas-holdings,Executive - Data Analytics,MAS Holdings,Apr 2018,Sep 2019,18
4,harsha-karunanayake-875636a8,https://www.linkedin.com/company/mas-holdings,Process Improvement Consultant,MAS Holdings,Apr 2022,present,20
5,harsha-karunanayake-875636a8,https://www.linkedin.com/company/mas-holdings,Senior Business Analyst,MAS Holdings,Apr 2021,Apr 2022,13
6,harsha-karunanayake-875636a8,https://www.linkedin.com/company/mas-holdings,Business Analyst,MAS Holdings,Apr 2019,Apr 2021,25
7,harsha-karunanayake-875636a8,https://www.linkedin.com/company/mas-holdings,Merchandiser,MAS Holdings,May 2018,May 2019,13
8,harsha-karunanayake-875636a8,,Co-Founder,Kwavz,Aug 2016,Jan 2021,54
9,harsha-karunanayake-875636a8,,Academic Content Writer,Freelance,Apr 2019,2021,24


In [138]:
# Get the job level
mapping_excel_path =r'Grade_Designation_Mappings.xlsx' 
mapping_df = pd.read_excel(mapping_excel_path)

def get_level(row):
    position = row['Position']

    if not pd.isna(position):        
        grade_match = mapping_df[mapping_df['Grade'].str.contains(position, case=False)]

        if not grade_match.empty:
            return grade_match['Level'].values[0]

        # If not found in 'Grade', check 'Position' in the 'Designation' column
        position_match = mapping_df[mapping_df['Designation'].str.contains(position, case=False)]
        
        if not position_match.empty:
            return position_match['Level'].values[0]
      
        for word, level in default_levels.items():
            if word in position.lower():
                return level      
    
    return 0

# Define the default levels
default_levels = {
    "intern": 1,
    "junior executive": 2,
    "executive": 3,
    "senior executive": 4,
    "assistant manager": 5,
    "associate manager":5,
    "manager": 6,
    "deputy general manager": 7,
    "general manager": 8,
    "director": 9,
    "chief officer": 10
}

extracted_data_df['Job Level'] = extracted_data_df.apply(get_level, axis=1)
extracted_data_df

  grade_match = mapping_df[mapping_df['Grade'].str.contains(position, case=False)]
  position_match = mapping_df[mapping_df['Designation'].str.contains(position, case=False)]


Unnamed: 0,Profile Name,Organization Profile,Position,organisation,Start Time,End Time,Duration,Job Level
0,sagara-lakmal-3b5634103,https://www.linkedin.com/company/mas-holdings,Manager - Data Analytics & Governance,MAS Holdings,Jan 2023,present,10,6
1,sagara-lakmal-3b5634103,https://www.linkedin.com/company/ndbbank,Associate Manager - Data Modelling Lead - Corp...,National Development Bank PLC (NDB),Jun 2021,Jan 2023,20,5
2,sagara-lakmal-3b5634103,https://www.linkedin.com/company/dialog-axiata...,Senior Executive - Dashboarding and Visualizat...,Dialog Axiata PLC,Oct 2019,Jun 2021,21,3
3,sagara-lakmal-3b5634103,https://www.linkedin.com/company/mas-holdings,Executive - Data Analytics,MAS Holdings,Apr 2018,Sep 2019,18,3
4,harsha-karunanayake-875636a8,https://www.linkedin.com/company/mas-holdings,Process Improvement Consultant,MAS Holdings,Apr 2022,present,20,5
5,harsha-karunanayake-875636a8,https://www.linkedin.com/company/mas-holdings,Senior Business Analyst,MAS Holdings,Apr 2021,Apr 2022,13,4
6,harsha-karunanayake-875636a8,https://www.linkedin.com/company/mas-holdings,Business Analyst,MAS Holdings,Apr 2019,Apr 2021,25,3
7,harsha-karunanayake-875636a8,https://www.linkedin.com/company/mas-holdings,Merchandiser,MAS Holdings,May 2018,May 2019,13,3
8,harsha-karunanayake-875636a8,,Co-Founder,Kwavz,Aug 2016,Jan 2021,54,0
9,harsha-karunanayake-875636a8,,Academic Content Writer,Freelance,Apr 2019,2021,24,0


In [139]:
extracted_data_df.iloc[11]['Position']

'Financial Crime Compliance (FCC) Analyst'

In [140]:
file_path = r'data\companies_sagara_harsha.jsonl'

df2 = pd.read_json(file_path, lines=True)
df2

Unnamed: 0,name,summary,url,industry,size,headquaters,type,founded,specialties,specialities
0,MAS Holdings,Change Is Courage,https://www.linkedin.com/redir/redirect,Apparel & Fashion,"10,001+ employees","Colombo, Colombo",Privately Held,,"Intimate Wear, Activewear and Performance wear...",
1,Dialog Axiata PLC,Dialog Axiata is Sri Lanka's premier connectiv...,http://www.dialog.lk,Telecommunications,"1,001-5,000 employees","Colombo 2, Western Province",Public Company,1993.0,,"Telecommunication, Broadband, Digital TV, and ..."
2,National Development Bank PLC (NDB),NDB Bank offers a wide range of commercial ban...,http://www.ndbbank.com,Financial Services,"1,001-5,000 employees",Colombo 02,Public Company,1979.0,,"Retail Banking, Treasury, Cash Management Serv..."
3,HSBC,,https://www.linkedin.com/redir/redirect,Financial Services,"10,001+ employees",London,Public Company,,"Banking, Financial Services, International Fin...",


In [141]:
# Convert 'organisation' column to lowercase 
extracted_data_df.iloc[:, 3] = extracted_data_df.iloc[:, 3].str.lower()
df2.iloc[:, 0] = df2.iloc[:, 0].str.lower()

# Merge the DataFrames based on the lowercase 'organisation' column 
merged_df = pd.merge(extracted_data_df, df2, left_on=extracted_data_df.columns[3], right_on=df2.columns[0], how='left')

merged_df

Unnamed: 0,Profile Name,Organization Profile,Position,organisation,Start Time,End Time,Duration,Job Level,name,summary,url,industry,size,headquaters,type,founded,specialties,specialities
0,sagara-lakmal-3b5634103,https://www.linkedin.com/company/mas-holdings,Manager - Data Analytics & Governance,mas holdings,Jan 2023,present,10,6,mas holdings,Change Is Courage,https://www.linkedin.com/redir/redirect,Apparel & Fashion,"10,001+ employees","Colombo, Colombo",Privately Held,,"Intimate Wear, Activewear and Performance wear...",
1,sagara-lakmal-3b5634103,https://www.linkedin.com/company/ndbbank,Associate Manager - Data Modelling Lead - Corp...,national development bank plc (ndb),Jun 2021,Jan 2023,20,5,national development bank plc (ndb),NDB Bank offers a wide range of commercial ban...,http://www.ndbbank.com,Financial Services,"1,001-5,000 employees",Colombo 02,Public Company,1979.0,,"Retail Banking, Treasury, Cash Management Serv..."
2,sagara-lakmal-3b5634103,https://www.linkedin.com/company/dialog-axiata...,Senior Executive - Dashboarding and Visualizat...,dialog axiata plc,Oct 2019,Jun 2021,21,3,dialog axiata plc,Dialog Axiata is Sri Lanka's premier connectiv...,http://www.dialog.lk,Telecommunications,"1,001-5,000 employees","Colombo 2, Western Province",Public Company,1993.0,,"Telecommunication, Broadband, Digital TV, and ..."
3,sagara-lakmal-3b5634103,https://www.linkedin.com/company/mas-holdings,Executive - Data Analytics,mas holdings,Apr 2018,Sep 2019,18,3,mas holdings,Change Is Courage,https://www.linkedin.com/redir/redirect,Apparel & Fashion,"10,001+ employees","Colombo, Colombo",Privately Held,,"Intimate Wear, Activewear and Performance wear...",
4,harsha-karunanayake-875636a8,https://www.linkedin.com/company/mas-holdings,Process Improvement Consultant,mas holdings,Apr 2022,present,20,5,mas holdings,Change Is Courage,https://www.linkedin.com/redir/redirect,Apparel & Fashion,"10,001+ employees","Colombo, Colombo",Privately Held,,"Intimate Wear, Activewear and Performance wear...",
5,harsha-karunanayake-875636a8,https://www.linkedin.com/company/mas-holdings,Senior Business Analyst,mas holdings,Apr 2021,Apr 2022,13,4,mas holdings,Change Is Courage,https://www.linkedin.com/redir/redirect,Apparel & Fashion,"10,001+ employees","Colombo, Colombo",Privately Held,,"Intimate Wear, Activewear and Performance wear...",
6,harsha-karunanayake-875636a8,https://www.linkedin.com/company/mas-holdings,Business Analyst,mas holdings,Apr 2019,Apr 2021,25,3,mas holdings,Change Is Courage,https://www.linkedin.com/redir/redirect,Apparel & Fashion,"10,001+ employees","Colombo, Colombo",Privately Held,,"Intimate Wear, Activewear and Performance wear...",
7,harsha-karunanayake-875636a8,https://www.linkedin.com/company/mas-holdings,Merchandiser,mas holdings,May 2018,May 2019,13,3,mas holdings,Change Is Courage,https://www.linkedin.com/redir/redirect,Apparel & Fashion,"10,001+ employees","Colombo, Colombo",Privately Held,,"Intimate Wear, Activewear and Performance wear...",
8,harsha-karunanayake-875636a8,,Co-Founder,kwavz,Aug 2016,Jan 2021,54,0,,,,,,,,,,
9,harsha-karunanayake-875636a8,,Academic Content Writer,freelance,Apr 2019,2021,24,0,,,,,,,,,,


In [142]:
# Select
selected_columns = ['Profile Name', 'Start Time', 'End Time', 'organisation','Duration', 'Job Level', 'industry', 'size', 'founded','headquaters']

# Create a new DataFrame 
new_df = merged_df[selected_columns]

In [143]:
new_df

Unnamed: 0,Profile Name,Start Time,End Time,organisation,Duration,Job Level,industry,size,founded,headquaters
0,sagara-lakmal-3b5634103,Jan 2023,present,mas holdings,10,6,Apparel & Fashion,"10,001+ employees",,"Colombo, Colombo"
1,sagara-lakmal-3b5634103,Jun 2021,Jan 2023,national development bank plc (ndb),20,5,Financial Services,"1,001-5,000 employees",1979.0,Colombo 02
2,sagara-lakmal-3b5634103,Oct 2019,Jun 2021,dialog axiata plc,21,3,Telecommunications,"1,001-5,000 employees",1993.0,"Colombo 2, Western Province"
3,sagara-lakmal-3b5634103,Apr 2018,Sep 2019,mas holdings,18,3,Apparel & Fashion,"10,001+ employees",,"Colombo, Colombo"
4,harsha-karunanayake-875636a8,Apr 2022,present,mas holdings,20,5,Apparel & Fashion,"10,001+ employees",,"Colombo, Colombo"
5,harsha-karunanayake-875636a8,Apr 2021,Apr 2022,mas holdings,13,4,Apparel & Fashion,"10,001+ employees",,"Colombo, Colombo"
6,harsha-karunanayake-875636a8,Apr 2019,Apr 2021,mas holdings,25,3,Apparel & Fashion,"10,001+ employees",,"Colombo, Colombo"
7,harsha-karunanayake-875636a8,May 2018,May 2019,mas holdings,13,3,Apparel & Fashion,"10,001+ employees",,"Colombo, Colombo"
8,harsha-karunanayake-875636a8,Aug 2016,Jan 2021,kwavz,54,0,,,,
9,harsha-karunanayake-875636a8,Apr 2019,2021,freelance,24,0,,,,


In [144]:
new_df['organisation'] = new_df['organisation'].apply(lambda x: '0' if x == '' else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['organisation'] = new_df['organisation'].apply(lambda x: '0' if x == '' else x)


In [145]:
new_df

Unnamed: 0,Profile Name,Start Time,End Time,organisation,Duration,Job Level,industry,size,founded,headquaters
0,sagara-lakmal-3b5634103,Jan 2023,present,mas holdings,10,6,Apparel & Fashion,"10,001+ employees",,"Colombo, Colombo"
1,sagara-lakmal-3b5634103,Jun 2021,Jan 2023,national development bank plc (ndb),20,5,Financial Services,"1,001-5,000 employees",1979.0,Colombo 02
2,sagara-lakmal-3b5634103,Oct 2019,Jun 2021,dialog axiata plc,21,3,Telecommunications,"1,001-5,000 employees",1993.0,"Colombo 2, Western Province"
3,sagara-lakmal-3b5634103,Apr 2018,Sep 2019,mas holdings,18,3,Apparel & Fashion,"10,001+ employees",,"Colombo, Colombo"
4,harsha-karunanayake-875636a8,Apr 2022,present,mas holdings,20,5,Apparel & Fashion,"10,001+ employees",,"Colombo, Colombo"
5,harsha-karunanayake-875636a8,Apr 2021,Apr 2022,mas holdings,13,4,Apparel & Fashion,"10,001+ employees",,"Colombo, Colombo"
6,harsha-karunanayake-875636a8,Apr 2019,Apr 2021,mas holdings,25,3,Apparel & Fashion,"10,001+ employees",,"Colombo, Colombo"
7,harsha-karunanayake-875636a8,May 2018,May 2019,mas holdings,13,3,Apparel & Fashion,"10,001+ employees",,"Colombo, Colombo"
8,harsha-karunanayake-875636a8,Aug 2016,Jan 2021,kwavz,54,0,,,,
9,harsha-karunanayake-875636a8,Apr 2019,2021,freelance,24,0,,,,


In [146]:
# Cleaning headquaters (Removing numbers and strings after comma).
def clean_headquarters(value):
    if pd.notna(value):
        parts = value.split(',')[0]
        return ''.join(filter(str.isalpha, parts))
    else:
        return value

new_df['headquaters'] = new_df['headquaters'].apply(clean_headquarters)
new_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['headquaters'] = new_df['headquaters'].apply(clean_headquarters)


Unnamed: 0,Profile Name,Start Time,End Time,organisation,Duration,Job Level,industry,size,founded,headquaters
0,sagara-lakmal-3b5634103,Jan 2023,present,mas holdings,10,6,Apparel & Fashion,"10,001+ employees",,Colombo
1,sagara-lakmal-3b5634103,Jun 2021,Jan 2023,national development bank plc (ndb),20,5,Financial Services,"1,001-5,000 employees",1979.0,Colombo
2,sagara-lakmal-3b5634103,Oct 2019,Jun 2021,dialog axiata plc,21,3,Telecommunications,"1,001-5,000 employees",1993.0,Colombo
3,sagara-lakmal-3b5634103,Apr 2018,Sep 2019,mas holdings,18,3,Apparel & Fashion,"10,001+ employees",,Colombo
4,harsha-karunanayake-875636a8,Apr 2022,present,mas holdings,20,5,Apparel & Fashion,"10,001+ employees",,Colombo
5,harsha-karunanayake-875636a8,Apr 2021,Apr 2022,mas holdings,13,4,Apparel & Fashion,"10,001+ employees",,Colombo
6,harsha-karunanayake-875636a8,Apr 2019,Apr 2021,mas holdings,25,3,Apparel & Fashion,"10,001+ employees",,Colombo
7,harsha-karunanayake-875636a8,May 2018,May 2019,mas holdings,13,3,Apparel & Fashion,"10,001+ employees",,Colombo
8,harsha-karunanayake-875636a8,Aug 2016,Jan 2021,kwavz,54,0,,,,
9,harsha-karunanayake-875636a8,Apr 2019,2021,freelance,24,0,,,,


In [129]:
# Country mapping
from geopy.geocoders import Nominatim

def get_country_from_city(city_name):
    geolocator = Nominatim(user_agent="city-to-country")
    location = geolocator.geocode(city_name)

    if location:
        country_name = location.address.split(",")[-1].strip()
        if country_name == "ශ්‍රී ලංකාව இலங்கை":
            country_name = "Sri Lanka"
        elif country_name == "Italia":
            country_name = "Unknown"
        return country_name
    else:
        return None

# for index, row in new_df.iterrows():
#     city = row['headquaters'] 
#     country = get_country_from_city(city)

#     if country:
#         print(f" {country}")
#     else:
#         print(f"0")


In [148]:
new_df['headquaters'] = new_df['headquaters'].apply(lambda city: get_country_from_city(city))
new_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['headquaters'] = new_df['headquaters'].apply(lambda city: get_country_from_city(city))


Unnamed: 0,Profile Name,Start Time,End Time,organisation,Duration,Job Level,industry,size,founded,headquaters
0,sagara-lakmal-3b5634103,Jan 2023,present,mas holdings,10,6,Apparel & Fashion,"10,001+ employees",,Sri Lanka
1,sagara-lakmal-3b5634103,Jun 2021,Jan 2023,national development bank plc (ndb),20,5,Financial Services,"1,001-5,000 employees",1979.0,Sri Lanka
2,sagara-lakmal-3b5634103,Oct 2019,Jun 2021,dialog axiata plc,21,3,Telecommunications,"1,001-5,000 employees",1993.0,Sri Lanka
3,sagara-lakmal-3b5634103,Apr 2018,Sep 2019,mas holdings,18,3,Apparel & Fashion,"10,001+ employees",,Sri Lanka
4,harsha-karunanayake-875636a8,Apr 2022,present,mas holdings,20,5,Apparel & Fashion,"10,001+ employees",,Sri Lanka
5,harsha-karunanayake-875636a8,Apr 2021,Apr 2022,mas holdings,13,4,Apparel & Fashion,"10,001+ employees",,Sri Lanka
6,harsha-karunanayake-875636a8,Apr 2019,Apr 2021,mas holdings,25,3,Apparel & Fashion,"10,001+ employees",,Sri Lanka
7,harsha-karunanayake-875636a8,May 2018,May 2019,mas holdings,13,3,Apparel & Fashion,"10,001+ employees",,Sri Lanka
8,harsha-karunanayake-875636a8,Aug 2016,Jan 2021,kwavz,54,0,,,,Unknown
9,harsha-karunanayake-875636a8,Apr 2019,2021,freelance,24,0,,,,Unknown


In [149]:
date_test_df = pd.DataFrame(new_df)
date_test_df

Unnamed: 0,Profile Name,Start Time,End Time,organisation,Duration,Job Level,industry,size,founded,headquaters
0,sagara-lakmal-3b5634103,Jan 2023,present,mas holdings,10,6,Apparel & Fashion,"10,001+ employees",,Sri Lanka
1,sagara-lakmal-3b5634103,Jun 2021,Jan 2023,national development bank plc (ndb),20,5,Financial Services,"1,001-5,000 employees",1979.0,Sri Lanka
2,sagara-lakmal-3b5634103,Oct 2019,Jun 2021,dialog axiata plc,21,3,Telecommunications,"1,001-5,000 employees",1993.0,Sri Lanka
3,sagara-lakmal-3b5634103,Apr 2018,Sep 2019,mas holdings,18,3,Apparel & Fashion,"10,001+ employees",,Sri Lanka
4,harsha-karunanayake-875636a8,Apr 2022,present,mas holdings,20,5,Apparel & Fashion,"10,001+ employees",,Sri Lanka
5,harsha-karunanayake-875636a8,Apr 2021,Apr 2022,mas holdings,13,4,Apparel & Fashion,"10,001+ employees",,Sri Lanka
6,harsha-karunanayake-875636a8,Apr 2019,Apr 2021,mas holdings,25,3,Apparel & Fashion,"10,001+ employees",,Sri Lanka
7,harsha-karunanayake-875636a8,May 2018,May 2019,mas holdings,13,3,Apparel & Fashion,"10,001+ employees",,Sri Lanka
8,harsha-karunanayake-875636a8,Aug 2016,Jan 2021,kwavz,54,0,,,,Unknown
9,harsha-karunanayake-875636a8,Apr 2019,2021,freelance,24,0,,,,Unknown


In [151]:
import datetime

# Define the reference date
reference_date = pd.to_datetime('2023-01-01')

# Function to convert month-year strings to datetime objects (omitting the day)
# def convert_month_year_to_date(value):
#     try:
#         date = pd.to_datetime(value, format='%b %Y')
#         date = date.replace(day=1, month=1)
#         return date
#     except ValueError:
#         return None
    
def convert_month_year_to_date(value):
    try:
        if len(value) == 4:
            # Assume it's in "yyyy" format
            date = pd.to_datetime(value, format='%Y')
        elif len(value) > 4:
            # Assume it's in "Mon yyyy" format
            date = pd.to_datetime(value, format='%b %Y')
        else:
            # If the length doesn't match either format, return None
            date = None
        return date
    except ValueError:
        return None

new_df['Start Time'] = new_df['Start Time'].apply(convert_month_year_to_date)

# Set 'End Time' to January 1st, 2023, when marked as "present"
new_df['End Time'] = new_df.apply(lambda row: pd.to_datetime('2023-01-01') if row['End Time'] == 'present' else convert_month_year_to_date(row['End Time']), axis=1)

new_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Start Time'] = new_df['Start Time'].apply(convert_month_year_to_date)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['End Time'] = new_df.apply(lambda row: pd.to_datetime('2023-01-01') if row['End Time'] == 'present' else convert_month_year_to_date(row['End Time']), axis=1)


Unnamed: 0,Profile Name,Start Time,End Time,organisation,Duration,Job Level,industry,size,founded,headquaters
0,sagara-lakmal-3b5634103,2023-01-01,2023-01-01,mas holdings,10,6,Apparel & Fashion,"10,001+ employees",,Sri Lanka
1,sagara-lakmal-3b5634103,2021-06-01,2023-01-01,national development bank plc (ndb),20,5,Financial Services,"1,001-5,000 employees",1979.0,Sri Lanka
2,sagara-lakmal-3b5634103,2019-10-01,2021-06-01,dialog axiata plc,21,3,Telecommunications,"1,001-5,000 employees",1993.0,Sri Lanka
3,sagara-lakmal-3b5634103,2018-04-01,2019-09-01,mas holdings,18,3,Apparel & Fashion,"10,001+ employees",,Sri Lanka
4,harsha-karunanayake-875636a8,2022-04-01,2023-01-01,mas holdings,20,5,Apparel & Fashion,"10,001+ employees",,Sri Lanka
5,harsha-karunanayake-875636a8,2021-04-01,2022-04-01,mas holdings,13,4,Apparel & Fashion,"10,001+ employees",,Sri Lanka
6,harsha-karunanayake-875636a8,2019-04-01,2021-04-01,mas holdings,25,3,Apparel & Fashion,"10,001+ employees",,Sri Lanka
7,harsha-karunanayake-875636a8,2018-05-01,2019-05-01,mas holdings,13,3,Apparel & Fashion,"10,001+ employees",,Sri Lanka
8,harsha-karunanayake-875636a8,2016-08-01,2021-01-01,kwavz,54,0,,,,Unknown
9,harsha-karunanayake-875636a8,2019-04-01,2021-01-01,freelance,24,0,,,,Unknown


In [152]:
# Filter out rows that started in January 2023
new_df = new_df[new_df['Start Time'] != pd.to_datetime('2023-01-01')]

# Calculate the 'start_recency_months'
new_df['start_recency_months'] = (reference_date - new_df['Start Time']).dt.days // 30

# Calculate the 'end_recency_months'
new_df['end_recency_months'] = (reference_date - new_df['End Time']).dt.days // 30

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['start_recency_months'] = (reference_date - new_df['Start Time']).dt.days // 30
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['end_recency_months'] = (reference_date - new_df['End Time']).dt.days // 30


In [153]:
new_df

Unnamed: 0,Profile Name,Start Time,End Time,organisation,Duration,Job Level,industry,size,founded,headquaters,start_recency_months,end_recency_months
1,sagara-lakmal-3b5634103,2021-06-01,2023-01-01,national development bank plc (ndb),20,5,Financial Services,"1,001-5,000 employees",1979.0,Sri Lanka,19,0
2,sagara-lakmal-3b5634103,2019-10-01,2021-06-01,dialog axiata plc,21,3,Telecommunications,"1,001-5,000 employees",1993.0,Sri Lanka,39,19
3,sagara-lakmal-3b5634103,2018-04-01,2019-09-01,mas holdings,18,3,Apparel & Fashion,"10,001+ employees",,Sri Lanka,57,40
4,harsha-karunanayake-875636a8,2022-04-01,2023-01-01,mas holdings,20,5,Apparel & Fashion,"10,001+ employees",,Sri Lanka,9,0
5,harsha-karunanayake-875636a8,2021-04-01,2022-04-01,mas holdings,13,4,Apparel & Fashion,"10,001+ employees",,Sri Lanka,21,9
6,harsha-karunanayake-875636a8,2019-04-01,2021-04-01,mas holdings,25,3,Apparel & Fashion,"10,001+ employees",,Sri Lanka,45,21
7,harsha-karunanayake-875636a8,2018-05-01,2019-05-01,mas holdings,13,3,Apparel & Fashion,"10,001+ employees",,Sri Lanka,56,44
8,harsha-karunanayake-875636a8,2016-08-01,2021-01-01,kwavz,54,0,,,,Unknown,78,24
9,harsha-karunanayake-875636a8,2019-04-01,2021-01-01,freelance,24,0,,,,Unknown,45,24
10,harsha-karunanayake-875636a8,2017-09-01,2018-03-01,mas holdings,7,1,Apparel & Fashion,"10,001+ employees",,Sri Lanka,64,58


In [154]:
# Create the 'apparel_industry' column
new_df['apparel_industry'] = new_df['industry'].apply(lambda x: 1 if x == 'Apparel & Fashion' else 0)
new_df[['Profile Name', 'organisation', 'industry', 'apparel_industry']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['apparel_industry'] = new_df['industry'].apply(lambda x: 1 if x == 'Apparel & Fashion' else 0)


Unnamed: 0,Profile Name,organisation,industry,apparel_industry
1,sagara-lakmal-3b5634103,national development bank plc (ndb),Financial Services,0
2,sagara-lakmal-3b5634103,dialog axiata plc,Telecommunications,0
3,sagara-lakmal-3b5634103,mas holdings,Apparel & Fashion,1
4,harsha-karunanayake-875636a8,mas holdings,Apparel & Fashion,1
5,harsha-karunanayake-875636a8,mas holdings,Apparel & Fashion,1
6,harsha-karunanayake-875636a8,mas holdings,Apparel & Fashion,1
7,harsha-karunanayake-875636a8,mas holdings,Apparel & Fashion,1
8,harsha-karunanayake-875636a8,kwavz,,0
9,harsha-karunanayake-875636a8,freelance,,0
10,harsha-karunanayake-875636a8,mas holdings,Apparel & Fashion,1


In [155]:
# Create the 'company_size' column based on the 'size' 
size_mapping = {
    '1,000 - employees': 1,
    '1,001-5,000 employees': 2,
    '5,001-10,000 employees': 3,
    '10,001+ employees': 4
}

new_df['company_size'] = new_df['size'].map(size_mapping)
new_df[['Profile Name', 'organisation', 'size', 'company_size']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['company_size'] = new_df['size'].map(size_mapping)


Unnamed: 0,Profile Name,organisation,size,company_size
1,sagara-lakmal-3b5634103,national development bank plc (ndb),"1,001-5,000 employees",2.0
2,sagara-lakmal-3b5634103,dialog axiata plc,"1,001-5,000 employees",2.0
3,sagara-lakmal-3b5634103,mas holdings,"10,001+ employees",4.0
4,harsha-karunanayake-875636a8,mas holdings,"10,001+ employees",4.0
5,harsha-karunanayake-875636a8,mas holdings,"10,001+ employees",4.0
6,harsha-karunanayake-875636a8,mas holdings,"10,001+ employees",4.0
7,harsha-karunanayake-875636a8,mas holdings,"10,001+ employees",4.0
8,harsha-karunanayake-875636a8,kwavz,,
9,harsha-karunanayake-875636a8,freelance,,
10,harsha-karunanayake-875636a8,mas holdings,"10,001+ employees",4.0


In [156]:
# Encoding sri_lankan
new_df['Headquaters_in_Sri_Lanka'] = new_df['headquaters'].apply(lambda x: 1 if 'Sri Lanka' in x else 0)
new_df[['Profile Name', 'organisation', 'headquaters', 'Headquaters_in_Sri_Lanka']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Headquaters_in_Sri_Lanka'] = new_df['headquaters'].apply(lambda x: 1 if 'Sri Lanka' in x else 0)


Unnamed: 0,Profile Name,organisation,headquaters,Headquaters_in_Sri_Lanka
1,sagara-lakmal-3b5634103,national development bank plc (ndb),Sri Lanka,1
2,sagara-lakmal-3b5634103,dialog axiata plc,Sri Lanka,1
3,sagara-lakmal-3b5634103,mas holdings,Sri Lanka,1
4,harsha-karunanayake-875636a8,mas holdings,Sri Lanka,1
5,harsha-karunanayake-875636a8,mas holdings,Sri Lanka,1
6,harsha-karunanayake-875636a8,mas holdings,Sri Lanka,1
7,harsha-karunanayake-875636a8,mas holdings,Sri Lanka,1
8,harsha-karunanayake-875636a8,kwavz,Unknown,0
9,harsha-karunanayake-875636a8,freelance,Unknown,0
10,harsha-karunanayake-875636a8,mas holdings,Sri Lanka,1


In [157]:
# Calculate the 'company_age_years' column,
new_df['founded'] = pd.to_numeric(new_df['founded'])
new_df['company_age_years'] = new_df['founded'].apply(lambda x: 0 if x == 0 else 2023 - x)

new_df[['Profile Name', 'organisation', 'founded', 'company_age_years']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['founded'] = pd.to_numeric(new_df['founded'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['company_age_years'] = new_df['founded'].apply(lambda x: 0 if x == 0 else 2023 - x)


Unnamed: 0,Profile Name,organisation,founded,company_age_years
1,sagara-lakmal-3b5634103,national development bank plc (ndb),1979.0,44.0
2,sagara-lakmal-3b5634103,dialog axiata plc,1993.0,30.0
3,sagara-lakmal-3b5634103,mas holdings,,
4,harsha-karunanayake-875636a8,mas holdings,,
5,harsha-karunanayake-875636a8,mas holdings,,
6,harsha-karunanayake-875636a8,mas holdings,,
7,harsha-karunanayake-875636a8,mas holdings,,
8,harsha-karunanayake-875636a8,kwavz,,
9,harsha-karunanayake-875636a8,freelance,,
10,harsha-karunanayake-875636a8,mas holdings,,


In [158]:
# Create a new DataFrame with selected columns
selected_columns = ['Profile Name','Start Time', 'End Time', 'Job Level', 'organisation','Duration', 'start_recency_months', 'end_recency_months', 'apparel_industry', 'company_size','Headquaters_in_Sri_Lanka', 'company_age_years']
new_selected_df = new_df[selected_columns]

new_selected_df

Unnamed: 0,Profile Name,Start Time,End Time,Job Level,organisation,Duration,start_recency_months,end_recency_months,apparel_industry,company_size,Headquaters_in_Sri_Lanka,company_age_years
1,sagara-lakmal-3b5634103,2021-06-01,2023-01-01,5,national development bank plc (ndb),20,19,0,0,2.0,1,44.0
2,sagara-lakmal-3b5634103,2019-10-01,2021-06-01,3,dialog axiata plc,21,39,19,0,2.0,1,30.0
3,sagara-lakmal-3b5634103,2018-04-01,2019-09-01,3,mas holdings,18,57,40,1,4.0,1,
4,harsha-karunanayake-875636a8,2022-04-01,2023-01-01,5,mas holdings,20,9,0,1,4.0,1,
5,harsha-karunanayake-875636a8,2021-04-01,2022-04-01,4,mas holdings,13,21,9,1,4.0,1,
6,harsha-karunanayake-875636a8,2019-04-01,2021-04-01,3,mas holdings,25,45,21,1,4.0,1,
7,harsha-karunanayake-875636a8,2018-05-01,2019-05-01,3,mas holdings,13,56,44,1,4.0,1,
8,harsha-karunanayake-875636a8,2016-08-01,2021-01-01,0,kwavz,54,78,24,0,,0,
9,harsha-karunanayake-875636a8,2019-04-01,2021-01-01,0,freelance,24,45,24,0,,0,
10,harsha-karunanayake-875636a8,2017-09-01,2018-03-01,1,mas holdings,7,64,58,1,4.0,1,


In [159]:
#company change
new_selected_df.sort_values(by=['Profile Name', 'Start Time'], ascending=[False, False], inplace=True)
new_selected_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_selected_df.sort_values(by=['Profile Name', 'Start Time'], ascending=[False, False], inplace=True)


Unnamed: 0,Profile Name,Start Time,End Time,Job Level,organisation,Duration,start_recency_months,end_recency_months,apparel_industry,company_size,Headquaters_in_Sri_Lanka,company_age_years
1,sagara-lakmal-3b5634103,2021-06-01,2023-01-01,5,national development bank plc (ndb),20,19,0,0,2.0,1,44.0
2,sagara-lakmal-3b5634103,2019-10-01,2021-06-01,3,dialog axiata plc,21,39,19,0,2.0,1,30.0
3,sagara-lakmal-3b5634103,2018-04-01,2019-09-01,3,mas holdings,18,57,40,1,4.0,1,
4,harsha-karunanayake-875636a8,2022-04-01,2023-01-01,5,mas holdings,20,9,0,1,4.0,1,
5,harsha-karunanayake-875636a8,2021-04-01,2022-04-01,4,mas holdings,13,21,9,1,4.0,1,
6,harsha-karunanayake-875636a8,2019-04-01,2021-04-01,3,mas holdings,25,45,21,1,4.0,1,
9,harsha-karunanayake-875636a8,2019-04-01,2021-01-01,0,freelance,24,45,24,0,,0,
7,harsha-karunanayake-875636a8,2018-05-01,2019-05-01,3,mas holdings,13,56,44,1,4.0,1,
10,harsha-karunanayake-875636a8,2017-09-01,2018-03-01,1,mas holdings,7,64,58,1,4.0,1,
8,harsha-karunanayake-875636a8,2016-08-01,2021-01-01,0,kwavz,54,78,24,0,,0,


In [160]:
new_selected_df = new_selected_df.reset_index(drop=True)
new_selected_df

Unnamed: 0,Profile Name,Start Time,End Time,Job Level,organisation,Duration,start_recency_months,end_recency_months,apparel_industry,company_size,Headquaters_in_Sri_Lanka,company_age_years
0,sagara-lakmal-3b5634103,2021-06-01,2023-01-01,5,national development bank plc (ndb),20,19,0,0,2.0,1,44.0
1,sagara-lakmal-3b5634103,2019-10-01,2021-06-01,3,dialog axiata plc,21,39,19,0,2.0,1,30.0
2,sagara-lakmal-3b5634103,2018-04-01,2019-09-01,3,mas holdings,18,57,40,1,4.0,1,
3,harsha-karunanayake-875636a8,2022-04-01,2023-01-01,5,mas holdings,20,9,0,1,4.0,1,
4,harsha-karunanayake-875636a8,2021-04-01,2022-04-01,4,mas holdings,13,21,9,1,4.0,1,
5,harsha-karunanayake-875636a8,2019-04-01,2021-04-01,3,mas holdings,25,45,21,1,4.0,1,
6,harsha-karunanayake-875636a8,2019-04-01,2021-01-01,0,freelance,24,45,24,0,,0,
7,harsha-karunanayake-875636a8,2018-05-01,2019-05-01,3,mas holdings,13,56,44,1,4.0,1,
8,harsha-karunanayake-875636a8,2017-09-01,2018-03-01,1,mas holdings,7,64,58,1,4.0,1,
9,harsha-karunanayake-875636a8,2016-08-01,2021-01-01,0,kwavz,54,78,24,0,,0,


In [161]:
# Initialize an empty list to store the values for the new "company_change" column
company_change = []

# Initialize a variable to keep track of the current profile
current_profile = None
previous_organization = None

# Iterate over the rows of the DataFrame in reverse order
for index in reversed(new_selected_df.index):
    row = new_selected_df.loc[index]
    if row['Profile Name'] != current_profile:
        # If the profile has changed, set the company_change value to -1
        company_change.insert(0, -1)
        current_profile = row['Profile Name']
        previous_organization = row['organisation']  # Reset the previous_organization
    else:
        # Check if the organization has changed compared to the next row
        # next_row = new_selected_df.loc[new_selected_df.index[new_selected_df.index.get_loc(index) - 1]]
        if row['organisation'] != previous_organization:
            company_change.insert(0, 1)  # Organization changed
        else:
            if previous_organization is None:
                company_change.insert(0, -1)  # First organization in profile
            else:
                company_change.insert(0, 0)  # Organization did not change
        previous_organization = row['organisation']

# Add the "company_change" column to the DataFrame
new_selected_df['company_change'] = company_change

In [162]:
new_selected_df[['Profile Name', 'organisation', 'company_change']]

Unnamed: 0,Profile Name,organisation,company_change
0,sagara-lakmal-3b5634103,national development bank plc (ndb),1
1,sagara-lakmal-3b5634103,dialog axiata plc,1
2,sagara-lakmal-3b5634103,mas holdings,-1
3,harsha-karunanayake-875636a8,mas holdings,0
4,harsha-karunanayake-875636a8,mas holdings,0
5,harsha-karunanayake-875636a8,mas holdings,1
6,harsha-karunanayake-875636a8,freelance,1
7,harsha-karunanayake-875636a8,mas holdings,0
8,harsha-karunanayake-875636a8,mas holdings,1
9,harsha-karunanayake-875636a8,kwavz,1


In [169]:
def calculate_cumulative_company_changes(df):
    # Initialize a dictionary to store the cumulative counts for each profile
    cumulative_counts = {}
    
    # Initialize a variable to keep track of the current profile
    current_profile = None
    
    # Iterate over the rows of the DataFrame in reverse order
    for index in reversed(df.index):
        row = df.loc[index]
        if row['Profile Name'] != current_profile:
            # If the profile has changed, reset the cumulative count to 1
            cumulative_counts[row['Profile Name']] = 1
            current_profile = row['Profile Name']
        else:
            # Check if the organization has changed compared to the next row
            next_row = df.loc[df.index[df.index.get_loc(index) - 1]]
            if row['organisation'] != next_row['organisation']:
                cumulative_counts[current_profile] += 1
        
    # Create a list of cumulative counts based on the DataFrame rows
    cumulative_count_list = [cumulative_counts[row['Profile Name']] for index, row in df.iterrows()]
    
    # Add the cumulative count as a new column in the DataFrame
    df['cumulative_company_changes'] = cumulative_count_list

    return df

In [171]:
# Call the function to calculate cumulative company changes
new_selected_c_df = calculate_cumulative_company_changes(new_selected_df)
new_selected_c_df[['Profile Name', 'organisation', 'company_change', 'cumulative_company_changes']]

Unnamed: 0,Profile Name,organisation,company_change,cumulative_company_changes
0,sagara-lakmal-3b5634103,national development bank plc (ndb),1,3
1,sagara-lakmal-3b5634103,dialog axiata plc,1,3
2,sagara-lakmal-3b5634103,mas holdings,-1,3
3,harsha-karunanayake-875636a8,mas holdings,0,4
4,harsha-karunanayake-875636a8,mas holdings,0,4
5,harsha-karunanayake-875636a8,mas holdings,1,4
6,harsha-karunanayake-875636a8,freelance,1,4
7,harsha-karunanayake-875636a8,mas holdings,0,4
8,harsha-karunanayake-875636a8,mas holdings,1,4
9,harsha-karunanayake-875636a8,kwavz,1,4


In [54]:
# # Assign 'company_change' based on 'Profile Name' and 'organisation'
# new_selected_df['company_change'] = (new_selected_df.groupby('Profile Name')['organisation']
#     .apply(lambda x: x.ne(x.shift()).cumsum() - 1)
#     .replace(0, -1))

# # Set 'company_change' to 0 when 'organisation' is null or when it's the same as the prior organization or when it's zero
# new_selected_df['company_change'] = new_selected_df['company_change'].where(
#     (new_selected_df['organisation'].notnull()) &
#     (new_selected_df['organisation'] != new_selected_df['organisation'].shift()) &
#     (new_selected_df['organisation'] != "0") &
#     (new_selected_df['organisation'] != 0),
#     0
# )

# Assign 'company_change' based on 'Profile Name' and 'organisation'
new_selected_df['company_change'] = (new_selected_df.groupby('Profile Name')['organisation']
    .apply(lambda x: x.ne(x.shift()).cumsum() - 1)
    .replace(0, -1)
    .reset_index(drop=True)  # Reset the index to match the original DataFrame
)

# Set 'company_change' to 0 when 'organisation' is null or when it's the same as the prior organization or when it's zero
new_selected_df['company_change'] = new_selected_df['company_change'].where(
    (new_selected_df['organisation'].notnull()) &
    (new_selected_df['organisation'] != new_selected_df['organisation'].shift()) &
    (new_selected_df['organisation'] != "0") &
    (new_selected_df['organisation'] != 0),
    0
)

new_selected_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_selected_df['company_change'] = (new_selected_df.groupby('Profile Name')['organisation']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_selected_df['company_change'] = new_selected_df['company_change'].where(


Unnamed: 0,Profile Name,Start Time,End Time,Job Level,organisation,Duration,start_recency_months,end_recency_months,apparel_industry,company_size,Headquaters_in_Sri_Lanka,company_age_years,company_change
8,harsha-karunanayake-875636a8,2016-01-01,2021-01-01,0,kwavz,54,85.0,24.0,0,,0,,-1.0
10,harsha-karunanayake-875636a8,2017-01-01,2018-01-01,1,mas holdings,7,73.0,60.0,1,4.0,1,,2.0
7,harsha-karunanayake-875636a8,2018-01-01,2019-01-01,3,mas holdings,13,60.0,48.0,1,4.0,1,,0.0
6,harsha-karunanayake-875636a8,2019-01-01,2021-01-01,0,mas holdings,25,48.0,24.0,1,4.0,1,,0.0
9,harsha-karunanayake-875636a8,2019-01-01,NaT,0,freelance,24,48.0,,0,,0,,1.0
5,harsha-karunanayake-875636a8,2021-01-01,2022-01-01,0,mas holdings,13,24.0,12.0,1,4.0,1,,3.0
4,harsha-karunanayake-875636a8,2022-01-01,2023-01-01,0,mas holdings,19,12.0,0.0,1,4.0,1,,0.0
11,harsha-karunanayake-875636a8,NaT,NaT,0,hsbc,12,,,0,4.0,0,,
3,sagara-lakmal-3b5634103,2018-01-01,2019-01-01,3,mas holdings,18,60.0,48.0,1,4.0,1,,1.0
2,sagara-lakmal-3b5634103,2019-01-01,2021-01-01,3,dialog axiata plc,21,48.0,24.0,0,2.0,1,30.0,1.0


In [None]:
# Set 'company_change' to 0 when 'organisation' is null or when it's the same as the prior organization or when it's zero
new_selected_df['company_change'] = new_selected_df['company_change'].where(
    (new_selected_df['organisation'].notnull()) &
    (new_selected_df['organisation'] != new_selected_df['organisation'].shift()) &
    (new_selected_df['organisation'] != "0") &
    (new_selected_df['organisation'] != 0),
    0
)

In [353]:
new_selected_df

Unnamed: 0,Profile Name,Start Time,Job Level,organisation,Duration,start_recency_months,end_recency_months,apparel_industry,company_size,Headquaters,company_age_years,company_change
4,harsha-karunanayake-875636a8,2016-01-01,0,0,54,85.0,24.0,0,,0,,0
6,harsha-karunanayake-875636a8,2017-01-01,1,0,7,73.0,60.0,0,,0,,0
3,harsha-karunanayake-875636a8,2018-01-01,3,mas holdings,13,60.0,48.0,1,4.0,1,36.0,1
2,harsha-karunanayake-875636a8,2019-01-01,0,mas holdings,25,48.0,24.0,1,4.0,1,36.0,0
5,harsha-karunanayake-875636a8,2019-01-01,0,0,24,48.0,,0,,0,,0
1,harsha-karunanayake-875636a8,2021-01-01,0,mas holdings,13,24.0,12.0,1,4.0,1,36.0,3
0,harsha-karunanayake-875636a8,2022-01-01,0,mas holdings,19,12.0,0.0,1,4.0,1,36.0,0
7,harsha-karunanayake-875636a8,NaT,0,hsbc,12,,,0,4.0,0,143.0,4
11,sagara-lakmal-3b5634103,2018-01-01,3,mas holdings,18,60.0,48.0,1,4.0,1,36.0,-1
10,sagara-lakmal-3b5634103,2019-01-01,3,dialog axiata plc,21,48.0,24.0,0,2.0,1,30.0,1


In [354]:
#cumulative count
new_selected_df = new_selected_df.sort_values(by=['Profile Name', 'Start Time'])

def update_cumulative_company_number(group):
    mask = (group['organisation'] != group['organisation'].shift()) & (group['organisation'].notnull()) & (group['organisation'] != zero_value)
    group['cumulative_company_number'] = mask.cumsum()
    return group

new_selected_df = new_selected_df[new_selected_df['organisation'] != zero_value].groupby('Profile Name').apply(update_cumulative_company_number).reset_index(drop=True)


In [355]:
new_selected_df

Unnamed: 0,Profile Name,Start Time,Job Level,organisation,Duration,start_recency_months,end_recency_months,apparel_industry,company_size,Headquaters,company_age_years,company_change,cumulative_company_number
0,harsha-karunanayake-875636a8,2018-01-01,3,mas holdings,13,60.0,48.0,1,4.0,1,36.0,1,1
1,harsha-karunanayake-875636a8,2019-01-01,0,mas holdings,25,48.0,24.0,1,4.0,1,36.0,0,1
2,harsha-karunanayake-875636a8,2021-01-01,0,mas holdings,13,24.0,12.0,1,4.0,1,36.0,3,1
3,harsha-karunanayake-875636a8,2022-01-01,0,mas holdings,19,12.0,0.0,1,4.0,1,36.0,0,1
4,harsha-karunanayake-875636a8,NaT,0,hsbc,12,,,0,4.0,0,143.0,4,2
5,sagara-lakmal-3b5634103,2018-01-01,3,mas holdings,18,60.0,48.0,1,4.0,1,36.0,-1,1
6,sagara-lakmal-3b5634103,2019-01-01,3,dialog axiata plc,21,48.0,24.0,0,2.0,1,30.0,1,2
7,sagara-lakmal-3b5634103,2021-01-01,5,national development bank plc (ndb),20,24.0,0.0,0,2.0,0,44.0,2,3


In [356]:
new_selected_df['Start Time'] = pd.to_datetime(new_selected_df['Start Time'])
new_selected_df = new_selected_df.sort_values(by=['Profile Name', 'Start Time', 'Job Level'])

# Calculate the "Level Up" column based on job level changes
new_selected_df['Level Up'] = (new_selected_df.groupby('Profile Name')['Job Level']
            .diff()
            .fillna(0)
            .gt(0)
            .astype(int))

In [357]:
new_selected_df

Unnamed: 0,Profile Name,Start Time,Job Level,organisation,Duration,start_recency_months,end_recency_months,apparel_industry,company_size,Headquaters,company_age_years,company_change,cumulative_company_number,Level Up
0,harsha-karunanayake-875636a8,2018-01-01,3,mas holdings,13,60.0,48.0,1,4.0,1,36.0,1,1,0
1,harsha-karunanayake-875636a8,2019-01-01,0,mas holdings,25,48.0,24.0,1,4.0,1,36.0,0,1,0
2,harsha-karunanayake-875636a8,2021-01-01,0,mas holdings,13,24.0,12.0,1,4.0,1,36.0,3,1,0
3,harsha-karunanayake-875636a8,2022-01-01,0,mas holdings,19,12.0,0.0,1,4.0,1,36.0,0,1,0
4,harsha-karunanayake-875636a8,NaT,0,hsbc,12,,,0,4.0,0,143.0,4,2,0
5,sagara-lakmal-3b5634103,2018-01-01,3,mas holdings,18,60.0,48.0,1,4.0,1,36.0,-1,1,0
6,sagara-lakmal-3b5634103,2019-01-01,3,dialog axiata plc,21,48.0,24.0,0,2.0,1,30.0,1,2,0
7,sagara-lakmal-3b5634103,2021-01-01,5,national development bank plc (ndb),20,24.0,0.0,0,2.0,0,44.0,2,3,1


In [358]:
new_selected_df['lateral_change'] = np.where(
    (new_selected_df['company_change'] != 0) &
    (new_selected_df['Level Up'] == 0),
    1,
    0
)

In [359]:
new_selected_df

Unnamed: 0,Profile Name,Start Time,Job Level,organisation,Duration,start_recency_months,end_recency_months,apparel_industry,company_size,Headquaters,company_age_years,company_change,cumulative_company_number,Level Up,lateral_change
0,harsha-karunanayake-875636a8,2018-01-01,3,mas holdings,13,60.0,48.0,1,4.0,1,36.0,1,1,0,1
1,harsha-karunanayake-875636a8,2019-01-01,0,mas holdings,25,48.0,24.0,1,4.0,1,36.0,0,1,0,0
2,harsha-karunanayake-875636a8,2021-01-01,0,mas holdings,13,24.0,12.0,1,4.0,1,36.0,3,1,0,1
3,harsha-karunanayake-875636a8,2022-01-01,0,mas holdings,19,12.0,0.0,1,4.0,1,36.0,0,1,0,0
4,harsha-karunanayake-875636a8,NaT,0,hsbc,12,,,0,4.0,0,143.0,4,2,0,1
5,sagara-lakmal-3b5634103,2018-01-01,3,mas holdings,18,60.0,48.0,1,4.0,1,36.0,-1,1,0,1
6,sagara-lakmal-3b5634103,2019-01-01,3,dialog axiata plc,21,48.0,24.0,0,2.0,1,30.0,1,2,0,1
7,sagara-lakmal-3b5634103,2021-01-01,5,national development bank plc (ndb),20,24.0,0.0,0,2.0,0,44.0,2,3,1,0


In [362]:
selected_columns = ['Profile Name' ,'Job Level', 'Duration', 'start_recency_months', 'end_recency_months', 'apparel_industry', 'company_size','Headquaters', 'company_age_years','company_change','cumulative_company_number','Level Up','lateral_change']
final_df = new_selected_df[selected_columns]

In [363]:
final_df

Unnamed: 0,Profile Name,Job Level,Duration,start_recency_months,end_recency_months,apparel_industry,company_size,Headquaters,company_age_years,company_change,cumulative_company_number,Level Up,lateral_change
0,harsha-karunanayake-875636a8,3,13,60.0,48.0,1,4.0,1,36.0,1,1,0,1
1,harsha-karunanayake-875636a8,0,25,48.0,24.0,1,4.0,1,36.0,0,1,0,0
2,harsha-karunanayake-875636a8,0,13,24.0,12.0,1,4.0,1,36.0,3,1,0,1
3,harsha-karunanayake-875636a8,0,19,12.0,0.0,1,4.0,1,36.0,0,1,0,0
4,harsha-karunanayake-875636a8,0,12,,,0,4.0,0,143.0,4,2,0,1
5,sagara-lakmal-3b5634103,3,18,60.0,48.0,1,4.0,1,36.0,-1,1,0,1
6,sagara-lakmal-3b5634103,3,21,48.0,24.0,0,2.0,1,30.0,1,2,0,1
7,sagara-lakmal-3b5634103,5,20,24.0,0.0,0,2.0,0,44.0,2,3,1,0
