In [6]:
import pandas as pd
import datetime
from datetime import datetime

In [3]:
file_path_json = 'data\scraped_people_data\people_2023-11-16T08-24-55.json'

# Read the JSONL file
df = pd.read_json(file_path_json)
df

Unnamed: 0,profile,url,name,experience,education
0,sohan-deshantha-889b3127,https://www.linkedin.com/in/sohan-deshantha-88...,Sohan Deshantha,[{'position': 'Deputy General Manager - P2P & ...,[{'organisation': 'Cardiff Metropolitan Univer...


In [4]:
# df['experience'] = df['experience'].apply(lambda x: json.loads(x) if isinstance(x, str) else x)

# # Create empty lists to store extracted data
# profile_names = []
# organization_profiles = []
# positions = []
# organisation =[]
# start_times = []
# end_times = []
# durations = []

# for idx, row in df.iterrows():
#     name = row['profile']  # Get the 'name' 
#     experience_data = row['experience']
    
#     for exp in experience_data:
#         profile_names.append(name)
#         organization_profiles.append(exp.get("organisation_profile", ""))
#         positions.append(exp.get("position", ""))
#         organisation.append(exp.get("organisation", ""))
#         start_times.append(exp.get("start_time", ""))

# extracted_data_df = pd.DataFrame({
#     'profile_name': profile_names,
#     'organisation':organisation,
#     'position': positions,
#     'start_time': start_times,
# })
# extracted_data_df

def normalize_data(df):
    df['experience'] = df['experience'].apply(lambda x: json.loads(x) if isinstance(x, str) else x)
    
    # Create empty lists to store extracted data
    profile_names = []
    organisation =[]
    start_times = []

    for idx, row in df.iterrows():
        name = row['profile']  # Get the 'name' 
        experience_data = row['experience']

        for exp in experience_data:
            profile_names.append(name)
            organisation.append(exp.get("organisation", ""))
            start_times.append(exp.get("start_time", ""))

    df = pd.DataFrame({
        'profile_name': profile_names,
        'organisation':organisation,
        'start_time': start_times,
    })
    
    return df

df = normalize_data(df)
df

Unnamed: 0,profile_name,organisation,start_time
0,sohan-deshantha-889b3127,MAS Legato,Apr 2023
1,sohan-deshantha-889b3127,MAS Holdings,Aug 2021
2,sohan-deshantha-889b3127,MAS Holdings,Jan 2020
3,sohan-deshantha-889b3127,MAS Holdings,Apr 2018
4,sohan-deshantha-889b3127,MAS Holdings,Apr 2006
5,sohan-deshantha-889b3127,MAS Holdings,Aug 2004


In [5]:
def transform_organisation(df):
    # Convert 'organisation' column to lowercase
    df['organisation'] = df['organisation'].str.lower()

    # Apply the condition and update the 'organisation' column
    mask = df['organisation'].str[:3] == 'mas'
    df.loc[mask, 'organisation'] = 'mas holdings'

    # List of child organizations and the parent organization
    child_organizations = ['mas intimates', 'mas kreeda', 'mas active', 'linea aqua', 'mas Linea Aqua', 'bodyline', 'mas legato',
                           'silueta - technologies by mas', 'twinery - innovations by mas', 'noyon lanka pvt ltd', 'mas matrix',
                           'hellmann mas supply chain', 'silueta', 'twinery', 'noyon',
                          ]

    parent_organization = 'mas holdings'

    # Update the 'organisation' column for child organizations
    for child_org in child_organizations:
        df.loc[df['organisation'].str.contains(child_org, case=False, na=False), 'organisation'] = parent_organization

    return df

df = transform_organisation(df)
df

Unnamed: 0,profile_name,organisation,start_time
0,sohan-deshantha-889b3127,mas holdings,Apr 2023
1,sohan-deshantha-889b3127,mas holdings,Aug 2021
2,sohan-deshantha-889b3127,mas holdings,Jan 2020
3,sohan-deshantha-889b3127,mas holdings,Apr 2018
4,sohan-deshantha-889b3127,mas holdings,Apr 2006
5,sohan-deshantha-889b3127,mas holdings,Aug 2004


In [7]:
def convert_month_year_to_date_column(df):
    def convert_month_year_to_date(value):
        try:
            if len(value) == 4:
                # Assume it's in "yyyy" format
                date = pd.to_datetime(value, format='%Y')
            elif len(value) > 4:
                # Assume it's in "Mon yyyy" format
                date = pd.to_datetime(value, format='%b %Y')
            else:
                # If the length doesn't match either format, return None
                date = None
            return date
        except ValueError:
            return None

    # Convert the specified column
    df['start_time'] = df['start_time'].apply(convert_month_year_to_date)

    return df

df = convert_month_year_to_date_column(df)
df

Unnamed: 0,profile_name,organisation,start_time
0,sohan-deshantha-889b3127,mas holdings,2023-04-01
1,sohan-deshantha-889b3127,mas holdings,2021-08-01
2,sohan-deshantha-889b3127,mas holdings,2020-01-01
3,sohan-deshantha-889b3127,mas holdings,2018-04-01
4,sohan-deshantha-889b3127,mas holdings,2006-04-01
5,sohan-deshantha-889b3127,mas holdings,2004-08-01


In [8]:
def calculate_company_change(df):
    # Sort the dataframe vy profile and start time
    df.sort_values(by=['profile_name', 'start_time'], ascending=[False, False], inplace=True)

    # Reset indexes
    df = df.reset_index(drop=True)

    # Initialize an empty list to store the values for the new "company_change" column
    company_change = []

    # Initialize a variable to keep track of the current profile
    current_profile = None
    previous_organization = None

    # Iterate over the rows of the DataFrame in reverse order
    for index in reversed(df.index):
        row = df.loc[index]
        if row['profile_name'] != current_profile:
            # If the profile has changed, set the company_change value to -1
            company_change.insert(0, -1)
            current_profile = row['profile_name']
            previous_organization = row['organisation']
        else:
            # Check if the organization has changed compared to the next row
            if row['organisation'] != previous_organization:
                company_change.insert(0, 1)  # Organization changed
            else:
                if previous_organization is None:
                    company_change.insert(0, -1)  # First organization in profile
                else:
                    company_change.insert(0, 0)  # Organization did not change
            previous_organization = row['organisation']

    # Add the "company_change" column to the DataFrame
    df['company_change'] = company_change

    return df

df = calculate_company_change(df)
df

Unnamed: 0,profile_name,organisation,start_time,company_change
0,sohan-deshantha-889b3127,mas holdings,2023-04-01,0
1,sohan-deshantha-889b3127,mas holdings,2021-08-01,0
2,sohan-deshantha-889b3127,mas holdings,2020-01-01,0
3,sohan-deshantha-889b3127,mas holdings,2018-04-01,0
4,sohan-deshantha-889b3127,mas holdings,2006-04-01,0
5,sohan-deshantha-889b3127,mas holdings,2004-08-01,-1


In [9]:
def calculate_job_change(df):
    # Drop duplicates based on 'profile_name' and keep the first occurrence
    df = df.drop_duplicates(subset='profile_name', keep='first')

    # Calculate 'job_change' column
    df['job_change'] = ((df['start_time'] >= '2023-01-01') & (df['company_change'] == 1)).astype(int)

    return df

df = calculate_job_change(df)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['job_change'] = ((df['start_time'] >= '2023-01-01') & (df['company_change'] == 1)).astype(int)


Unnamed: 0,profile_name,organisation,start_time,company_change,job_change
0,sohan-deshantha-889b3127,mas holdings,2023-04-01,0,0


In [11]:
def calculate_job_change_horizon(df):
    # Calculate 'job_change_horizon' column
    df['job_change_horizon'] = df.apply(lambda row: ((row['start_time'] - datetime(2023, 1, 1)).days // 30) + 1 if row['job_change'] == 1 else 0, axis=1)

    df = df[['profile_name', 'job_change', 'job_change_horizon']]
    return df

df = calculate_job_change_horizon(df)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['job_change_horizon'] = df.apply(lambda row: ((row['start_time'] - datetime(2023, 1, 1)).days // 30) + 1 if row['job_change'] == 1 else 0, axis=1)


Unnamed: 0,profile_name,job_change,job_change_horizon
0,sohan-deshantha-889b3127,0,0


In [12]:
df['job_change'].value_counts()

job_change
0    1
Name: count, dtype: int64

In [34]:
df.to_csv('y_variable_for_clean.csv')