In [70]:
import pandas as pd

file_path_jsonl = r'data\profiles_sagara_harsha.jsonl'

# Read the JSONL file
df = pd.read_json(file_path_jsonl, lines=True)
df

Unnamed: 0,profile,name,experience,education
0,sagara-lakmal-3b5634103,Sagara Lakmal,[{'position': 'Manager - Data Analytics & Gove...,[{'organisation': 'University of Colombo Schoo...
1,harsha-karunanayake-875636a8,Harsha Karunanayake,[{'position': 'Process Improvement Consultant'...,[{'organisation': 'University of Kelaniya Sri ...


In [72]:
import ast

df['education'] = df['education'].apply(lambda x: json.loads(x) if isinstance(x, str) else x)
# df['education'] = df['education'].apply(ast.literal_eval)

# Create empty lists to store extracted data
profile_names = []
organization_profiles = []
organisation = []
course_details = []
education_description = []
start_times = []
end_times = []

# Iterate through the DataFrame and extract data
for idx, row in df.iterrows():
    profile = row['profile']
    education_data = row['education']
    
    for edu in education_data:
        profile_names.append(profile)
        organization_profiles.append(edu.get("organisation_profile", ""))
        organisation.append(edu.get("organisation", ""))
        course_details.append(edu.get("course_details", ""))
        education_description.append(edu.get("description", ""))
        start_times.append(edu.get("start_time", ""))
        end_times.append(edu.get("end_time", ""))

# Create a DataFrame from the extracted data
extracted_education_df = pd.DataFrame({
    'profile_name': profile_names,
    'organization_profile': organization_profiles,
    'organisation': organisation,
    'course_details': course_details,
    'description': education_description,
    'start_time': start_times,
    'end_time': end_times
})

extracted_education_df

Unnamed: 0,profile_name,organization_profile,organisation,course_details,description,start_time,end_time
0,sagara-lakmal-3b5634103,https://www.linkedin.com/school/ucsc-lk/,University of Colombo School of Computing,Master of Business Analytics,,2021,present
1,sagara-lakmal-3b5634103,https://www.linkedin.com/school/university-of-...,University of Kelaniya Sri Lanka,"BSc in Management & Information Technology, sp...",Academic Achievements:,2013,2018
2,sagara-lakmal-3b5634103,https://www.linkedin.com/school/cima/,CIMA,Part Qualified Accounting and Business/Management,,2012,2013
3,sagara-lakmal-3b5634103,https://www.linkedin.com/school/anandacollege/,Ananda College - Colombo 10,GCE Advance Level Combined Mathematics,,2003,2011
4,harsha-karunanayake-875636a8,https://www.linkedin.com/school/university-of-...,University of Kelaniya Sri Lanka,Bsc in Management and Information Technology -...,,2013,2017
5,harsha-karunanayake-875636a8,https://www.linkedin.com/school/dharmarajacoll...,Dharmaraja College - Kandy,Advance level Mathematics Selected for Univers...,,2008,2012


# Get course level

In [73]:
# Load course_mapping_df
course_mapping_df = pd.read_excel("course_mapping.xlsx")

# Convert the relevant columns to lowercase for case-insensitive matching
course_mapping_df['Course'] = course_mapping_df['Course'].str.lower()
extracted_education_df['course_details'] = extracted_education_df['course_details'].str.lower()

# Define a function to map the course details to the corresponding course level
def get_course_level(course_details):
    for course in course_mapping_df['Course']:
        if course in course_details:
            level = course_mapping_df.loc[course_mapping_df['Course'] == course, 'Level'].values
            if len(level) > 0:
                return level[0]

    # If no match is found, return 0
    return 1

# Apply the get_course_level function and create a new "Course_Level" column
extracted_education_df['course_level'] = extracted_education_df['course_details'].apply(get_course_level)

# Print the updated extracted_education_df
extracted_education_df


Unnamed: 0,profile_name,organization_profile,organisation,course_details,description,start_time,end_time,course_level
0,sagara-lakmal-3b5634103,https://www.linkedin.com/school/ucsc-lk/,University of Colombo School of Computing,master of business analytics,,2021,present,5
1,sagara-lakmal-3b5634103,https://www.linkedin.com/school/university-of-...,University of Kelaniya Sri Lanka,"bsc in management & information technology, sp...",Academic Achievements:,2013,2018,4
2,sagara-lakmal-3b5634103,https://www.linkedin.com/school/cima/,CIMA,part qualified accounting and business/management,,2012,2013,1
3,sagara-lakmal-3b5634103,https://www.linkedin.com/school/anandacollege/,Ananda College - Colombo 10,gce advance level combined mathematics,,2003,2011,2
4,harsha-karunanayake-875636a8,https://www.linkedin.com/school/university-of-...,University of Kelaniya Sri Lanka,bsc in management and information technology -...,,2013,2017,4
5,harsha-karunanayake-875636a8,https://www.linkedin.com/school/dharmarajacoll...,Dharmaraja College - Kandy,advance level mathematics selected for univers...,,2008,2012,2


# Merge profile dataframe with institute dataframe

In [74]:
# Load institutes data to dataframe
file_path2 = r'data\institutes_sagara_harsha.jsonl'
df_school = pd.read_json(file_path2,lines=True)
df_school

Unnamed: 0,name,summary,url,industry,size,headquaters,type,founded,specialties
0,University of Kelaniya Sri Lanka,,https://www.linkedin.com/school/university-of-...,Higher Education,"1,001-5,000 employees","Kelaniya, Western",Educational,1875,"research, academic degrees, and professional d..."
1,University of Colombo School of Computing,Pioneer of ICT education in Sri Lanka,https://www.linkedin.com/school/ucsc-lk/,Higher Education,51-200 employees,"Colombo 7, Western",Educational,2002,"Undergraduate Degrees, Postgraduate Degrees, E..."
2,Ananda College,,https://www.linkedin.com/school/anandacollege/,Higher Education,51-200 employees,"Colombo 10, Western province",Educational,1886,
3,Dharmaraja College - Kandy,"""අත්තාහි අත්තනෝ නාතෝ"" - ""Attahi Attano Natho"" ...",https://www.linkedin.com/school/dharmarajacoll...,Education Management,201-500 employees,"Kandy, Central",Educational,1987,
4,CIMA,Helping people and businesses to succeed.,https://www.linkedin.com/school/cima/,Accounting,201-500 employees,"One South Place, London",Nonprofit,1919,Professional accounting qualification


In [75]:
# Convert columns to lowercase for merging
# df_school['name'] = df_school['name'].str.lower()

# Merge the DataFrames 
merged_df = pd.merge(extracted_education_df,df_school, left_on='organization_profile', right_on='url', how='left')
merged_df

Unnamed: 0,profile_name,organization_profile,organisation,course_details,description,start_time,end_time,course_level,name,summary,url,industry,size,headquaters,type,founded,specialties
0,sagara-lakmal-3b5634103,https://www.linkedin.com/school/ucsc-lk/,University of Colombo School of Computing,master of business analytics,,2021,present,5,University of Colombo School of Computing,Pioneer of ICT education in Sri Lanka,https://www.linkedin.com/school/ucsc-lk/,Higher Education,51-200 employees,"Colombo 7, Western",Educational,2002,"Undergraduate Degrees, Postgraduate Degrees, E..."
1,sagara-lakmal-3b5634103,https://www.linkedin.com/school/university-of-...,University of Kelaniya Sri Lanka,"bsc in management & information technology, sp...",Academic Achievements:,2013,2018,4,University of Kelaniya Sri Lanka,,https://www.linkedin.com/school/university-of-...,Higher Education,"1,001-5,000 employees","Kelaniya, Western",Educational,1875,"research, academic degrees, and professional d..."
2,sagara-lakmal-3b5634103,https://www.linkedin.com/school/cima/,CIMA,part qualified accounting and business/management,,2012,2013,1,CIMA,Helping people and businesses to succeed.,https://www.linkedin.com/school/cima/,Accounting,201-500 employees,"One South Place, London",Nonprofit,1919,Professional accounting qualification
3,sagara-lakmal-3b5634103,https://www.linkedin.com/school/anandacollege/,Ananda College - Colombo 10,gce advance level combined mathematics,,2003,2011,2,Ananda College,,https://www.linkedin.com/school/anandacollege/,Higher Education,51-200 employees,"Colombo 10, Western province",Educational,1886,
4,harsha-karunanayake-875636a8,https://www.linkedin.com/school/university-of-...,University of Kelaniya Sri Lanka,bsc in management and information technology -...,,2013,2017,4,University of Kelaniya Sri Lanka,,https://www.linkedin.com/school/university-of-...,Higher Education,"1,001-5,000 employees","Kelaniya, Western",Educational,1875,"research, academic degrees, and professional d..."
5,harsha-karunanayake-875636a8,https://www.linkedin.com/school/dharmarajacoll...,Dharmaraja College - Kandy,advance level mathematics selected for univers...,,2008,2012,2,Dharmaraja College - Kandy,"""අත්තාහි අත්තනෝ නාතෝ"" - ""Attahi Attano Natho"" ...",https://www.linkedin.com/school/dharmarajacoll...,Education Management,201-500 employees,"Kandy, Central",Educational,1987,


In [76]:
# Select only necessary columns for further opereations
selected_columns_school = ['profile_name', 'organisation', 'start_time','end_time', 'course_level', 'founded','size','headquaters']

new_df = merged_df[selected_columns_school]
new_df

Unnamed: 0,profile_name,organisation,start_time,end_time,course_level,founded,size,headquaters
0,sagara-lakmal-3b5634103,University of Colombo School of Computing,2021,present,5,2002,51-200 employees,"Colombo 7, Western"
1,sagara-lakmal-3b5634103,University of Kelaniya Sri Lanka,2013,2018,4,1875,"1,001-5,000 employees","Kelaniya, Western"
2,sagara-lakmal-3b5634103,CIMA,2012,2013,1,1919,201-500 employees,"One South Place, London"
3,sagara-lakmal-3b5634103,Ananda College - Colombo 10,2003,2011,2,1886,51-200 employees,"Colombo 10, Western province"
4,harsha-karunanayake-875636a8,University of Kelaniya Sri Lanka,2013,2017,4,1875,"1,001-5,000 employees","Kelaniya, Western"
5,harsha-karunanayake-875636a8,Dharmaraja College - Kandy,2008,2012,2,1987,201-500 employees,"Kandy, Central"


# Calculate start and end recency

In [77]:
from datetime import datetime

current_date = datetime(2023, 1, 31)

# Replace 'present' with the current year
new_df['end_time'] = new_df['end_time'].replace('present', current_date.year)

# Replace empty strings and NaN values with zeros
new_df['start_time'] = new_df['start_time'].replace('', '0')
new_df['end_time'] = new_df['end_time'].replace('', '0')

# Encoding the columns as per your specifications
current_year = 2023
new_df['start_recency_months'] = new_df['start_time'].astype(int).apply(lambda x: (current_year - x) * 12)
new_df['end_recency_months'] = new_df['end_time'].astype(int).apply(lambda x: (current_year - x) * 12)
new_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['end_time'] = new_df['end_time'].replace('present', current_date.year)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['start_time'] = new_df['start_time'].replace('', '0')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['end_time'] = new_df['end_time'].replace('', '0')
A value i

Unnamed: 0,profile_name,organisation,start_time,end_time,course_level,founded,size,headquaters,start_recency_months,end_recency_months
0,sagara-lakmal-3b5634103,University of Colombo School of Computing,2021,2023,5,2002,51-200 employees,"Colombo 7, Western",24,0
1,sagara-lakmal-3b5634103,University of Kelaniya Sri Lanka,2013,2018,4,1875,"1,001-5,000 employees","Kelaniya, Western",120,60
2,sagara-lakmal-3b5634103,CIMA,2012,2013,1,1919,201-500 employees,"One South Place, London",132,120
3,sagara-lakmal-3b5634103,Ananda College - Colombo 10,2003,2011,2,1886,51-200 employees,"Colombo 10, Western province",240,144
4,harsha-karunanayake-875636a8,University of Kelaniya Sri Lanka,2013,2017,4,1875,"1,001-5,000 employees","Kelaniya, Western",120,72
5,harsha-karunanayake-875636a8,Dharmaraja College - Kandy,2008,2012,2,1987,201-500 employees,"Kandy, Central",180,132


# Get company size

In [78]:
size_mapping = {
    '51-200 employees': 1,
    '201-500 employees': 2,
    '501-1000 employees': 3,
    '1,001-5,000 employees': 4,
    '5,001-10,000 employees': 5,
    '10,001+ employees': 6
}

# Replace NaN values with a default value, in this case, 0
new_df['size'].fillna('0', inplace=True)
new_df['school_size'] = new_df['size'].map(size_mapping)
new_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['size'].fillna('0', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['school_size'] = new_df['size'].map(size_mapping)


Unnamed: 0,profile_name,organisation,start_time,end_time,course_level,founded,size,headquaters,start_recency_months,end_recency_months,school_size
0,sagara-lakmal-3b5634103,University of Colombo School of Computing,2021,2023,5,2002,51-200 employees,"Colombo 7, Western",24,0,1
1,sagara-lakmal-3b5634103,University of Kelaniya Sri Lanka,2013,2018,4,1875,"1,001-5,000 employees","Kelaniya, Western",120,60,4
2,sagara-lakmal-3b5634103,CIMA,2012,2013,1,1919,201-500 employees,"One South Place, London",132,120,2
3,sagara-lakmal-3b5634103,Ananda College - Colombo 10,2003,2011,2,1886,51-200 employees,"Colombo 10, Western province",240,144,1
4,harsha-karunanayake-875636a8,University of Kelaniya Sri Lanka,2013,2017,4,1875,"1,001-5,000 employees","Kelaniya, Western",120,72,4
5,harsha-karunanayake-875636a8,Dharmaraja College - Kandy,2008,2012,2,1987,201-500 employees,"Kandy, Central",180,132,2


# Calculate institute age

In [79]:
new_df['school_age_years'] = 2023 - new_df['founded']
new_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['school_age_years'] = 2023 - new_df['founded']


Unnamed: 0,profile_name,organisation,start_time,end_time,course_level,founded,size,headquaters,start_recency_months,end_recency_months,school_size,school_age_years
0,sagara-lakmal-3b5634103,University of Colombo School of Computing,2021,2023,5,2002,51-200 employees,"Colombo 7, Western",24,0,1,21
1,sagara-lakmal-3b5634103,University of Kelaniya Sri Lanka,2013,2018,4,1875,"1,001-5,000 employees","Kelaniya, Western",120,60,4,148
2,sagara-lakmal-3b5634103,CIMA,2012,2013,1,1919,201-500 employees,"One South Place, London",132,120,2,104
3,sagara-lakmal-3b5634103,Ananda College - Colombo 10,2003,2011,2,1886,51-200 employees,"Colombo 10, Western province",240,144,1,137
4,harsha-karunanayake-875636a8,University of Kelaniya Sri Lanka,2013,2017,4,1875,"1,001-5,000 employees","Kelaniya, Western",120,72,4,148
5,harsha-karunanayake-875636a8,Dharmaraja College - Kandy,2008,2012,2,1987,201-500 employees,"Kandy, Central",180,132,2,36


# Get headquaters location

In [80]:
# Cleaning headquaters (Removing numbers and strings after comma).
def clean_headquarters(value):
    if pd.notna(value):
        parts = value.split(',')[0]
        return ''.join(filter(str.isalpha, parts))
    else:
        return value
    
def clean_headquarters_part2(value):
    if pd.notna(value):
        parts = value.split(',')[1]
        return ''.join(filter(str.isalpha, parts))
    else:
        return value

new_df['headquaters_part1'] = new_df['headquaters'].apply(clean_headquarters)
new_df['headquaters_part2'] = new_df['headquaters'].apply(clean_headquarters_part2)
new_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['headquaters_part1'] = new_df['headquaters'].apply(clean_headquarters)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['headquaters_part2'] = new_df['headquaters'].apply(clean_headquarters_part2)


Unnamed: 0,profile_name,organisation,start_time,end_time,course_level,founded,size,headquaters,start_recency_months,end_recency_months,school_size,school_age_years,headquaters_part1,headquaters_part2
0,sagara-lakmal-3b5634103,University of Colombo School of Computing,2021,2023,5,2002,51-200 employees,"Colombo 7, Western",24,0,1,21,Colombo,Western
1,sagara-lakmal-3b5634103,University of Kelaniya Sri Lanka,2013,2018,4,1875,"1,001-5,000 employees","Kelaniya, Western",120,60,4,148,Kelaniya,Western
2,sagara-lakmal-3b5634103,CIMA,2012,2013,1,1919,201-500 employees,"One South Place, London",132,120,2,104,OneSouthPlace,London
3,sagara-lakmal-3b5634103,Ananda College - Colombo 10,2003,2011,2,1886,51-200 employees,"Colombo 10, Western province",240,144,1,137,Colombo,Westernprovince
4,harsha-karunanayake-875636a8,University of Kelaniya Sri Lanka,2013,2017,4,1875,"1,001-5,000 employees","Kelaniya, Western",120,72,4,148,Kelaniya,Western
5,harsha-karunanayake-875636a8,Dharmaraja College - Kandy,2008,2012,2,1987,201-500 employees,"Kandy, Central",180,132,2,36,Kandy,Central


In [56]:
#country mapping
from geopy.geocoders import Nominatim

def get_country_from_city(city_name):
    geolocator = Nominatim(user_agent="city-to-country")
    location = geolocator.geocode(city_name)

    if location:
        country_name = location.address.split(",")[-1].strip()
        if country_name == "ශ්‍රී ලංකාව இலங்கை":
            country_name = "Sri Lanka"
        elif country_name == "Italia":
            country_name = "Unknown"
        return country_name
    else:
        return 'unidentified'

# for index, row in df.iterrows():
#     city = row['headquaters']  
#     country = get_country_from_city(city)

#     if country:
#         print(f" {country}")
#     else:
#         print(f"0")


In [81]:
new_df['headquaters_part1'] = new_df['headquaters_part1'].apply(lambda city: get_country_from_city(city))
new_df['headquaters_part2'] = new_df['headquaters_part2'].apply(lambda city: get_country_from_city(city))
new_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['headquaters_part1'] = new_df['headquaters_part1'].apply(lambda city: get_country_from_city(city))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['headquaters_part2'] = new_df['headquaters_part2'].apply(lambda city: get_country_from_city(city))


Unnamed: 0,profile_name,organisation,start_time,end_time,course_level,founded,size,headquaters,start_recency_months,end_recency_months,school_size,school_age_years,headquaters_part1,headquaters_part2
0,sagara-lakmal-3b5634103,University of Colombo School of Computing,2021,2023,5,2002,51-200 employees,"Colombo 7, Western",24,0,1,21,Sri Lanka,Kenya
1,sagara-lakmal-3b5634103,University of Kelaniya Sri Lanka,2013,2018,4,1875,"1,001-5,000 employees","Kelaniya, Western",120,60,4,148,Sri Lanka,Kenya
2,sagara-lakmal-3b5634103,CIMA,2012,2013,1,1919,201-500 employees,"One South Place, London",132,120,2,104,unidentified,United Kingdom
3,sagara-lakmal-3b5634103,Ananda College - Colombo 10,2003,2011,2,1886,51-200 employees,"Colombo 10, Western province",240,144,1,137,Sri Lanka,unidentified
4,harsha-karunanayake-875636a8,University of Kelaniya Sri Lanka,2013,2017,4,1875,"1,001-5,000 employees","Kelaniya, Western",120,72,4,148,Sri Lanka,Kenya
5,harsha-karunanayake-875636a8,Dharmaraja College - Kandy,2008,2012,2,1987,201-500 employees,"Kandy, Central",180,132,2,36,Sri Lanka,Zambia


In [84]:
# Encoding sri_lankan
def is_in_sri_lanka(row):
    if 'Sri Lanka' in row['headquaters_part1'] or 'Sri Lanka' in row['headquaters_part2']:
        return 1
    else:
        return 0

new_df['headquaters'] = new_df.apply(is_in_sri_lanka, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['headquaters'] = new_df.apply(is_in_sri_lanka, axis=1)


In [93]:
new_df['duration'] = (new_df['end_time'].astype(int) - new_df['start_time'].astype(int)) * 12
new_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['duration'] = (new_df['end_time'].astype(int) - new_df['start_time'].astype(int)) * 12


Unnamed: 0,profile_name,organisation,start_time,end_time,course_level,founded,size,headquaters,start_recency_months,end_recency_months,school_size,school_age_years,headquaters_part1,headquaters_part2,duration
0,sagara-lakmal-3b5634103,University of Colombo School of Computing,2021,2023,5,2002,51-200 employees,1,24,0,1,21,Sri Lanka,Kenya,24
1,sagara-lakmal-3b5634103,University of Kelaniya Sri Lanka,2013,2018,4,1875,"1,001-5,000 employees",1,120,60,4,148,Sri Lanka,Kenya,60
2,sagara-lakmal-3b5634103,CIMA,2012,2013,1,1919,201-500 employees,0,132,120,2,104,unidentified,United Kingdom,12
3,sagara-lakmal-3b5634103,Ananda College - Colombo 10,2003,2011,2,1886,51-200 employees,1,240,144,1,137,Sri Lanka,unidentified,96
4,harsha-karunanayake-875636a8,University of Kelaniya Sri Lanka,2013,2017,4,1875,"1,001-5,000 employees",1,120,72,4,148,Sri Lanka,Kenya,48
5,harsha-karunanayake-875636a8,Dharmaraja College - Kandy,2008,2012,2,1987,201-500 employees,1,180,132,2,36,Sri Lanka,Zambia,48


In [95]:
# Dropping the original columns that have been encoded
coulumn_selected_df = new_df.drop(['organisation', 'start_time', 'end_time', 'founded', 'size', 'headquaters_part1', 'headquaters_part2'], axis=1)
coulumn_selected_df

Unnamed: 0,profile_name,course_level,headquaters,start_recency_months,end_recency_months,school_size,school_age_years,duration
0,sagara-lakmal-3b5634103,5,1,24,0,1,21,24
1,sagara-lakmal-3b5634103,4,1,120,60,4,148,60
2,sagara-lakmal-3b5634103,1,0,132,120,2,104,12
3,sagara-lakmal-3b5634103,2,1,240,144,1,137,96
4,harsha-karunanayake-875636a8,4,1,120,72,4,148,48
5,harsha-karunanayake-875636a8,2,1,180,132,2,36,48


In [96]:
desired_order = ['profile_name', 'course_level', 'duration', 
       'start_recency_months', 'end_recency_months', 'school_size', 'school_age_years', 'headquaters']

# Reorder the columns in the DataFrame
final_df = coulumn_selected_df[desired_order]
final_df

Unnamed: 0,profile_name,course_level,duration,start_recency_months,end_recency_months,school_size,school_age_years,headquaters
0,sagara-lakmal-3b5634103,5,24,24,0,1,21,1
1,sagara-lakmal-3b5634103,4,60,120,60,4,148,1
2,sagara-lakmal-3b5634103,1,12,132,120,2,104,0
3,sagara-lakmal-3b5634103,2,96,240,144,1,137,1
4,harsha-karunanayake-875636a8,4,48,120,72,4,148,1
5,harsha-karunanayake-875636a8,2,48,180,132,2,36,1


In [97]:
final_df.to_csv('final_school_mapping.csv', index=False)