In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Suppress all warnings for the remainder of the session
warnings.filterwarnings(action="ignore")

# Allow up to 100 characters per cell before pandas truncates the display
pd.set_option('max_colwidth', 100)

# Load the CSV into a DataFrame and preview the first rows
csv_path = 'Table_622.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Preprocess the 'Exp' (Experience) column: strip the " Years" suffix and
# replace every "low-high" range with the midpoint of its two bounds.
# The column is split once and both bounds are read from that single result.
exp_parts = df['Exp'].str.replace(' Years', '', regex=False).str.split('-')
exp_low = exp_parts.str[0].astype('int64')
exp_high = exp_parts.str[1].astype('int64')
df['Exp'] = (exp_low + exp_high) / 2

# Bin the midpoints into six right-closed intervals:
# (-1, 0], (0, 3], (3, 5], (5, 8], (8, 12], (12, 21]
EXP_BIN_EDGES = [-1, 0, 3, 5, 8, 12, 21]
exp_intervals = pd.cut(df['Exp'], bins=EXP_BIN_EDGES)

# Encode each interval by its POSITION in EXP_BIN_EDGES (0..5).
# A label encoder fitted on the data numbers only the intervals that actually
# occur, so an empty bin would shift every later code and the fixed 0-5 lookup
# in Experience_level would attach the wrong level name. Categorical codes are
# tied to the bin definition instead of the data; a midpoint that falls outside
# all bins is encoded as -1.
df['Exp_range'] = exp_intervals.cat.codes.astype('int64')
df['Exp_range'].value_counts()

In [None]:
# Define a function to categorize experience levels and apply it to create a new column
def Experience_level(val):
    """Translate an encoded experience bucket into its level name.

    The codes 0 through 5 correspond, in order, to 'Fresher',
    'Junior Level', 'Intermediate Level', 'Mid Senior Level', 'Senior'
    and 'Executive'. Any value that equals none of those codes yields None.
    """
    level_names = (
        'Fresher',
        'Junior Level',
        'Intermediate Level',
        'Mid Senior Level',
        'Senior',
        'Executive',
    )
    # Check the codes in ascending order; the first equal code wins
    for code, level_name in enumerate(level_names):
        if val == code:
            return level_name
    return None
# Label every row with the level name that matches its encoded experience bucket
df['Experience_level'] = df['Exp_range'].map(Experience_level)
df.head()

In [None]:
# Discard the 'Exp_range' and 'Exp' columns, which are no longer needed
obsolete_columns = ['Exp_range', 'Exp']
df.drop(columns=obsolete_columns, inplace=True)
df.head(1)

In [None]:
# Convert each 'employees_count' size band into one representative integer:
# the midpoint of a bounded band, or 1000 for the open-ended top band.
employee_count_by_band = {
    "0 - 10 employees": 5,
    "10 - 50 employees": 30,
    "50 - 200 employees": 125,
    "200 - 500 employees": 350,
    "500 - 1000 employees": 750,
    "More than 1000 employees": 1000,
}
numeric_counts = df['employees_count'].replace(employee_count_by_band)
df['employees_count'] = numeric_counts.astype('int64')
df.head(1)

In [None]:
# Clean the 'estab_year' column and derive the age of each company from it
REFERENCE_YEAR = 2023  # year against which company age is measured
SAMPLE_SEED = 42       # fixed so the preview below is identical on every run

# Remove every non-digit character, then cast what remains to integers
df['estab_year'] = df['estab_year'].replace('[^0-9]', '', regex=True).astype('int64')
df['AgeOfCompany'] = REFERENCE_YEAR - df['estab_year']

# Seeded sample: an unseeded one shows different rows after each kernel restart
df.sample(3, random_state=SAMPLE_SEED)

In [None]:
# Give the columns clearer names
column_renames = {
    # Columns that receive a new name
    'name': 'CompanyName',
    'estab_year': 'Company_estab_year',
    'location': 'Location',
    'job_ids': 'JobID',
    'skills': 'Skills',
    'hr': 'Hr_name',
    'Designation': 'Job_Designation',
    'url': 'Job_url',
    # Columns mapped to their existing name (listed so the mapping is complete)
    'employees_count': 'employees_count',
    'Involvement': 'Involvement',
    'Industry': 'Industry',
    'Experience_level': 'Experience_level',
    'AgeOfCompany': 'AgeOfCompany',
}
df.rename(columns=column_renames, inplace=True)
list(df.columns)

In [None]:
# Put the columns into their final order
column_order = [
    'JobID',
    'Job_Designation',
    'Involvement',
    'Experience_level',
    'Hr_name',
    'Skills',
    'CompanyName',
    'Company_estab_year',
    'AgeOfCompany',
    'Industry',
    'Location',
    'employees_count',
    'Job_url',
]
df = df[column_order]
df.head(1)
df.shape
df['employees_count'].value_counts()

In [None]:
# Visualize the distribution of company size and age, then remove extreme outliers
MAX_COMPANY_AGE = 50  # rows with AgeOfCompany above this are treated as extreme outliers


def _scatter_employees_vs_age(frame, title):
    """Draw employees_count against AgeOfCompany on a figure of its own."""
    fig, ax = plt.subplots()
    ax.scatter(frame['employees_count'], frame['AgeOfCompany'])
    ax.set(xlabel='Count of Employees', ylabel='Age of Company', title=title)
    plt.show()


# Scatter plot before filtering. It is shown on its own figure so the box plot
# that follows is not drawn on top of the same axes.
_scatter_employees_vs_age(df, 'Employees vs. company age (all rows)')

# One box plot per numeric column, each on a fresh figure
for column in ['employees_count', 'AgeOfCompany']:
    fig, ax = plt.subplots()
    sns.boxplot(x=df[column], ax=ax)
    ax.set_title(f'Distribution of {column}')
    plt.show()

# Remove extreme outliers from the 'AgeOfCompany' column
df = df[df['AgeOfCompany'] <= MAX_COMPANY_AGE]

# Scatter plot after filtering, for comparison with the first one
_scatter_employees_vs_age(df, f'Employees vs. company age (age <= {MAX_COMPANY_AGE})')
df.head(1)

In [None]:
# Write the cleaned, reordered DataFrame to disk without the index column
output_csv = 'Merged_Data.csv'
df.to_csv(output_csv, index=False)