In [147]:
import pandas as pd
import numpy as np
import os
import sys
from pathlib import Path
import glob
import string
import unicodedata
import re


In [148]:
os.path.dirname(sys.executable)

'c:\\Users\\veena\\anaconda3'

### Reading in the datasets

In [149]:

emp_count = pd.read_csv("D:\\Veena\\SJSU-Classes\\Sem1\\DatabaseSystemsforAnalytics-225-24\\Lab1\\Input files\\employee_counts.csv")
comp_spec = pd.read_csv("D:\\Veena\\SJSU-Classes\\Sem1\\DatabaseSystemsforAnalytics-225-24\\Lab1\\Input files\\company_specialities.csv")
comp_ind = pd.read_csv("D:\\Veena\\SJSU-Classes\\Sem1\\DatabaseSystemsforAnalytics-225-24\\Lab1\\Input files\\company_industries.csv")
comp = pd.read_csv("D:\\Veena\\SJSU-Classes\\Sem1\\DatabaseSystemsforAnalytics-225-24\\Lab1\\Input files\\companies.csv")
job_skill = pd.read_csv("D:\\Veena\\SJSU-Classes\\Sem1\\DatabaseSystemsforAnalytics-225-24\\Lab1\\Input files\\job_skills.csv")
job_ind = pd.read_csv("D:\\Veena\\SJSU-Classes\\Sem1\\DatabaseSystemsforAnalytics-225-24\\Lab1\\Input files\\job_industries.csv")
benefits = pd.read_csv("D:\\Veena\\SJSU-Classes\\Sem1\\DatabaseSystemsforAnalytics-225-24\\Lab1\\Input files\\benefits.csv")
job_post = pd.read_csv("D:\\Veena\\SJSU-Classes\\Sem1\\DatabaseSystemsforAnalytics-225-24\\Lab1\\Input files\\job_postings.csv")


### Function for cleaning the description column by removing non-ASCII values, new line characters and custom regex to remove non alphanumeric ASCII values

In [150]:
def clean_non_ascii_new_line_string (string): 
    normalized_str = unicodedata.normalize('NFKD', string)
    encoded_str = normalized_str.encode('ascii', 'ignore')
    decoded_str = encoded_str.decode('ascii')
    decoded_str = decoded_str.replace('\\n', ' ')
    decoded_str = re.sub(r'[^\w\s]', '', decoded_str)
    return decoded_str

### Function for Null value imputation for entire dataframe

In [151]:
def fill_null_values_df(df):
    print("Count of Null Values in DataFrame:")
    print(df.isnull().sum())
    
    for column_name, column_dtype in df.dtypes.iteritems():
        print(f"Column '{column_name}' has data type: {column_dtype}")
        
    for column in df.columns:
        if df[column].isnull().any():
            print(f"Column: {column}")
            if pd.api.types.is_numeric_dtype(df[column]):
                print(f"Mean: {df[column].mean()}")
                print(f"Median: {df[column].median()}")
                print(f"Mode: {df[column].mode().iloc[0]}")
                choice = input(f"Fill null values in {column} with mean, median, mode, or not available? (Mean/Median/Mode/Not Available): ").strip().lower()
                if choice == 'm' or choice == 'mean':
                    df[column].fillna(df[column].mean(), inplace=True)
                elif choice == 'med' or choice == 'median':
                    df[column].fillna(df[column].median(), inplace=True)
                elif choice == 'mode':
                    mode_val = df[column].mode().iloc[0]
                    df[column].fillna(mode_val, inplace=True)
                else:
                    print(f"Filling {column} with 'Not Available'")
                    df[column].fillna("Not Available", inplace=True)
            else:
                print(f"Filling {column} with 'Not Available'")
                df[column].fillna("Not Available", inplace=True)
    return df


### Function for Null & NaN value imputation for a specific dataframe column with a specific imputation choice

In [152]:
def fill_null_values_column(df, column_name, imputation_type):
    try: 
        if imputation_type == 'm' or imputation_type == 'mean':
            df[column_name].fillna(df[column_name].mean(), inplace=True)
            df.replace(np.nan, '', regex=True)
        elif imputation_type == 'med' or imputation_type == 'median':
            df[column_name].fillna(df[column_name].median(), inplace=True)
        elif imputation_type == 'mode':
            mode_val = df[column_name].mode().iloc[0]
            df[column_name].fillna(mode_val, inplace=True)
        elif imputation_type == 'na':
            print(f"Filling {column_name} with 'Not Available'")
            df[column_name].fillna("Not Available", inplace=True)
    except Exception:
        print('Wrong input')
    return df

### Function for column duplication counts

In [153]:
def count_duplicates_per_column(df):
    duplicate_counts = df.apply(lambda col: col.duplicated().sum())
    return duplicate_counts


### Function to count number of zeroes in a column

In [154]:
def count_zeros_in_columns(dataframe):
    zero_counts = dataframe.eq(0).sum()
    for column, count in zero_counts.items():
        print(f"{column} has {count} zero values")

### Function to change column to string

In [155]:
def convert_object_columns_to_string(dataframe):
    # Iterate through the columns of the DataFrame
    for column in dataframe.columns:
        if dataframe[column].dtype == "object":
            # Convert the "object" column to string
            dataframe[column] = dataframe[column].astype(str)
    
    return dataframe

# Companies dataset exploration

In [156]:

comp.head()

Unnamed: 0,company_id,name,description,company_size,state,country,city,zip_code,address,url
0,1009,IBM,"At IBM, we do more than work. We create. We cr...",7.0,NY,US,"Armonk, New York",10504,International Business Machines Corp.,https://www.linkedin.com/company/ibm
1,1016,GE HealthCare,Every day millions of people feel the impact o...,7.0,0,US,Chicago,0,-,https://www.linkedin.com/company/gehealthcare
2,1021,GE Power,"GE Power, part of GE Vernova, is a world energ...",7.0,NY,US,Schenectady,12345,1 River Road,https://www.linkedin.com/company/gepower
3,1025,Hewlett Packard Enterprise,Official LinkedIn of Hewlett Packard Enterpris...,7.0,Texas,US,Houston,77389,1701 E Mossy Oaks Rd Spring,https://www.linkedin.com/company/hewlett-packa...
4,1028,Oracle,We’re a cloud technology company that provides...,7.0,Texas,US,Austin,78741,2300 Oracle Way,https://www.linkedin.com/company/oracle


In [157]:
comp.dtypes

company_id        int64
name             object
description      object
company_size    float64
state            object
country          object
city             object
zip_code         object
address          object
url              object
dtype: object

#### Manual normalization to get the state to 1NF (first normal form)

In [158]:
comp['state'] = comp['state'].replace('sd', 'south dakota')
comp['state'] = comp['state'].replace(['kansas','ks'], 'kansas')
comp['state'] = comp['state'].replace(['ne','nebraska'], 'nebraska')
comp['state'] = comp['state'].replace(['iowa','ia'], 'iowa')
comp['state'] = comp['state'].replace(['new south wales','nsw'], 'new-south-wales')
comp['state'] = comp['state'].replace(['florence','fi'], 'florence')
comp['state'] = comp['state'].replace(['alabama','al'], 'alabama')
comp['state'] = comp['state'].replace(['basel','bs','basel-country'], 'basel')
comp['state'] = comp['state'].replace('nuevo león', 'nuevo leon')
comp['state'] = comp['state'].replace(['new hampshire','hampshire'], 'hampshire')
comp['state'] = comp['state'].replace(['louisiana','la'], 'louisiana')
comp['state'] = comp['state'].replace(['tamilnadu','tamil nadu'], 'tamilnadu')
comp['state'] = comp['state'].replace(['zug','zg'], 'zug')
comp['state'] = comp['state'].replace(['hertfordshire','herts'], 'hertfordshire')
comp['state'] = comp['state'].replace(['sussex','east sussex'], 'sussex')               
comp['state'] = comp['state'].replace(['Alaska','ak'], 'alaska')
comp['state'] = comp['state'].replace(['vermont','vt'], 'vermont')                     
comp['state'] = comp['state'].replace(['hi','hawaii'], 'hawaii')
comp['state'] = comp['state'].replace(['bc','barcelona'], 'barcelona')
comp['state'] = comp['state'].replace(['ohio','oh'], 'ohio')
comp['state'] = comp['state'].replace(['north rhine-westphalia','north-rhine-westphalia','nordrhein-westfalen','nrw'], 'north-rhine-westphalia')
comp['state'] = comp['state'].replace(['or','oregon'], 'oregon')
comp['state'] = comp['state'].replace('Córdoba', 'Cordoba')
comp['state'] = comp['state'].replace(['il','Illinois','illinos'], 'illinois')
comp['state'] = comp['state'].replace(['île-de-france','ile-de-france','ile de france'], 'france')
comp['state'] = comp['state'].replace(['georgia','georgia (ga)','ga'], 'georgia')
comp['state'] = comp['state'].replace(['co','co.','colorado'], 'colorado')
comp['state'] = comp['state'].replace(['nc','north carolina'], 'north carolina')
comp['state'] = comp['state'].replace(['bw','baden-württemberg','colorado'], 'baden-wurttemberg')
comp['state'] = comp['state'].replace(['tx','texas'], 'texas')
comp['state'] = comp['state'].replace(['central london','greater london','london'], 'london')
comp['state'] = comp['state'].replace(['pa','pennsylvania'], 'pennsylvania')
comp['state'] = comp['state'].replace(['Massachusetts','ma'], 'Massachusetts')
comp['state'] = comp['state'].replace(['maharastra','maharashtra','mh'], 'Maharashtra')
comp['state'] = comp['state'].replace(['ontario','on'], 'ontario')
comp['state'] = comp['state'].replace(['virginia','va'], 'virginia')
comp['state'] = comp['state'].replace(['quebec','québec','qc'], 'quebec')
comp['state'] = comp['state'].replace(['michigan','mi'], 'michigan')
comp['state'] = comp['state'].replace(['mn','minnesota'], 'minnesota')
comp['state'] = comp['state'].replace(['ar','arkansas'], 'arkansas')
comp['state'] = comp['state'].replace(['ct','connecticut'], 'connecticut')
comp['state'] = comp['state'].replace(['cambs','cambridgeshire'], 'cambridgeshire')
comp['state'] = comp['state'].replace(['indiana','in'], 'indiana')
comp['state'] = comp['state'].replace(['up','uttar pradesh'], 'uttar pradesh')
comp['state'] = comp['state'].replace(['wisconsin','wi'], 'wisconsin')
comp['state'] = comp['state'].replace(['bc','british columbia'], 'british columbia')
comp['state'] = comp['state'].replace(['md','maryland'], 'maryland')
comp['state'] = comp['state'].replace(['arizona','az'], 'arizona')
comp['state'] = comp['state'].replace(['nh','noord-holland','zuid-holland','south holland','north holland'], 'holland')
comp['state'] = comp['state'].replace(['idaho','id'], 'idaho')
comp['state'] = comp['state'].replace(['washington','wa','district of columbia'], 'washington')
comp['state'] = comp['state'].replace(['fl','florida'], 'florida')
comp['state'] = comp['state'].replace(['ky','kentucky'], 'kentucky')
comp['state'] = comp['state'].replace(['ri','rhode island'], 'rhode island')
comp['state'] = comp['state'].replace(['mt','montana'], 'montana')
comp['state'] = comp['state'].replace(['fl','florida'], 'florida')
comp['state'] = comp['state'].replace(['tennessee','tn'], 'tennessee')
comp['state'] = comp['state'].replace(['mo','missouri'], 'missouri')
comp['state'] = comp['state'].replace(['zürich','zurich'], 'zurich')
comp['state'] = comp['state'].replace(['utah','ut'], 'utah')
comp['state'] = comp['state'].replace(['guangdong','Guangdong'], 'guangdong')
comp['state'] = comp['state'].replace(['me','maine'], 'maine')
comp['state'] = comp['state'].replace(['haryana','harayana'], 'haryana')
comp['state'] = comp['state'].replace(['skane','skane county'], 'skane')
comp['state'] = comp['state'].replace(['community of madrid','madrid'], 'madrid')
comp['state'] = comp['state'].replace(['mississippi','ms'], 'mississippi')
comp['state'] = comp['state'].replace(['saskatchewan','sk'], 'saskatchewan')
comp['state'] = comp['state'].replace(['wyoming','wy'], 'wyoming')
comp['state'] = comp['state'].replace(['united states','usa'], 'united states')
comp['state'] = comp['state'].replace(['anywhere','not available'], 'not available')
comp['state'] = comp['state'].replace(['geneva','ch','switzerland'], 'switzerland')
comp['state'] = comp['state'].replace(['new mexico','nm'], 'new-mexico')
comp['state'] = comp['state'].replace(['oklahoma','ok'], 'oklahoma')
comp['state'] = comp['state'].replace(['qld','queensland'], 'queensland')
comp['state'] = comp['state'].replace(['alberta','ab'], 'alberta')
comp['state'] = comp['state'].replace(['nevada','nv'], 'nevada')
comp['state'] = comp['state'].replace(['sc','south carolina'], 'south Carolina')
comp['state'] = comp['state'].replace(['new jersey', 'newjersey', 'nj', 'n.j.'], 'new jersey')
comp['state'] = comp['state'].replace(['ny', 'ny - new york', 'new york', 'newyork'], 'new york')
comp['state'] = comp['state'].replace(['dc', 'd.c.', 'washington d.c.', 'washington', 'wa'], 'washington')
comp['state'] = comp['state'].replace(['ca', 'ca.', 'calif.', 'ca - california', 'california'], 'california')
comp['state'] = comp['state'].replace('广东省', 'Guangdong')
comp['state'] = comp['state'].replace(['delaware', 'de', 'delaware (de)'], 'delaware')
comp['state'] = comp['state'].replace(['28801', '0', '01824', '94086'], 'Not Available')

### Cleaning all string columns to remove non-ASCII values

In [159]:
comp = convert_object_columns_to_string(comp)
comp['name'] = comp.apply(lambda row: clean_non_ascii_new_line_string(row['name']), axis=1)
comp['description'] = comp.apply(lambda row: clean_non_ascii_new_line_string(row['description']), axis=1)
comp['state'] = comp.apply(lambda row: clean_non_ascii_new_line_string(row['state']), axis=1)
comp['country'] = comp.apply(lambda row: clean_non_ascii_new_line_string(row['country']), axis=1)
comp['city'] = comp.apply(lambda row: clean_non_ascii_new_line_string(row['city']), axis=1)
comp['zip_code'] = comp.apply(lambda row: clean_non_ascii_new_line_string(row['zip_code']), axis=1)
comp['address'] = comp.apply(lambda row: clean_non_ascii_new_line_string(row['address']), axis=1)
#comp['url'] = comp.apply(lambda row: clean_non_ascii_new_line_string(row['url']), axis=1)


For all character columns, we will impute with 'Not Available' 

In [160]:
fill_null_values_column(comp, 'name', 'na') 
fill_null_values_column(comp, 'description', 'na')
fill_null_values_column(comp, 'state', 'na')
fill_null_values_column(comp, 'country', 'na')
fill_null_values_column(comp, 'city', 'na')
fill_null_values_column(comp, 'zip_code', 'na')
fill_null_values_column(comp, 'address', 'na')
fill_null_values_column(comp, 'url', 'na')

Filling name with 'Not Available'
Filling description with 'Not Available'
Filling state with 'Not Available'
Filling country with 'Not Available'
Filling city with 'Not Available'
Filling zip_code with 'Not Available'
Filling address with 'Not Available'
Filling url with 'Not Available'


Unnamed: 0,company_id,name,description,company_size,state,country,city,zip_code,address,url
0,1009,IBM,At IBM we do more than work We create We creat...,7.0,NY,US,Armonk New York,10504,International Business Machines Corp,https://www.linkedin.com/company/ibm
1,1016,GE HealthCare,Every day millions of people feel the impact o...,7.0,Not Available,US,Chicago,0,,https://www.linkedin.com/company/gehealthcare
2,1021,GE Power,GE Power part of GE Vernova is a world energy ...,7.0,NY,US,Schenectady,12345,1 River Road,https://www.linkedin.com/company/gepower
3,1025,Hewlett Packard Enterprise,Official LinkedIn of Hewlett Packard Enterpris...,7.0,Texas,US,Houston,77389,1701 E Mossy Oaks Rd Spring,https://www.linkedin.com/company/hewlett-packa...
4,1028,Oracle,Were a cloud technology company that provides ...,7.0,Texas,US,Austin,78741,2300 Oracle Way,https://www.linkedin.com/company/oracle
...,...,...,...,...,...,...,...,...,...,...
6058,3700144594,BYREDO,Beauty can be many things to many people Our j...,3.0,Not Available,0,0,0,0,https://www.linkedin.com/company/byredo
6059,3700144710,Pros2Plan a division of Spinnaker SCA,Pros2Plan a division of Spinnaker Services LL...,2.0,CO,US,Boulder,80303,0,https://www.linkedin.com/company/pros2plan
6060,3700147810,Ascendo Resources,Ascendo Resources is a certified minority owne...,3.0,FL,US,Coral Gables,33134,2 Alhambra Plaza,https://www.linkedin.com/company/ascendoresources
6061,3700150295,The Crox Group,The Crox Group Head Quartered in Chicago with ...,2.0,illinois,US,Lincolnwood,60712,6818 N Lincoln Ave,https://www.linkedin.com/company/the-crox-group


In [161]:
count_zeros_in_columns(comp)

company_id has 0 zero values
name has 0 zero values
description has 0 zero values
company_size has 0 zero values
state has 0 zero values
country has 0 zero values
city has 0 zero values
zip_code has 0 zero values
address has 0 zero values
url has 0 zero values


### Duplicates check

In [162]:
comp.shape

(6063, 10)

### Removing duplicates

In [163]:
comp = comp.drop_duplicates()

In [164]:
count_duplicates_per_column(comp)

company_id         0
name              37
description       97
company_size    6055
state           5742
country         6013
city            4402
zip_code        3160
address         1007
url               33
dtype: int64

#### Since we dont have any duplicates in company_id, we can use it as a primary key

# empolyee count dataset exploration

In [165]:
emp_count.head()

Unnamed: 0,company_id,employee_count,follower_count,time_recorded
0,81149246,6,91,1692645000.0
1,10033339,3,187,1692645000.0
2,6049228,20,82,1692645000.0
3,2641066,45,2336,1692645000.0
4,96649998,0,2,1692645000.0


In [166]:
count_zeros_in_columns(emp_count)

company_id has 0 zero values
employee_count has 55 zero values
follower_count has 10 zero values
time_recorded has 0 zero values


There are few values with 0 employee count and/or 0 followers; this is possible since this is LinkedIn data; we will keep as is

### Duplicates check 

In [167]:
count_duplicates_per_column(emp_count)

company_id         9877
employee_count    13268
follower_count     7820
time_recorded     14822
dtype: int64

Removing duplicates

In [168]:
emp_count=emp_count.drop_duplicates()

In [169]:
emp_count.shape

(12551, 4)

In [170]:
count_duplicates_per_column(emp_count)

company_id         6521
employee_count     9912
follower_count     4464
time_recorded     11466
dtype: int64

#### Duplicates can occur since data is recorded at different times and hence duplication of company_id is expected

#### Converting time_recorded to date-time stamp

In [171]:
emp_count['time_recorded_ts'] = pd.to_datetime(emp_count['time_recorded'], unit='ms')

# company_specialities dataset exploration

In [172]:
comp_spec.head()

Unnamed: 0,company_id,speciality
0,81149246,Childrens Music Education
1,81149246,Foundational Music Theory
2,81149246,Child Music Lessons
3,81149246,social emotional learning
4,81149246,social emotional development


In [173]:
comp_spec.shape

(128355, 2)

#### Removing duplicates

In [174]:
comp_spec = comp_spec.drop_duplicates()

In [175]:
comp_spec.shape

(42605, 2)

In [176]:
count_zeros_in_columns(comp_spec)

company_id has 0 zero values
speciality has 0 zero values


#### Cleaning non-ASCII values from speciality column

In [177]:
comp_spec = convert_object_columns_to_string(comp_spec)
comp_spec['speciality'] = comp_spec.apply(lambda row: clean_non_ascii_new_line_string(row['speciality']), axis=1)
print(comp_spec['speciality'].head(20))

0        Childrens Music Education
1        Foundational Music Theory
2              Child Music Lessons
3        social emotional learning
4     social emotional development
5                        education
6             formative assessment
7                expanded learning
8              enrichment programs
9                       SharePoint
10                      Office 365
11                      Consulting
12              Project management
13                        InfoPath
14          Government contracting
15                       PowerApps
16                  Microsoft Flow
17                           Agile
18                          DevOps
19                 Cloud computing
Name: speciality, dtype: object


### Duplicates can occur since occur since 1 company can have multiple specialities

In [178]:
count_duplicates_per_column(comp_spec)

company_id    38118
speciality    17882
dtype: int64

# company_industries dataset exploration

In [179]:
comp_ind.head()

Unnamed: 0,company_id,industry
0,81149246,Higher Education
1,10033339,Information Technology & Services
2,6049228,Accounting
3,2641066,Electrical & Electronic Manufacturing
4,96649998,Marketing & Advertising


In [180]:
comp_ind = comp_ind.drop_duplicates()

In [181]:
comp_ind.shape

(6003, 2)

In [182]:
count_zeros_in_columns(comp_ind)

company_id has 0 zero values
industry has 0 zero values


### Duplicates check

In [183]:
count_duplicates_per_column(comp_ind)

company_id       0
industry      5862
dtype: int64

#### Cleaning non-ASCII values from industry column

In [184]:
comp_ind = convert_object_columns_to_string(comp_ind)
comp_ind['industry'] = comp_ind.apply(lambda row: clean_non_ascii_new_line_string(row['industry']), axis=1)
print(comp_ind['industry'].head(20))

0                         Higher Education
1         Information Technology  Services
2                               Accounting
3     Electrical  Electronic Manufacturing
4                   Marketing  Advertising
5                    Hospital  Health Care
6         Information Technology  Services
7                  Logistics  Supply Chain
8                         Medical Practice
9                       Mental Health Care
10        Information Technology  Services
11                  Architecture  Planning
12       Recreational Facilities  Services
13                                Internet
14                    Staffing  Recruiting
15                                Research
16                       Civil Engineering
17                   Management Consulting
18                   Management Consulting
19                      Mental Health Care
Name: industry, dtype: object


# job skills

In [185]:
job_skill.head()

Unnamed: 0,job_id,skill_abr
0,3690843087,ACCT
1,3690843087,FIN
2,3691763971,MGMT
3,3691763971,MNFC
4,3691775263,MGMT


In [186]:
count_zeros_in_columns(job_skill)

job_id has 0 zero values
skill_abr has 0 zero values


### Duplicate counts check

In [187]:
count_duplicates_per_column(job_skill)

job_id       12262
skill_abr    27864
dtype: int64

In [188]:
job_skill = job_skill.drop_duplicates()

#### Cleaning non-ASCII values from abbreviation column

In [189]:
job_skill = convert_object_columns_to_string(job_skill)
job_skill['skill_abr'] = job_skill.apply(lambda row: clean_non_ascii_new_line_string(row['skill_abr']), axis=1)
print(job_skill['skill_abr'].head(20))

0     ACCT
1      FIN
2     MGMT
3     MNFC
4     MGMT
5     MNFC
6     HCPR
7     MGMT
8     MNFC
9     HCPR
10     ENG
11      IT
12     ADM
13     ADM
14    HCPR
15    SALE
16      IT
17    DSGN
18     ART
19      IT
Name: skill_abr, dtype: object


# job_industries

In [190]:
job_ind.head()

Unnamed: 0,job_id,industry_id
0,3378133231,68
1,3497509795,96
2,3690843087,47
3,3691775263,112
4,3691779379,80


In [191]:
count_zeros_in_columns(job_ind)

job_id has 0 zero values
industry_id has 0 zero values


### Duplicates check

In [192]:
job_ind = job_ind.drop_duplicates()

In [193]:
count_duplicates_per_column(job_ind)

job_id          6033
industry_id    21791
dtype: int64

In [194]:
job_ind.shape

(21993, 2)

# benefits

In [195]:
benefits.head()

Unnamed: 0,job_id,inferred,type
0,3690843087,0,Medical insurance
1,3690843087,0,Dental insurance
2,3690843087,0,401(k)
3,3690843087,0,Paid maternity leave
4,3690843087,0,Disability insurance


In [196]:
count_zeros_in_columns(benefits)

job_id has 0 zero values
inferred has 7151 zero values
type has 0 zero values


### inferred column can be zero - no change required

### Duplicates check

In [197]:
benefits = benefits.drop_duplicates()

In [198]:
count_duplicates_per_column(benefits)

job_id       8265
inferred    13759
type        13749
dtype: int64

In [199]:
benefits.shape

(13761, 3)

#### Cleaning non-ASCII values from type column

In [200]:
benefits = convert_object_columns_to_string(benefits)
benefits['type'] = benefits.apply(lambda row: clean_non_ascii_new_line_string(row['type']), axis=1)
print(benefits['type'].head(20))

0        Medical insurance
1         Dental insurance
2                     401k
3     Paid maternity leave
4     Disability insurance
5         Vision insurance
6         Dental insurance
7     Disability insurance
8                     401k
9        Medical insurance
10        Vision insurance
11        Dental insurance
12                    401k
13      Tuition assistance
14    Disability insurance
15       Medical insurance
16                    401k
17            Pension plan
18        Dental insurance
19       Medical insurance
Name: type, dtype: object


# job posting dataset exploration

In [201]:
job_post.head()

Unnamed: 0,job_id,company_id,title,description,max_salary,med_salary,min_salary,pay_period,formatted_work_type,location,...,expiry,closed_time,formatted_experience_level,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type
0,85008768,,Licensed Insurance Agent,While many industries were hurt by the last fe...,52000.0,,45760.0,YEARLY,Full-time,"Chico, CA",...,1710000000000.0,,,,1690000000000.0,,1,FULL_TIME,USD,BASE_SALARY
1,133114754,77766802.0,Sales Manager,Are you a dynamic and creative marketing profe...,,,,,Full-time,"Santa Clarita, CA",...,1700000000000.0,,,,1690000000000.0,,0,FULL_TIME,,
2,133196985,1089558.0,Model Risk Auditor,Join Us as a Model Risk Auditor – Showcase You...,,,,,Contract,"New York, NY",...,1700000000000.0,,,,1690000000000.0,,0,CONTRACT,,
3,381055942,96654609.0,Business Manager,Business ManagerFirst Baptist Church ForneyFor...,,,,,Full-time,"Forney, TX",...,1700000000000.0,,,,1690000000000.0,,0,FULL_TIME,,
4,529257371,1244539.0,NY Studio Assistant,YOU COULD BE ONE OF THE MAGIC MAKERS\nKen Fulk...,,,,,Full-time,"New York, NY",...,1710000000000.0,,,,1690000000000.0,,1,FULL_TIME,,


In [202]:
job_post.shape

(15886, 27)

Null or blank company_id can't be used, hence dropping these values

In [203]:
job_post.dropna(subset=['company_id'], inplace=True)

In [204]:
job_post.shape

(15520, 27)

We're losing 366 values due to removing NA - this is negligible and is an acceptable level of missing values being removed

In [205]:
job_post.describe()

Unnamed: 0,job_id,company_id,max_salary,med_salary,min_salary,applies,original_listed_time,remote_allowed,views,expiry,closed_time,listed_time,sponsored
count,15520.0,15520.0,5404.0,968.0,5404.0,8504.0,15520.0,2273.0,12819.0,15520.0,915.0,15520.0,15520.0
mean,3692053000.0,10841000.0,88758.07,41265.693957,62584.169637,23.040687,1690000000000.0,1.0,77.499337,1700672000000.0,1690000000000.0,1690000000000.0,0.290077
std,92697280.0,23136880.0,90512.85,94155.023951,59171.266876,55.412227,0.0,0.0,168.773428,2559816000.0,0.0,0.0,0.453812
min,133114800.0,1009.0,10.0,10.0,10.0,1.0,1690000000000.0,1.0,1.0,1690000000000.0,1690000000000.0,1690000000000.0,0.0
25%,3693071000.0,13011.0,48.7275,18.0,38.0,2.0,1690000000000.0,1.0,6.0,1700000000000.0,1690000000000.0,1690000000000.0,0.0
50%,3697359000.0,277768.5,83124.5,25.84,60300.0,6.0,1690000000000.0,1.0,26.0,1700000000000.0,1690000000000.0,1690000000000.0,0.0
75%,3699413000.0,7798499.0,140000.0,52000.0,100000.0,21.0,1690000000000.0,1.0,79.0,1700000000000.0,1690000000000.0,1690000000000.0,1.0
max,3701374000.0,98562220.0,1300000.0,998426.0,800000.0,1615.0,1690000000000.0,1.0,5656.0,1710000000000.0,1690000000000.0,1690000000000.0,1.0


In [206]:
for col in job_post.columns:
    print(col)

job_post.dtypes

job_id
company_id
title
description
max_salary
med_salary
min_salary
pay_period
formatted_work_type
location
applies
original_listed_time
remote_allowed
views
job_posting_url
application_url
application_type
expiry
closed_time
formatted_experience_level
skills_desc
listed_time
posting_domain
sponsored
work_type
currency
compensation_type


job_id                          int64
company_id                    float64
title                          object
description                    object
max_salary                    float64
med_salary                    float64
min_salary                    float64
pay_period                     object
formatted_work_type            object
location                       object
applies                       float64
original_listed_time          float64
remote_allowed                float64
views                         float64
job_posting_url                object
application_url                object
application_type               object
expiry                        float64
closed_time                   float64
formatted_experience_level     object
skills_desc                    object
listed_time                   float64
posting_domain                 object
sponsored                       int64
work_type                      object
currency                       object
compensation

### Changing non-ASCII values to normal for all descriptive columns

In [207]:
job_post = convert_object_columns_to_string(job_post)
job_post['title'] = job_post.apply(lambda row: clean_non_ascii_new_line_string(row['title']), axis=1)
job_post['description'] = job_post.apply(lambda row: clean_non_ascii_new_line_string(row['description']), axis=1)
job_post['formatted_work_type'] = job_post.apply(lambda row: clean_non_ascii_new_line_string(row['formatted_work_type']), axis=1)
job_post['location'] = job_post.apply(lambda row: clean_non_ascii_new_line_string(row['location']), axis=1)
job_post['skills_desc'] = job_post.apply(lambda row: clean_non_ascii_new_line_string(row['skills_desc']), axis=1)
job_post['formatted_experience_level'] = job_post.apply(lambda row: clean_non_ascii_new_line_string(row['formatted_experience_level']), axis=1)
job_post['posting_domain'] = job_post.apply(lambda row: clean_non_ascii_new_line_string(row['posting_domain']), axis=1)
job_post['work_type'] = job_post.apply(lambda row: clean_non_ascii_new_line_string(row['work_type']), axis=1)
job_post['currency'] = job_post.apply(lambda row: clean_non_ascii_new_line_string(row['currency']), axis=1)
job_post['compensation_type'] = job_post.apply(lambda row: clean_non_ascii_new_line_string(row['compensation_type']), axis=1)
#job_post['job_posting_url'] = job_post.apply(lambda row: clean_non_ascii_new_line_string(row['job_posting_url']), axis=1)
#job_post['application_url'] = job_post.apply(lambda row: clean_non_ascii_new_line_string(row['application_url']), axis=1)
job_post['application_type'] = job_post.apply(lambda row: clean_non_ascii_new_line_string(row['application_type']), axis=1)

### Since different columns have different nuances, we will need to carefully choose how to impute

Salary columns (min, median, max) - will not impute since not every company's job posting is obligated to have salary ranges

For all character columns, we will impute with 'Not Available' 

In [208]:
fill_null_values_column(job_post, 'title', 'na') 
fill_null_values_column(job_post, 'description', 'na')
fill_null_values_column(job_post, 'formatted_work_type', 'na')
fill_null_values_column(job_post, 'location', 'na')
fill_null_values_column(job_post, 'skills_desc', 'na')
fill_null_values_column(job_post, 'formatted_experience_level', 'na')
fill_null_values_column(job_post, 'posting_domain', 'na')
fill_null_values_column(job_post, 'work_type', 'na')
fill_null_values_column(job_post, 'currency', 'na')
fill_null_values_column(job_post, 'compensation_type', 'na')
fill_null_values_column(job_post, 'job_posting_url', 'na')
fill_null_values_column(job_post, 'application_url', 'na')
fill_null_values_column(job_post, 'application_type', 'na')

Filling title with 'Not Available'
Filling description with 'Not Available'
Filling formatted_work_type with 'Not Available'
Filling location with 'Not Available'
Filling skills_desc with 'Not Available'
Filling formatted_experience_level with 'Not Available'
Filling posting_domain with 'Not Available'
Filling work_type with 'Not Available'
Filling currency with 'Not Available'
Filling compensation_type with 'Not Available'
Filling job_posting_url with 'Not Available'
Filling application_url with 'Not Available'
Filling application_type with 'Not Available'


Unnamed: 0,job_id,company_id,title,description,max_salary,med_salary,min_salary,pay_period,formatted_work_type,location,...,expiry,closed_time,formatted_experience_level,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type
1,133114754,77766802.0,Sales Manager,Are you a dynamic and creative marketing profe...,,,,,Fulltime,Santa Clarita CA,...,1.700000e+12,,,,1.690000e+12,,0,FULL_TIME,,
2,133196985,1089558.0,Model Risk Auditor,Join Us as a Model Risk Auditor Showcase Your...,,,,,Contract,New York NY,...,1.700000e+12,,,,1.690000e+12,,0,CONTRACT,,
3,381055942,96654609.0,Business Manager,Business ManagerFirst Baptist Church ForneyFor...,,,,,Fulltime,Forney TX,...,1.700000e+12,,,,1.690000e+12,,0,FULL_TIME,,
4,529257371,1244539.0,NY Studio Assistant,YOU COULD BE ONE OF THE MAGIC MAKERS\nKen Fulk...,,,,,Fulltime,New York NY,...,1.710000e+12,,,,1.690000e+12,,1,FULL_TIME,,
5,903408693,3894635.0,Office Associate,Provide clerical and administrative support to...,42000.0,,37000.0,YEARLY,Fulltime,Albany GA,...,1.710000e+12,,,,1.690000e+12,,1,FULL_TIME,USD,BASE_SALARY
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15881,3701373516,74718032.0,Sanitation Technician,Location\n\nWest Columbia SC US 29172\n\n29172...,,,,,Parttime,West Columbia SC,...,1.700000e+12,,Entry level,,1.690000e+12,aspirebakeriescareerscom,0,PART_TIME,,
15882,3701373522,38897.0,Unit Secretary,Job Title Unit Secretary\nDepartment Nursing\n...,,,,,Fulltime,Teaneck NJ,...,1.700000e+12,,Entry level,,1.690000e+12,recruitingultiprocom,0,FULL_TIME,,
15883,3701373523,38897.0,Radiology Aide Perdiem,Job Title Radiology Aide Perdiem\nDepartment C...,,,,,Parttime,Teaneck NJ,...,1.700000e+12,,Entry level,,1.690000e+12,recruitingultiprocom,0,PART_TIME,,
15884,3701373524,2623.0,MRI Manager,Grade 105\nJob Type Officer of AdministrationB...,135000.0,,110000.0,YEARLY,Fulltime,New York NY,...,1.700000e+12,,MidSenior level,,1.690000e+12,opportunitiescolumbiaedu,0,FULL_TIME,USD,BASE_SALARY


#### Converting float64 timestamp format to date-time stamp

In [209]:
job_post['listed_time_ts'] = pd.to_datetime(job_post['listed_time'], unit='ms')
job_post['expiry_ts'] = pd.to_datetime(job_post['expiry'], unit='ms')
job_post['closed_time_ts'] = pd.to_datetime(job_post['closed_time'], unit='ms')

#### Cleaning non-ASCII values from abbreviation column

In [210]:
count_zeros_in_columns(job_post)

job_id has 0 zero values
company_id has 0 zero values
title has 0 zero values
description has 0 zero values
max_salary has 0 zero values
med_salary has 0 zero values
min_salary has 0 zero values
pay_period has 0 zero values
formatted_work_type has 0 zero values
location has 0 zero values
applies has 0 zero values
original_listed_time has 0 zero values
remote_allowed has 0 zero values
views has 0 zero values
job_posting_url has 0 zero values
application_url has 0 zero values
application_type has 0 zero values
expiry has 0 zero values
closed_time has 0 zero values
formatted_experience_level has 0 zero values
skills_desc has 0 zero values
listed_time has 0 zero values
posting_domain has 0 zero values
sponsored has 11018 zero values
work_type has 0 zero values
currency has 0 zero values
compensation_type has 0 zero values
listed_time_ts has 0 zero values
expiry_ts has 0 zero values
closed_time_ts has 0 zero values


### Duplicate

In [211]:
job_post = job_post.drop_duplicates()

In [212]:
count_duplicates_per_column(job_post)

job_id                            0
company_id                     9490
title                          4792
description                    2032
max_salary                    14209
med_salary                    15206
min_salary                    14292
pay_period                    15516
formatted_work_type           15513
location                      12562
applies                       15235
original_listed_time          15519
remote_allowed                15518
views                         14808
job_posting_url                   0
application_url                6634
application_type              15517
expiry                        15517
closed_time                   15518
formatted_experience_level    15513
skills_desc                   15378
listed_time                   15519
posting_domain                14223
sponsored                     15518
work_type                     15513
currency                      15518
compensation_type             15518
listed_time_ts              

job_id doesn't have any duplicates - this will serve as our primary key

# Download the dataset

In [214]:
export_csv_path = 'D:\\Veena\\SJSU-Classes\\Sem1\\DatabaseSystemsforAnalytics-225-24\\Lab1\\Output_files\\'

emp_count.to_csv(export_csv_path+'employee_counts_clean.csv')
comp_spec.to_csv(export_csv_path+'company_specialities_clean.csv')
comp_ind.to_csv(export_csv_path+'company_industries_clean.csv')
comp.to_csv(export_csv_path+'companies_clean.csv')
job_skill.to_csv(export_csv_path+'job_skills_clean.csv')
job_ind.to_csv(export_csv_path+'job_industries_clean.csv')
benefits.to_csv(export_csv_path+'benefits_clean.csv')
job_post.to_csv(export_csv_path+'job_postings_clean.csv')