## 6.5 Machine Learning Part 2
### This script contains the following:
#### 1. Importing libraries and data and renaming columns
#### 2. The elbow technique
#### 3. k-means clustering

### 1. Importing libraries and data and renaming columns

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import os
import sklearn
from sklearn.cluster import KMeans 
import pylab as pl 

In [2]:
%matplotlib inline

In [3]:
# Folder location string
# NOTE(review): absolute, machine-specific Windows path; no other cell visible
# here reads `path` (the CSV load uses a bare filename) — confirm whether it
# is still needed.
path = r'C:\Users\nurgul\Linkedin'

In [4]:
# Load the cleaned job-postings CSV; index_col=False keeps pandas from using
# the first column as the row index
df = pd.read_csv(
    filepath_or_buffer='jobs_posts_clean_onlyUS_v2.csv',
    index_col=False,
)

In [5]:
# Dimensions of the freshly loaded frame as (rows, columns)
df.shape

(6996, 19)

In [6]:
# Preview the first five rows to sanity-check the load
df.head()

Unnamed: 0,job_id,company_id,title,work_type,location,applies,views,experience_level,listed_time,approx_salary,name,company_size,state,country,zip_code,skill_abr,salary_category,applies_category,industry
0,3757935001,73013724,Sales Manager,Full-time,"Coeur d'Alene, ID",18.329151,66.426309,Mid-Senior level,2023-11-04 06:40:00,237500.0,J. Galt,3.0,Indiana,US,46268,SALE,high,medium,Financial Services
1,3757934319,18391435,Controls Engineer,Full-time,"Orlando, FL",1.0,5.0,Mid-Senior level,2023-11-04 06:40:00,135500.0,head-huntress.com,4.537178,CA,US,92025,ENG,high,low,Staffing and Recruiting
2,3757934318,6577380,CDL Class B Driver,Full-time,"Oakland, CA",15.568389,56.874644,Mid-Senior level,2023-11-04 06:40:00,26.0,Conexwest,2.0,California,US,94607,MGMT,low,medium,Retail Office Equipment
3,3757934267,28352,CDL A Delivery Driver,Full-time,"Swedesboro, NJ",15.568389,56.874644,Entry level,2023-11-04 06:40:00,95000.0,Core-Mark International,6.0,Texas,US,76262,MGMT,medium,medium,Wholesale
4,3757934264,73013724,Sales Manager,Full-time,"Richmond, VA",18.329151,66.426309,Mid-Senior level,2023-11-04 06:40:00,237500.0,J. Galt,3.0,Indiana,US,46268,SALE,high,medium,Financial Services


### 2. Data cleaning

In [7]:
# Collect every row that is an exact repeat of an earlier row
duplicates = df.loc[df.duplicated()]

# Report as plain text: a header line followed by the frame itself
print("Duplicate Rows:", duplicates, sep="\n")


Duplicate Rows:
Empty DataFrame
Columns: [job_id, company_id, title, work_type, location, applies, views, experience_level, listed_time, approx_salary, name, company_size, state, country, zip_code, skill_abr, salary_category, applies_category, industry]
Index: []


In [8]:
# Distinct values of the experience_level column, in order of first appearance
unique_experience_levels = df['experience_level'].unique()

# Header line, then the array of distinct levels
print("Unique Experience Levels:", unique_experience_levels, sep="\n")


Unique Experience Levels:
['Mid-Senior level' 'Entry level' 'Director' 'Associate' 'Internship'
 'Executive']


In [9]:
# Experience levels to one-hot encode. Only the keys are used below; the
# integer values are kept so the dict stays available in its original form,
# but they play no part in the encoding.
experience_mapping = {
    'Entry level': 1,
    'Internship': 2,
    'Associate': 3,
    'Mid-Senior level': 4,
    'Director': 5,
    'Executive': 6
}

# Guard on the source column so re-running this cell is a no-op rather than a
# KeyError: the column is dropped at the end of the first run.
if 'experience_level' in df.columns:
    # One 0/1 indicator column per level, named exp_level_<level> with the
    # level lower-cased and spaces replaced by underscores.
    for level in experience_mapping:
        df[f'exp_level_{level.lower().replace(" ", "_")}'] = (df['experience_level'] == level).astype(int)

    # The indicators now carry the information, so drop the original column.
    df = df.drop(columns='experience_level')




In [10]:
# Confirm the exp_level_* indicator columns were added and experience_level is gone
df.head()

Unnamed: 0,job_id,company_id,title,work_type,location,applies,views,listed_time,approx_salary,name,...,skill_abr,salary_category,applies_category,industry,exp_level_entry_level,exp_level_internship,exp_level_associate,exp_level_mid-senior_level,exp_level_director,exp_level_executive
0,3757935001,73013724,Sales Manager,Full-time,"Coeur d'Alene, ID",18.329151,66.426309,2023-11-04 06:40:00,237500.0,J. Galt,...,SALE,high,medium,Financial Services,0,0,0,1,0,0
1,3757934319,18391435,Controls Engineer,Full-time,"Orlando, FL",1.0,5.0,2023-11-04 06:40:00,135500.0,head-huntress.com,...,ENG,high,low,Staffing and Recruiting,0,0,0,1,0,0
2,3757934318,6577380,CDL Class B Driver,Full-time,"Oakland, CA",15.568389,56.874644,2023-11-04 06:40:00,26.0,Conexwest,...,MGMT,low,medium,Retail Office Equipment,0,0,0,1,0,0
3,3757934267,28352,CDL A Delivery Driver,Full-time,"Swedesboro, NJ",15.568389,56.874644,2023-11-04 06:40:00,95000.0,Core-Mark International,...,MGMT,medium,medium,Wholesale,1,0,0,0,0,0
4,3757934264,73013724,Sales Manager,Full-time,"Richmond, VA",18.329151,66.426309,2023-11-04 06:40:00,237500.0,J. Galt,...,SALE,high,medium,Financial Services,0,0,0,1,0,0


In [11]:
# Count missing entries per column (all zeros means nothing needs imputing)
df.isna().sum()

job_id                        0
company_id                    0
title                         0
work_type                     0
location                      0
applies                       0
views                         0
listed_time                   0
approx_salary                 0
name                          0
company_size                  0
state                         0
country                       0
zip_code                      0
skill_abr                     0
salary_category               0
applies_category              0
industry                      0
exp_level_entry_level         0
exp_level_internship          0
exp_level_associate           0
exp_level_mid-senior_level    0
exp_level_director            0
exp_level_executive           0
dtype: int64

In [12]:
# Distinct values of the work_type column, in order of first appearance.
# Stored under its own name so the experience-level array computed in the
# earlier cell is not overwritten by unrelated values.
unique_work_types = df['work_type'].unique()

print("Unique work type:")
print(unique_work_types)

Unique work type:
['Full-time' 'Part-time' 'Contract' 'Other' 'Temporary' 'Internship']


In [13]:
# Work types to one-hot encode. Only the keys are used below; the integer
# values are kept so the dict stays available in its original form, but they
# play no part in the encoding.
work_type_mapping = {
    'Full-time': 1,
    'Part-time': 2,
    'Contract': 3,
    'Other': 4,
    'Temporary': 5,
    'Internship': 6
}

# Guard on the source column so re-running this cell is a no-op rather than a
# KeyError: the column is dropped at the end of the first run.
if 'work_type' in df.columns:
    # One 0/1 indicator column per work type, named work_type_<type> with the
    # type lower-cased and spaces replaced by underscores (hyphens are kept).
    for work_type in work_type_mapping:
        df[f'work_type_{work_type.lower().replace(" ", "_")}'] = (df['work_type'] == work_type).astype(int)

    # The indicators now carry the information, so drop the original column.
    df = df.drop(columns='work_type')

In [14]:
# Confirm the work_type_* indicator columns were added and work_type is gone
df.head()

Unnamed: 0,job_id,company_id,title,location,applies,views,listed_time,approx_salary,name,company_size,...,exp_level_associate,exp_level_mid-senior_level,exp_level_director,exp_level_executive,work_type_full-time,work_type_part-time,work_type_contract,work_type_other,work_type_temporary,work_type_internship
0,3757935001,73013724,Sales Manager,"Coeur d'Alene, ID",18.329151,66.426309,2023-11-04 06:40:00,237500.0,J. Galt,3.0,...,0,1,0,0,1,0,0,0,0,0
1,3757934319,18391435,Controls Engineer,"Orlando, FL",1.0,5.0,2023-11-04 06:40:00,135500.0,head-huntress.com,4.537178,...,0,1,0,0,1,0,0,0,0,0
2,3757934318,6577380,CDL Class B Driver,"Oakland, CA",15.568389,56.874644,2023-11-04 06:40:00,26.0,Conexwest,2.0,...,0,1,0,0,1,0,0,0,0,0
3,3757934267,28352,CDL A Delivery Driver,"Swedesboro, NJ",15.568389,56.874644,2023-11-04 06:40:00,95000.0,Core-Mark International,6.0,...,0,0,0,0,1,0,0,0,0,0
4,3757934264,73013724,Sales Manager,"Richmond, VA",18.329151,66.426309,2023-11-04 06:40:00,237500.0,J. Galt,3.0,...,0,1,0,0,1,0,0,0,0,0


In [15]:
# Re-count missing entries per column now that the indicator columns exist
df.isna().sum()

job_id                        0
company_id                    0
title                         0
location                      0
applies                       0
views                         0
listed_time                   0
approx_salary                 0
name                          0
company_size                  0
state                         0
country                       0
zip_code                      0
skill_abr                     0
salary_category               0
applies_category              0
industry                      0
exp_level_entry_level         0
exp_level_internship          0
exp_level_associate           0
exp_level_mid-senior_level    0
exp_level_director            0
exp_level_executive           0
work_type_full-time           0
work_type_part-time           0
work_type_contract            0
work_type_other               0
work_type_temporary           0
work_type_internship          0
dtype: int64

In [17]:
# Shorten the indicator column names by dropping the exp_level_/work_type_
# prefixes. 'Internship' exists in both groups, so the experience-level
# indicator becomes 'intern' while the work-type indicator keeps 'internship'.
df.rename(
    mapper={
        # experience-level indicators
        'exp_level_entry_level': 'entry_level',
        'exp_level_internship': 'intern',
        'exp_level_associate': 'associate',
        'exp_level_mid-senior_level': 'mid-senior_level',
        'exp_level_director': 'director',
        'exp_level_executive': 'executive',
        # work-type indicators
        'work_type_full-time': 'full-time',
        'work_type_part-time': 'part-time',
        'work_type_contract': 'contract',
        'work_type_other': 'other',
        'work_type_temporary': 'temporary',
        'work_type_internship': 'internship',
    },
    axis='columns',
    inplace=True,
)



In [19]:
# Summary statistics for the numeric columns, including the renamed 0/1 indicators
df.describe()

Unnamed: 0,job_id,company_id,applies,views,approx_salary,company_size,entry_level,intern,associate,mid-senior_level,director,executive,full-time,part-time,contract,other,temporary,internship
count,6996.0,6996.0,6996.0,6996.0,6996.0,6996.0,6996.0,6996.0,6996.0,6996.0,6996.0,6996.0,6996.0,6996.0,6996.0,6996.0,6996.0,6996.0
mean,3726347000.0,8397026.0,16.985214,52.995159,72121.002824,4.606793,0.282161,0.008576,0.147084,0.49271,0.056604,0.012864,0.837621,0.037021,0.111778,0.003573,0.006575,0.003431
std,51614810.0,19667760.0,26.962253,87.240857,70126.492393,1.891885,0.450084,0.092217,0.354215,0.499983,0.2311,0.112698,0.368824,0.188827,0.315116,0.059676,0.080826,0.058474
min,2148435000.0,1016.0,1.0,1.0,10.25,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3697376000.0,17937.0,4.0,7.0,40.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,3749346000.0,220084.0,13.0,25.015066,70817.5,5.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,3757445000.0,3711482.0,18.329151,65.0,116750.0,6.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,3757935000.0,100746400.0,850.0,1599.0,650000.0,7.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
# Shape after encoding: each of the two dropped categorical columns was
# replaced by six indicator columns
df.shape

(6996, 29)

In [49]:
# Persist the encoded frame to CSV, leaving the row index out of the file
df.to_csv(path_or_buf='job_posts_more_numeric.csv', index=False)