# data preparation 

In [30]:
import pandas as pd

# Read the job postings spreadsheet into a DataFrame.
data = pd.read_excel('job_classification_data.xlsx')
# x holds the job titles, y the matching description texts (same row order).
x, y = (data[column].tolist() for column in ('titles', 'description'))

In [31]:
# Display the job titles read from the 'titles' column.
x   

['data analyst junior',
 'data analyst senior ',
 'business analyst',
 'software engineer  junior',
 'software engineer  senior',
 'software manager',
 'software director',
 'project manager',
 'sales manager']

In [32]:
# Display the raw job description texts read from the 'description' column.
y

['"JOB DESCRIPTION\n\t\t\t \xa0Data Analyst, Group Customer Analytics & Decisioning\xa0\xa0About OCBCHaving purpose and making a real impact drives what we do at OCBC, for our customers, colleagues and the community. People count on us to deliver an amazing banking experience, but more importantly one that is uncomplicated and complements their changing lifestyle and business needs. Our employees are the architects that make this all possible; they are high performing individuals and teams, experts in their field and above all believe in our purpose and commitment to customers.Our community of game changers at OCBC builds for the future; think Ambitious goals, act with a strong Belief and execute with Courage and agility to drive impact. These ABCs sit at the centre of our culture of change and innovation.\xa0\xa0What you’ll doWork with specific client groups in the Consumer Bank to understand their business challenges & identify areas where Customer analytics techniques could help the

# tf-idf Vectorizer

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams with English stop words removed.
# Terms found in more than 80% or fewer than 20% of the descriptions are
# dropped; the vocabulary is capped at 200000 features.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    min_df=0.2,
    max_df=0.8,
    max_features=200000,
    use_idf=True,
)

# Learn the vocabulary from the descriptions and vectorize them in one pass.
tfidf_matrix = tfidf_vectorizer.fit_transform(y)


In [34]:
# Feature names (unigrams and bigrams) learned by the fitted vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
# .tolist() keeps `terms` a plain list of str, as it was before.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()
terms

['00',
 '01',
 '01 asia',
 '01 september',
 '01 singapore',
 '018960',
 '018960 job',
 '018989',
 '018989 job',
 '02',
 '02 august',
 '03',
 '04',
 '048616',
 '048616 job',
 '05',
 '06',
 '07',
 '08',
 '08c2893',
 '08c2893 reg',
 '08c2893karthiga',
 '08c2893karthiga anandan',
 '09',
 '09 61',
 '099453',
 '10 01',
 '10 marina',
 '10 years',
 '11',
 '11 01',
 '11 years',
 '11c5781',
 '12',
 '124',
 '124 143',
 '13',
 '13c6678to',
 '13c6678to job',
 '143',
 '143 166',
 '15',
 '15 years',
 '166',
 '166 address',
 '17',
 '18',
 '19',
 '20',
 '20 jalan',
 '2019 job',
 '24',
 '25',
 '31',
 '365',
 '3rd',
 '3rd party',
 '40',
 '513',
 '513 982e',
 '5500',
 '5k',
 '5k aws',
 '60',
 '6000',
 '61',
 '61 124',
 '61 singapore',
 '6416',
 '65',
 '65 6416',
 '6590',
 '97',
 '97 97e',
 '97e',
 '97e 513',
 '982e',
 '982e address',
 'ability build',
 'ability communicate',
 'ability deal',
 'ability deliver',
 'ability explain',
 'ability initiate',
 'ability learn',
 'ability manage',
 'ability priorit

# k-means clustering

In [55]:
#  classification

from sklearn.cluster import KMeans
num_clusters = 4
km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)




Wall time: 614 ms


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

# result interpretation

In [56]:
# Pair every job title with the cluster label k-means assigned to it.
# `clusters` is a plain-Python copy of the label array.
clusters = km.labels_.tolist()
results = pd.DataFrame({'title': x, 'cluster': km.labels_})
results

Unnamed: 0,title,cluster
0,data analyst junior,1
1,data analyst senior,1
2,business analyst,1
3,software engineer junior,0
4,software engineer senior,0
5,software manager,2
6,software director,2
7,project manager,3
8,sales manager,3


In [57]:
# Number of job titles assigned to each cluster label,
# listed with the largest cluster first.

results['cluster'].value_counts() 

1    3
3    2
2    2
0    2
Name: cluster, dtype: int64

In [58]:
# For each cluster label, collect its job titles into one comma-separated
# string (wrapped in a single-element list).

results.groupby('cluster').apply(lambda group: [','.join(group['title'])])

cluster
0    [software engineer  junior,software engineer  ...
1    [data analyst junior,data analyst senior ,busi...
2                 [software manager,software director]
3                      [project manager,sales manager]
dtype: object

In [59]:
# Top keywords per cluster

print("Top keywords per cluster:")
# For every centroid, feature indices ordered from highest to lowest weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# when it exists and fall back to the old method on older releases.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()
# Number of highest-weighted terms printed for each cluster.
top_n = 20
for i in range(num_clusters):
    top_terms = [terms[ind] for ind in order_centroids[i, :top_n]]
    print("Cluster {}: {}".format(i,' '.join(top_terms)))

Top keywords per cluster:
Cluster 0: work location software engineer programming engineer solution engineers code web location address consulting platform real time application development design develop oracle scalable ai architecture cloud improve
Cluster 1: analytics work location data analysis excel personal caed caed com mining view personal data analyst reports business requirements banking privacy policy data analyst location address policy privacy view larger
Cluster 2: architecture leadership sap enterprise dynamics iot cloud solution vendor review business applications marketing architect objectives application development lead cyber strategic drive pay
Cluster 3: risk ll ey area solution clients professional better risk management progress pmo certified consulting digital plan long pre sales budget project manager pmp
