# Clustering job descriptions and training a cluster classifier

In [1]:
import numpy as np
# Seed NumPy's global RNG so a fresh top-to-bottom run is repeatable.
# scikit-learn components created without an explicit random_state fall back
# to this global generator, so results still depend on cell execution order.
np.random.seed(42)

# Database
from JobsDb import JobsDb

# Verification
from sklearn.model_selection import train_test_split

# Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Dimension Reduction
from sklearn.decomposition import TruncatedSVD

# Clustering
from sklearn.cluster import KMeans

# Classifier
from sklearn.ensemble import RandomForestClassifier

# Pipeline
from sklearn.pipeline import Pipeline

In [2]:
# Load the 'jobs' table from the project database into a DataFrame.
db = JobsDb()
try:
    jobs_raw = db.load_table_as_df('jobs')
finally:
    # Always close the database handle, even if the load raises.
    db.close()

# Keep only the rows from this position onward.
# NOTE(review): the reason for skipping the first 9680 rows is not recorded
# in this notebook -- confirm and document it.
FIRST_ROW = 9680
df = jobs_raw.iloc[FIRST_ROW:]

print(df.shape)
df.head()

(9485, 4)


Unnamed: 0,id,title,url,description
9680,9681,Railcar Verifier/Transload Team Member/Data Entry,https://www.careerjet.com/jobad/us5194732b36a6...,\nCompany Overview Come join a Winning Team! ...
9681,9682,Data Entry Clerk,https://www.careerjet.com/jobad/us83f88fb60b47...,"\n prepare, compile and sort documents for dat..."
9682,9683,Data Scientist,https://www.careerjet.com/jobad/us466d6146a815...,\n \n Data Scientist is responsible for co...
9683,9684,Provider Data Specialist,https://www.careerjet.com/jobad/uscb5cda0893f6...,\n \n Title: Provider Data Specialist Loc...
9684,9685,Security Data Architect,https://www.careerjet.com/jobad/us00dc3c284dbd...,"\nOur Mission At Dobbs Defense, we deliver mi..."


In [3]:
# Work only with the free-text 'description' column of the job postings.
descriptions = df['description']

In [4]:
# Hold out 20% of the descriptions as a test set.
# random_state is pinned (same value as the notebook's NumPy seed) so that the
# partition does not depend on how many random draws earlier cells consumed,
# and re-running this cell on its own yields the same train/test split.
train, test = train_test_split(descriptions, test_size=0.2, random_state=42)

In [7]:
# Text preprocessing: TF-IDF vectorization followed by truncated SVD.
# Both steps are created with their library default settings.
preprocessing_pipe = Pipeline([
    ('vectorization', TfidfVectorizer()),
    ('dimensionReduction', TruncatedSVD()),
])

In [8]:
# Unsupervised pipeline: run the shared preprocessing steps, then cluster the
# resulting vectors with KMeans (library default settings).
clustering_pipe = Pipeline([
    ('preprocessing', preprocessing_pipe),
    ('clustering', KMeans()),
])

In [9]:
# Fit the clustering pipeline on the training descriptions and keep the cluster
# label assigned to each of them.
# NOTE(review): presumably these labels serve as the training target for the
# classifier pipeline -- the fit call is not visible here, confirm.
target = clustering_pipe.fit_predict(train)

In [10]:
# Supervised pipeline: the same preprocessing steps followed by a random forest.
# The final step must be an estimator *instance*; passing the bare
# RandomForestClassifier class makes the pipeline fail when it is fitted.
# NOTE(review): `preprocessing_pipe` is the same object used by
# `clustering_pipe`, so fitting this pipeline refits those shared steps --
# confirm that is intended.
classification_pipe = Pipeline(
    steps = [
        ('preprocessing', preprocessing_pipe),
        ('classification', RandomForestClassifier())
    ]
)