In [None]:
# Mount Google Drive at /gdrive so the dataset CSVs and the exported pickles
# under /gdrive/MyDrive are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.multiclass import OneVsRestClassifier

In [None]:
# Upgrade scikit-learn in the Colab runtime.
# NOTE(review): the version is unpinned (the recorded run resolved to 1.0) and
# this cell sits after the sklearn import cell; if sklearn was already imported
# the runtime must be restarted for the upgrade to take effect. Consider moving
# this to the top of the notebook and pinning the version.
!pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (23.1 MB)
[K     |████████████████████████████████| 23.1 MB 58.5 MB/s 
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.2.0-py3-none-any.whl (12 kB)
Installing collected packages: threadpoolctl, scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-1.0 threadpoolctl-2.2.0


In [None]:
# Load the merged sector dataset from Drive.
# NOTE(review): the following cell rebinds `df` to ini_dataset.csv, so on a
# top-to-bottom run this frame is discarded — confirm which file is intended.
df = pd.read_csv('/gdrive/MyDrive/Dataset_Jobdesk/merged_dataset_sectors.csv')

In [None]:
# Load ini_dataset.csv from Drive into `df`; this replaces whatever `df` held
# before, and is the frame every later cell works on in a top-to-bottom run.
df = pd.read_csv('/gdrive/MyDrive/Dataset_Jobdesk/ini_dataset.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,texts,sectors
0,Absatzplaner Produktemengenplaner Absatzplaner...,['Marketing']
1,Absenzenmanager Absenzenmanager Absenzenmanage...,['Personalmanagement']
2,Account Manager Kundenbetreuer Account Manager...,['Verkauf/Kundenberatung']
3,Accounting-Project-Manager Projektablaufoptimi...,['Beratung/Recht']
4,After Sales Manager Reklamationsmanager After ...,['Verkauf/Kundenberatung']


In [None]:
# Inspect one raw `sectors` value; the recorded output shows it is the string
# representation of a list rather than an actual list.
df['sectors'].iloc[0]

"['Marketing']"

In [None]:
import ast

In [None]:
# Check that ast.literal_eval parses that string into a real Python list.
ast.literal_eval(df['sectors'].iloc[0])

['Marketing']

In [None]:
# Parse the stringified label lists (e.g. "['Marketing']") into Python lists.
# Values that are no longer strings are passed through untouched, so re-running
# this cell on the already-converted column does not raise.
df['sectors'] = df['sectors'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

In [None]:
# Binarizer used to encode each row's list of sector labels as an indicator
# vector, and later to map predictions back to sector names.
multilabel = MultiLabelBinarizer()

In [None]:
# Fit the binarizer on the parsed sector lists and build the label matrix `y`
# (one row per sample, one column per sector class).
y = multilabel.fit_transform(df['sectors'])

In [None]:
# Display the encoded label matrix.
y

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# List the sector names learned by the binarizer; column i of `y` corresponds
# to classes_[i].
multilabel.classes_

array(['Administration/Sekretariat/Verwaltung', 'Banken/Versicherungen',
       'Beratung/Recht', 'Chemie/Pharma', 'Diverse',
       'Einkauf/Logistik/Produktion',
       'Finanz- und Rechnungswesen/Controlling',
       'Gastronomie/Hotellerie/Tourismus',
       'Geschäftsführung / Unternehmensleitung',
       'Gesundheitswesen/Medizin', 'Industrie/Ingenieurwesen/Technik',
       'Informatik/Telekommunikation', 'Kaufmännisch', 'Marketing',
       'Non-Profit/Soziales/Bildungswesen', 'Others',
       'Personalmanagement', 'Verkauf/Kundenberatung'], dtype=object)

In [None]:
# Earlier configuration kept for reference (capped vocabulary, 1-3 word n-grams):
# tfidf = TfidfVectorizer(analyzer='word', max_features=10000, ngram_range=(1,3))
# Current configuration: word-level TF-IDF with the vectorizer's defaults.
tfidf = TfidfVectorizer(analyzer='word')
# Fit the vocabulary on the `texts` column and build the sparse feature matrix X.
X = tfidf.fit_transform(df['texts'])

In [None]:
# Sanity check: X and y must have the same number of rows.
X.shape, y.shape

((1412, 1538), (1412, 18))

In [None]:
# Hold out 10% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)

In [None]:
# Rebind the training set to the FULL dataset (every row of X / y).
# NOTE(review): X_test / y_test from the split are rows of X, so any score
# computed on X_test after this cell is measured on data the model was trained
# on and will be optimistic. Keep the split for evaluation and refit on
# everything only for the final exported model.
X_train = X
y_train = y

In [None]:
# Base estimators to compare; each one is wrapped in OneVsRestClassifier later.
# SGDClassifier and LinearSVC are seeded (same seed as the train/test split)
# so that the reported scores are reproducible across runs.
sgd = SGDClassifier(random_state=42)
lr = LogisticRegression(solver='lbfgs')
svc = LinearSVC(random_state=42)
nb = MultinomialNB()

In [None]:
def j_score(y_true, y_pred, empty_score=1.0):
    """Mean per-sample Jaccard similarity between two indicator matrices, in percent.

    For every row the score is |true AND pred| / |true OR pred|; the row scores
    are then averaged and multiplied by 100.

    Parameters
    ----------
    y_true, y_pred : 2-D array-like of 0/1 values with identical shape
        (rows are samples, columns are labels).
    empty_score : float, default 1.0
        Score assigned to a row where neither y_true nor y_pred has any label
        set. The ratio is 0/0 there; without this guard the row would become
        NaN and turn the whole mean into NaN.

    Returns
    -------
    float
        Mean Jaccard score scaled to the 0-100 range.
    """
    intersection = np.minimum(y_true, y_pred).sum(axis=1)
    union = np.maximum(y_true, y_pred).sum(axis=1)
    # Pre-fill with the fallback and divide only where the union is non-empty.
    jaccard = np.full(union.shape, empty_score, dtype=float)
    np.divide(intersection, union, out=jaccard, where=union != 0)
    return jaccard.mean() * 100


def print_score(y_pred, clf, y_true=None):
    """Print the estimator's class name and its Jaccard score.

    Parameters
    ----------
    y_pred : indicator matrix of predicted labels.
    clf : estimator whose class name is printed as the report header.
    y_true : indicator matrix of reference labels, optional.
        When omitted, the notebook-level `y_test` is used, which keeps existing
        two-argument calls working while allowing an explicit reference set.
    """
    if y_true is None:
        # Fall back to the held-out labels defined by the train/test split cell.
        y_true = y_test
    print("Clf: ", clf.__class__.__name__)
    print('Jacard score: {}'.format(j_score(y_true, y_pred)))
    print('----')

In [None]:
# Benchmark every base estimator: wrap it one-vs-rest (one binary model per
# sector), fit on the training data and report the Jaccard score on X_test.
# After the loop `clf` / `y_pred` refer to the last estimator in the tuple.
for classifier in (sgd, lr, svc, nb):
    clf = OneVsRestClassifier(classifier)
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print_score(y_pred, classifier)

Clf:  SGDClassifier
Jacard score: 56.33802816901409
----
Clf:  LogisticRegression
Jacard score: 28.87323943661972
----
Clf:  LinearSVC
Jacard score: 50.352112676056336
----
Clf:  MultinomialNB
Jacard score: 33.80281690140845
----


In [None]:
# Refit only the SGD-based one-vs-rest model on the current X_train / y_train.
# This leaves `clf` bound to the model that the later prediction and export
# cells use.
classifier = sgd
clf = OneVsRestClassifier(classifier)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print_score(y_pred, classifier)

Clf:  SGDClassifier
Jacard score: 82.74647887323944
----


In [None]:
# Sample input #1 for a manual prediction (a one-element list of raw text).
# Each of the sample cells rebinds `x`; only the last one executed is used.
x = [' Business Analyst/Requirement Engineer, Data Scientist']

In [None]:
# Sample input #2: a comma-separated list of German job titles. Rebinds `x`.
x = ["Einkaufsassistent, Sachbearbeiter, Sachbearbeiter Administration, kaufm. Sachbearbeiter, kaufm. Angestellte/r, Verkauf Innendienst, kaufm. Allrounder"]

In [None]:
# Sample input #3: a short English job title. Rebinds `x`.
x = ["software developer"]

In [None]:
# Vectorize the sample with the already-fitted TF-IDF vocabulary (transform
# only, no refit) so the columns line up with the training matrix.
xt = tfidf.transform(x)

In [None]:
# Show the sparse row produced for the sample.
# NOTE(review): the recorded repr reports 37096 columns while the recorded
# X.shape output reports 1538 features, so the saved outputs appear to come
# from different runs — re-run top to bottom before trusting them.
xt

<1x37096 sparse matrix of type '<class 'numpy.float64'>'
	with 29 stored elements in Compressed Sparse Row format>

In [None]:
# Predict the sector indicator vector for the sample with the most recently
# fitted one-vs-rest model bound to `clf`.
clf.predict(xt)

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [None]:
# Map the predicted indicator vector back to sector names via the binarizer.
multilabel.inverse_transform(clf.predict(xt))

[('Administration/Sekretariat/Verwaltung',)]

In [None]:
# pickle is used by the export cell to persist the fitted objects; print the
# pickle format version of this runtime for reference.
import pickle
print(pickle.format_version)

4.0


In [None]:
# Persist the three objects needed for inference, in this order: the fitted
# one-vs-rest model, the TF-IDF vectorizer that turns text into features, and
# the binarizer that maps predictions back to sector names.
for path, artifact in (
    ('/gdrive/MyDrive/Dataset_Jobdesk/sector_model.pkl', clf),
    ('/gdrive/MyDrive/Dataset_Jobdesk/sector_tfidf.pkl', tfidf),
    ('/gdrive/MyDrive/Dataset_Jobdesk/sector_multilabel.pkl', multilabel),
):
    with open(path, 'wb') as f:
        pickle.dump(artifact, f)