In [3]:
##Libraries
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
import os

In [4]:
##Load the file
path = '/Users/leonidas/Documents/GitHub/Deep-Learning-Project/Results/'
filename = '25_cleaned_job_descriptions.csv'
data = pd.read_csv(os.path.join(path, filename), header = 0, names = ['Query', 'Description'])

In [15]:
##In logistic regression models, encoding all of the independent variables as dummy variables allows easy interpretation and calculation of the odds ratios, and increases the stability and significance of the coefficients.
data2 = pd.get_dummies(data, columns =['Description'])

In [16]:
##Split the data into training and test sets
X = data2.iloc[:,1:]
y = data2.iloc[:,0]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [21]:
##Check out training data is sufficient
X_train.shape

(7500, 9367)

In [17]:
##Fit logistic regression to the training set
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [18]:
##Predicting the test set results and creating confusion matrix
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

[[  3   0   0   0  91   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   2   0   0   0   0]
 [  2   2   0   0  83   0   0   0   2   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0]
 [  0   0   0   0 113   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   1   0   0   0   0   0]
 [  0   0   1   0 102   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0]
 [  0   0   0   0 115   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   3   0   0   0]
 [  1   0   0   0  82  20   0   0   0   0   0   0   0   0   0   0   0   1
    0   0   0   0   0   0   0]
 [  0   0   0   1  81   0   1   0   0   0   0   0   1   0   0   0   0   0
    0   0   0   0   1   0   0]
 [  0   0   0   0  93   0   0   5   0   0   0   0   1   0   0   0   0   0
    0   0   0   0   0   0   0]
 [  0   0   0   0  93   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0]
 [  0   0   0   0  86   0   0   0   0   1   0   0   0  

In [19]:
##Accuracy
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(classifier.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.12


In [20]:
##Compute precision, recall, F-measure and support
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

                               precision    recall  f1-score   support

      Artificial Intelligence       0.38      0.03      0.06        96
            Big Data Engineer       0.67      0.02      0.04        89
             Business Analyst       0.00      0.00      0.00       114
Business Intelligence Analyst       0.00      0.00      0.00       103
              Cloud Architect       0.05      0.97      0.10       118
     Cloud Services Developer       1.00      0.19      0.32       104
                 Data Analyst       0.33      0.01      0.02        85
               Data Architect       1.00      0.05      0.10        99
                Data Engineer       0.00      0.00      0.00        93
         Data Quality Manager       1.00      0.01      0.02        87
               Data Scientist       0.67      0.02      0.04       107
    Data Visualization Expert       0.00      0.00      0.00        94
             Data Warehousing       0.75      0.06      0.11       103
   Da

  'precision', 'predicted', average, warn_for)
