# Data Science Challenge

## Preparing

First run:

    pipenv install
    pipenv shell
    python -m ipykernel install --user --name=<env-name>
    jupyter notebook

Once the kernel is installed:

    pipenv run jupyter notebook

In [1]:
import pickle
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict, GridSearchCV
from sklearn.feature_selection import RFE, RFECV
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Render floats with a fixed 5 decimals so scores are easy to compare between cells
pd.set_option('display.float_format', '{:.5f}'.format)

from IPython.display import display

from langdetect import detect as langdetect
import nltk
# Fetch the NLTK stop-word corpus (reported as already up to date when present)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ronie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load data and check the format

In [3]:
# Load the labelled reviews and preview the first rows (`text` and `labelmax` columns).
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
raw = pd.read_pickle('data/labelled_dataset.pickle')
raw.head()

Unnamed: 0,text,labelmax
0,Pros - The people who work here are brilliant ...,customer
1,Pros Start-up vibes Fast growing company Tech-...,customer
2,"Pros The team is great, I love the ambition of...",collaboration
3,"Pros The company is constantly growing, and at...",adaptability
4,Pros Cool office. Friendly people. Good atmosp...,collaboration


In [4]:
# Show the first review in full (the `head()` preview truncates the text)
raw.text[0]

"Pros - The people who work here are brilliant (intelligent, hard-working etc.) - Exciting career opportunities, plenty of room to grow! - Great company culture, social events etc. - Management really value everyone's opinion and are open to ideas - Ambitious company, always working to grow and improve - I feel like my work is really valued, and outstanding performance is always recognised by management Cons - Salary isn't great compared to other grad jobs, hopefully this will improve as the company grows - Occasionally have to work weekends, would much rather there was a separate weekend team - Communication could be better about changes in the company, future plans etc. Advice to Management - Keep listening to your employees - Don't become cold and corporate. Airsorted needs to keep the company spirit, even when it is a big global company."

The classes are imbalanced: `detail`, `integrity` and `null` have far fewer samples than the other labels.

In [6]:
# Number of samples per label
raw.labelmax.value_counts()

customer         26981
collaboration    21067
result           18948
adaptability     17204
detail            4030
integrity         2815
null               535
Name: labelmax, dtype: int64

## Cleaning data

In [7]:
from langdetect import DetectorFactory

# langdetect is non-deterministic by default: the same text can get a different
# language on another run. Fixing the seed makes the `lang` column reproducible.
DetectorFactory.seed = 0

# Language detection: tag every review with its detected language code
raw['lang'] = raw.text.apply(langdetect)
raw.head()

Unnamed: 0,text,labelmax,lang
0,Pros - The people who work here are brilliant ...,customer,en
1,Pros Start-up vibes Fast growing company Tech-...,customer,en
2,"Pros The team is great, I love the ambition of...",collaboration,en
3,"Pros The company is constantly growing, and at...",adaptability,en
4,Pros Cool office. Friendly people. Good atmosp...,collaboration,en


In [9]:
# Cached copy of the language-annotated frame, so later cells start from a stable file.
LANG_CACHE = 'data/labelled_dataset_lang.pickle'
try:
    raw = pd.read_pickle(LANG_CACHE)
except FileNotFoundError:
    # First run: no cache yet, so persist the frame annotated by the previous cell
    # instead of failing (this replaces the manual "uncomment to_pickle" step).
    raw.to_pickle(LANG_CACHE)

In [15]:
# Distribution of the detected languages
display(raw.lang.value_counts())

# Keep only english comments with classification
has_label = raw.labelmax != 'null'
is_english = raw.lang == 'en'
clean = raw[has_label & is_english]

en    88711
de     2090
fr      502
nl      251
es       17
pt        4
it        2
af        1
no        1
cy        1
Name: lang, dtype: int64

In [16]:
from nltk.stem.snowball import EnglishStemmer

# English Snowball stemmer, applied to every token by `process_words` below
stemmer = EnglishStemmer()
# Analyzer callable of a default-configured CountVectorizer, used by `process_words` to split a document into tokens
analyzer = CountVectorizer().build_analyzer()

def process_words(doc):
    """Turn one document into a list of stemmed tokens.

    The document is split with the module-level `analyzer`, each token is stemmed
    with the module-level `stemmer`, and every stem made only of digits is
    replaced by the placeholder 'NUMBER'.
    """
    tokens = []
    for raw_token in analyzer(doc):
        stem = stemmer.stem(raw_token)
        if stem.isdigit():
            tokens.append('NUMBER')
        else:
            tokens.append(stem)
    return tokens

In [17]:
# Stratified 90/10 split. The fixed seed keeps the split, and therefore every score
# computed from it, reproducible across kernel restarts.
features_train, features_test, target_train, target_test = train_test_split(
    clean.text,
    clean.labelmax,
    test_size=0.1,
    stratify=clean.labelmax,
    random_state=42,
)

# `process_words` is a callable analyzer, so it performs the whole text analysis.
# scikit-learn ignores `stop_words` and `ngram_range` in that case, so they are not
# passed: the vocabulary consists of the stemmed single tokens from `process_words`.
word_counter = TfidfVectorizer(analyzer=process_words)
features = word_counter.fit_transform(features_train)
feature_names = word_counter.get_feature_names()
display(len(feature_names))

29654

In [19]:
# L1-penalised linear SVM; the under-represented `detail` and `integrity` classes get 3x weight
model = LinearSVC(class_weight={'detail': 3, 'integrity': 3}, penalty='l1', dual=False, C=0.25)
# Recursive feature elimination down to 5000 features (fractional `step` of 0.25 per iteration).
# `n_features_to_select` is passed by keyword: recent scikit-learn releases no longer
# accept it positionally, while the keyword form works on old and new versions alike.
rfe = RFE(model, n_features_to_select=5000, step=0.25)
rfe.fit(features, target_train)

RFE(estimator=LinearSVC(C=0.25, class_weight={'detail': 3, 'integrity': 3}, dual=False,
     fit_intercept=True, intercept_scaling=1, loss='squared_hinge',
     max_iter=1000, multi_class='ovr', penalty='l1', random_state=None,
     tol=0.0001, verbose=0),
  n_features_to_select=5000, step=0.25, verbose=0)

In [21]:
# Sanity check: number of feature names kept by the RFE support mask
print(len(np.array(feature_names)[rfe.support_]))

5000


In [28]:
# Retrain the classifier on the RFE-selected features only
model = LinearSVC(class_weight={'detail': 3, 'integrity': 3}, penalty='l1', dual=False, C=0.25)

# Reduce both splits once and reuse the result for fitting and predicting
selected_train = rfe.transform(features)
selected_test = rfe.transform(word_counter.transform(features_test))
model.fit(selected_train, target_train)

predicted_train = model.predict(selected_train)
predicted_test = model.predict(selected_test)

# Fixed label order so both confusion matrices share the same row/column layout
labels = sorted(set(target_train))
# `labels` is passed by keyword: recent scikit-learn releases reject it positionally
cm_train = confusion_matrix(target_train, predicted_train, labels=labels)
cm_test = confusion_matrix(target_test, predicted_test, labels=labels)

display('Train Accuracy: {}'.format(accuracy_score(target_train, predicted_train)))
display('Test Accuracy: {}'.format(accuracy_score(target_test, predicted_test)))

'Train Accuracy: 0.909261765148098'

'Test Accuracy: 0.8800315706393055'

In [None]:
def normalise_confusion(cm, axis):
    """Scale a confusion matrix so that it sums to 1 along `axis`.

    `cm` has the true classes on its rows and the predicted classes on its columns,
    as returned by `confusion_matrix`.

    axis=0 -> every column sums to 1 (precision on the diagonal)
    axis=1 -> every row sums to 1 (recall on the diagonal)
    """
    # keepdims=True keeps the totals 2-D, so each cell is divided by the total of its
    # own column/row. Dividing by the 1-D row totals would broadcast them across the
    # columns instead and produce wrong off-diagonal values.
    return cm / cm.sum(axis=axis, keepdims=True)


def plot_confusion(matrix, title, ax, tick_labels):
    """Draw one normalised confusion matrix as an annotated heatmap on `ax`."""
    sns.heatmap(matrix,
                ax=ax,
                annot=True,
                fmt='.2f',
                xticklabels=tick_labels,
                yticklabels=tick_labels,
                vmin=0, vmax=1)
    ax.set_title(title)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')


fig, charts = plt.subplots(2, 2, figsize=(15, 6), sharex=True, sharey=True)

# (axes, title, matrix) for each panel: train on the first row, test on the second
panels = [
    (charts[0, 0], 'Precision Train', normalise_confusion(cm_train, axis=0)),
    (charts[0, 1], 'Recall Train', normalise_confusion(cm_train, axis=1)),
    (charts[1, 0], 'Precision Test', normalise_confusion(cm_test, axis=0)),
    (charts[1, 1], 'Recall Test', normalise_confusion(cm_test, axis=1)),
]
for ax, title, matrix in panels:
    plot_confusion(matrix, title, ax, labels)

fig.tight_layout()


# Run with unlabelled data

In [None]:
import json
from pathlib import Path

def read_json(path):
    """Load and return the JSON document stored in the file at `path`.

    `path` is a `pathlib.Path` (anything exposing a compatible `.open()` works).
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with path.open(encoding='utf-8') as handle:
        return json.load(handle)
    
def save_json(path, data):
    """Serialise `data` as JSON into the file at `path`, replacing any previous content."""
    with path.open(mode='w') as out:
        json.dump(data, fp=out)

# Train the final model on the labelled dataset.
# NOTE(review): this uses the raw labelled data, i.e. it still contains the 'null'
# label and the non-English rows that the `clean` frame filters out — confirm intended.
data = pd.read_pickle('data/labelled_dataset.pickle')

# NOTE(review): `pipeline_with` is not defined in any cell visible in this notebook;
# it must be defined (or imported) before this cell can run on a fresh kernel.
model = pipeline_with(LinearSVC())
model.fit(data.text, data.labelmax)

output_dir = Path('./data/labelled-dataset')
output_dir.mkdir(parents=True, exist_ok=True)

# Predict a label for every document of every JSON file in the unlabelled dataset and
# write the annotated documents under the same file name into `output_dir`.
for input_file in Path('./data/unlabelled-dataset').glob('*.json'):
    docs = read_json(input_file)
    if len(docs) == 0:
        continue

    # Separate names (`docs`, `texts`) keep the notebook-level `data`/`features` intact
    texts = [doc['text'] for doc in docs]
    predicted = model.predict(texts)

    for doc, target in zip(docs, predicted):
        doc['predicted'] = target

    save_json(output_dir / input_file.name, docs)
