# Ciphix Machine Learning Case
Door: Sander Boers


Dit is een opdracht dat onderdeel is van de sollicatie procedure voor Ciphix. Het doel is om uit groot tekstbestand machine-learning technieken toe te passen, om 'topics' te bepalen typerend voor een gesprek.

## Load dataset
There is an option to shrink the dataset to have shorter runtime. Add the following code to end of `pd.read_csv`:
1. `.head(n=10000)` for the first n rows, or
2. `.sample()` if you want a random sample.

Also there is the option to only fit to the customer data (`ONLY_CUSTOMERS = True`) and not the replies of particular companies. It checks if a datarow has an `@123456` (or any other numeric combination) inside, this implies a response to a customer and often does not give additional information.

In [6]:
import os
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Dataset will be downloaded if it is not already there
if os.path.exists('data.csv'):
    data = pd.read_csv('data.csv', names=['text']).head(n=10000)
else:
    data = pd.read_csv('https://ciphix.io/ai/data.csv', names=['text'])


ONLY_CUSTOMERS = True
if ONLY_CUSTOMERS:
    filter = data['text'].str.contains(r'@([0-9]+)')
    data = data[~filter]

## Preprocessing data
This is an important step in Topic modeling. It consists out of multiple processes, the first is removing information we do not need. This includes:
1. URLs, 
2. mentions (words starting with '@'), 
3. the customer service employee signature (e.g. ^JK),
4. any symbols, emojis or non-western charachters

Next, all words are put into lowercase and will be tokenized, meaning that all sentences will be split into words.

Then, there are three remaining steps:
We look for bigrams and trigrams, these are words that frequently occur together and will be appended to one word (e.g. 'customer', 'service' will become 'customer_service').
Also stopwords will be removed, stopwords do not contribute much to the meaning of a sentence. Stopwords are for example: 'only', 'would', 'some', 'everyone'.

And finally words will be lemmetizized. This step will convert words to its base form, so for example: 'dogs' becomes 'dog' and 'tried' becomes 'try'.


In [7]:
import re
import spacy
import gensim

nlp = spacy.load("en_core_web_sm", disable = ["parser", "ner"])
allowed_postags = ["NOUN", "ADJ", "VERB", "ADV"]

def preprocess(doc):
  # Regex to remove URLs
  doc = re.sub(r'\bhttps?://\S+\b', '', doc) 

  # Regex to remove mentions, e.g. @UberSupport
  # doc = re.sub(r'@\w+\b', '', doc) 

  # Regex to remove signatures starting with ('^', '-' or '*')
  doc = re.sub(r'\B[-^*&]\s*\w+', '', doc) 
  
  # Regex to remove any symbols, emojis or non-western charachters
  doc = re.sub(r'[^a-zA-Z0-9\s,.?!;:()]+', '', doc) 
  
  # Function to convert document into lowercase, de-accents and tokenize
  doc = gensim.utils.simple_preprocess(doc, deacc=True, min_len=2)

  return doc
  

data['preprocessed_text'] = data['text'].apply(preprocess)

In [8]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data['preprocessed_text'], min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data['preprocessed_text']], threshold=100)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)


stopwords = nlp.Defaults.stop_words
stopwords.update(["hi", "hello", "hey", "et", "ill"])

def apply_trigrams(doc):
  doc = [token for token in doc if token not in stopwords]
  doc = trigram_mod[bigram_mod[doc]]
  return doc

data['trigram_text'] = data['preprocessed_text'].apply(apply_trigrams)
data = data[data['trigram_text'].apply(len) > 2]


{'together', 'it', 'seeming', 'n’t', 'made', 'really', "'re", '’d', 'except', 'out', 'mostly', '’re', 'would', 'wherever', 'already', 'though', 'how', 'those', 'own', 'be', 'onto', 'she', 're', 'say', 'something', 'empty', "'m", 'ourselves', 'never', 'hereafter', 'at', 'beforehand', '‘s', '’m', 'are', 'most', 'rather', 'amongst', 'they', 'hundred', 'unless', 'until', 'its', 'make', 'third', 'perhaps', 'more', 'else', 'hence', 'across', 'their', 'wherein', 'very', 'sixty', 'therein', 'mine', '‘m', 'his', 'so', 'my', 'back', 'as', 'although', 'might', 'hi', 'hello', 'myself', 'sometimes', 'fifty', 'forty', 'may', 'yours', 'on', 'eleven', 'next', 'full', 'off', 'when', 'three', 'than', 'bottom', 'quite', 'whereafter', 'thus', 'further', 'nothing', 'almost', 'through', 'who', 'whole', 'if', 'is', 'take', 'whose', 'throughout', 'beyond', 'either', 'call', 'anywhere', 'few', 'others', 'everyone', 'over', 'toward', 'below', 'because', 'we', 'eight', 'yourself', 'from', 'often', "'d", 'ill', '

In [9]:
def apply_lemmatization(doc):
  doc = nlp(" ".join(doc))
  doc = [token.lemma_ for token in doc if token.pos_ in allowed_postags and len(token) > 2]
  return doc

data['lemmatization_text'] = data['trigram_text'].apply(apply_lemmatization)

In [47]:
from gensim.corpora import Dictionary

dictionary = Dictionary(data['lemmatization_text'])
corpus = [dictionary.doc2bow(doc) for doc in data['lemmatization_text']]

## Fit the dataset to LDA model

In [48]:
from gensim.models import LdaModel

lda_model = LdaModel(corpus = corpus, 
                     id2word = dictionary, 
                     num_topics = 10, 
                     passes = 10)

In [49]:
for topic_id in range(10):
    print(f'Topic {topic_id}: {lda_model.print_topic(topic_id)}')

Topic 0: 0.027*"month" + 0.026*"issue" + 0.025*"internet" + 0.022*"comcastcare" + 0.021*"work" + 0.016*"pay" + 0.014*"time" + 0.013*"idea" + 0.013*"ideacare" + 0.013*"talk"
Topic 1: 0.043*"southwestair" + 0.038*"good" + 0.032*"thank" + 0.032*"chipotletweet" + 0.022*"year" + 0.016*"get" + 0.016*"great" + 0.015*"love" + 0.010*"food" + 0.009*"delta"
Topic 2: 0.096*"service" + 0.071*"customer" + 0.030*"upshelp" + 0.022*"call" + 0.020*"bad" + 0.016*"phone" + 0.015*"help" + 0.015*"answer" + 0.014*"day" + 0.014*"care"
Topic 3: 0.053*"help" + 0.037*"email" + 0.036*"need" + 0.035*"account" + 0.034*"number" + 0.025*"check" + 0.025*"book" + 0.023*"charge" + 0.023*"driver" + 0.021*"contact"
Topic 4: 0.045*"day" + 0.031*"train" + 0.027*"week" + 0.026*"time" + 0.023*"virgintrain" + 0.019*"today" + 0.018*"gwrhelp" + 0.018*"swhelp" + 0.017*"late" + 0.016*"go"
Topic 5: 0.083*"flight" + 0.051*"americanair" + 0.033*"britishairway" + 0.026*"ticket" + 0.021*"seat" + 0.020*"hour" + 0.018*"delay" + 0.016*"fl

In [10]:
import pyLDAvis
import pyLDAvis.gensim_models
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Visualize with pyLDAvis: See [2] for more details
pyLDAvis.enable_notebook()
visualization = pyLDAvis.gensim_models.prepare(
    lda_model, 
    corpus,
    dictionary, 
    mds = "mmds", 
    R = 30)

visualization

ModuleNotFoundError: No module named 'pyLDAvis'

In [12]:
# 1. Wordcloud of Top N words in each topic
from matplotlib import pyplot as plt
from wordcloud import WordCloud

for t in range(lda_model.num_topics):
    plt.figure()
    plt.gca().set_title('Topic ' + str(t))
    plt.imshow(WordCloud(background_color="rgba(255, 255, 255, 0)", mode="RGBA", width=1600, height=800).fit_words(dict(lda_model.show_topic(t, 50))))
    plt.axis("off")
    plt.gcf().savefig(f"./static/wordclouds/topic{t}.png", transparent=True, dpi=300)
    plt.show()


ModuleNotFoundError: No module named 'wordcloud'

In [1]:
# Save models

lda_model.save('./saved_models/LDAmodel')
bigram_mod.save("./saved_models/bigram_mod.pkl")
trigram_mod.save("./saved_models/trigram_mod.pkl")


NameError: name 'lda_model' is not defined