In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
import json
import glob 
import os

import re, nltk, spacy, string

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from pprint import pprint

import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

from plotly.offline import plot
import plotly.graph_objects as go
import plotly.express as px

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

from subprocess import check_output
from wordcloud import WordCloud, STOPWORDS



In [10]:
def clean_text(text):
    '''Normalise a string for text mining.

    The text is lower-cased, then three kinds of span are deleted in order:
    anything enclosed in square brackets, every ASCII punctuation character,
    and any word that contains a digit. Whitespace is left untouched, so
    deleted spans can leave consecutive spaces behind.
    '''
    removal_patterns = (
        r'\[.*?\]',                               # bracketed spans, non-greedy
        r'[%s]' % re.escape(string.punctuation),  # any single punctuation character
        r'\w*\d\w*',                              # words containing at least one digit
    )
    cleaned = text.lower()
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

# Load the spaCy pipeline registered under the name 'en' once at module level;
# lemmatizer() below reuses this shared `nlp` object for every call.
nlp = spacy.load('en') 

def lemmatizer(text):
    '''Return `text` with every token replaced by its lemma.

    The string is run through the module-level `nlp` pipeline and the
    lemma of each resulting token is joined back together with single spaces.
    '''
    return " ".join(token.lemma_ for token in nlp(text))

In [11]:
def column_cleaner(df, header):
    '''Build a cleaned and lemmatized copy of one text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to process. Every value in the column
        must be a string, because clean_text() calls ``.lower()`` on it.
    header : str
        Name of the text column.

    Returns
    -------
    pandas.DataFrame
        A new frame sharing ``df``'s index with three columns:
        ``header`` (output of clean_text), ``<header>_lemmatize`` (output of
        lemmatizer on the cleaned text) and ``<header>_clean`` (the lemmatized
        text with every literal '-PRON-' removed).
    '''
    lemma_col = '{}_lemmatize'.format(header)
    clean_col = '{}_clean'.format(header)

    df_clean = df[header].apply(clean_text).to_frame()
    # Apply on the column directly; a row-wise apply(axis=1) builds a Series
    # per row only to read a single field from it.
    df_clean[lemma_col] = df_clean[header].apply(lemmatizer)
    # regex=False: '-PRON-' is a literal marker, and the default value of
    # `regex` differs between pandas versions.
    df_clean[clean_col] = df_clean[lemma_col].str.replace('-PRON-', '', regex=False)
    return df_clean

In [12]:
def distributionOfCharLength(df_clean, header):
    '''Plot a 30-bin histogram of the character length of each entry in
    ``df_clean[header]``, labelling the title and axes with ``header``.'''
    lengths = list(map(len, df_clean[header]))

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(lengths, bins=30)
    ax.set_title('Distribution of {} character length'.format(header))
    ax.set_ylabel('Number of {}s'.format(header))
    ax.set_xlabel('{} length'.format(header))
    sns.despine()

In [13]:
def top_words_visual(df_clean, header):
    '''Draw a word cloud of the text stored in ``df_clean[header]``.

    Parameters
    ----------
    df_clean : pandas.DataFrame
        Frame holding the text column.
    header : str
        Name of the column whose values are combined into the cloud.

    Notes
    -----
    This function assigns to ``mpl.rcParams``, so the figure size, font size,
    savefig dpi and bottom margin set here stay in effect for later plots.
    '''
    mpl.rcParams['figure.figsize']=(12.0,12.0)
    mpl.rcParams['font.size']=12
    mpl.rcParams['savefig.dpi']=100
    mpl.rcParams['figure.subplot.bottom']=.1
    stopwords = set(STOPWORDS)

    # Join the actual cell values. str(Series) yields pandas' display repr
    # instead: index labels, a "Name: ..., dtype: ..." footer, and only the
    # head and tail rows once the Series is long enough to be truncated.
    corpus_text = ' '.join(df_clean[header].dropna().astype(str))

    wordcloud = WordCloud(
                          background_color='white',
                          stopwords=stopwords,
                          max_words=500,
                          max_font_size=40,
                          random_state=42
                         ).generate(corpus_text)

    print(wordcloud)
    plt.figure(1)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()

In [32]:
# Load the CustomerSuccess CSV; the path is relative to the notebook's working directory.
customer_success = pd.read_csv("Data/All/CustomerSuccess.csv")

In [35]:
# Frequency of each distinct value in the prev_title_1 column, most common first.
customer_success['prev_title_1'].value_counts()

Customer Success Manager                                                             16
Customer Success Advocate                                                             5
Customer Success                                                                      4
Customer Success Specialist                                                           3
Senior Customer Success Manager                                                       3
                                                                                     ..
Analytics Platform Services Reference Manager, IBM Analtyics, Cloud Data Services     1
Director Customer Experience                                                          1
Technical Support Engineer                                                            1
Client Relations Manager                                                              1
Customer Success Advocate - Cash App                                                  1
Name: prev_title_1, Length: 312,

In [37]:
# Clean and lemmatize the non-null prev_title_1 values with the shared
# column_cleaner() helper instead of repeating its body inline. The result has
# the columns prev_title_1, prev_title_1_lemmatize and prev_title_1_clean.
df_clean = column_cleaner(customer_success[customer_success['prev_title_1'].notna()], 'prev_title_1')


In [38]:
def get_top_n_words(corpus, n=None, ngram_range=(1, 1)):
    '''Return the most frequent n-grams in ``corpus`` with their total counts.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count terms in.
    n : int or None
        Number of (term, count) pairs to return; None returns all of them.
    ngram_range : tuple of (int, int)
        Passed to CountVectorizer. The default (1, 1) counts single words,
        which is what this function did before the parameter existed.

    Returns
    -------
    list of (str, count) tuples sorted by count, highest first. English stop
    words are excluded by the vectorizer.
    '''
    vec = CountVectorizer(ngram_range=ngram_range, stop_words='english')
    # One fit_transform pass instead of fit() followed by transform(),
    # which tokenises the corpus twice.
    bag_of_words = vec.fit_transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
# Bar chart of the 30 most frequent single words in the cleaned previous job titles.
common_words = get_top_n_words(df_clean['prev_title_1_clean'], 30)
df2 = pd.DataFrame(common_words, columns = ['unigram' , 'count'])

fig = go.Figure([go.Bar(x=df2['unigram'], y=df2['count'])])
# The plotted text is the prev_title_1 column, so the title names job titles.
fig.update_layout(title=go.layout.Title(text="Top 30 unigrams in previous job titles after removing stop words and lemmatization"))
fig.show()



In [39]:
def get_top_n_bigram(corpus, n=None):
    '''Return the ``n`` most frequent two-word sequences in ``corpus`` as
    (bigram, count) tuples, highest count first. English stop words are
    excluded by the vectorizer; ``n=None`` returns every bigram.'''
    counter = CountVectorizer(ngram_range=(2, 2), stop_words='english')
    counter.fit(corpus)
    totals = counter.transform(corpus).sum(axis=0)
    ranked = sorted(
        ((term, totals[0, col]) for term, col in counter.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]
# Bar chart of the 20 most frequent bigrams in the cleaned prev_title_1 text.
common_words = get_top_n_bigram(df_clean['prev_title_1_clean'], 20)
df3 = pd.DataFrame(common_words, columns=['bigram', 'count'])

fig = go.Figure(data=[go.Bar(x=df3['bigram'], y=df3['count'])])
fig.update_layout(
    title=go.layout.Title(text="Top 20 bigrams text after removing stop words and lemmatization"),
)
fig.show()

In [40]:
def get_top_n_trigram(corpus, n=None):
    '''Return the ``n`` most frequent three-word sequences in ``corpus`` as
    (trigram, count) tuples, highest count first. English stop words are
    excluded by the vectorizer; ``n=None`` returns every trigram.'''
    trigram_vec = CountVectorizer(ngram_range=(3, 3), stop_words='english').fit(corpus)
    column_totals = trigram_vec.transform(corpus).sum(axis=0)
    pairs = []
    for trigram, position in trigram_vec.vocabulary_.items():
        pairs.append((trigram, column_totals[0, position]))
    pairs.sort(key=lambda item: item[1], reverse=True)
    return pairs[:n]
# Bar chart of the 20 most frequent trigrams in the previous job titles.
# NOTE(review): this reads the un-lemmatized prev_title_1 column, whereas the
# unigram and bigram cells read prev_title_1_clean — confirm this is intended.
common_words = get_top_n_trigram(df_clean['prev_title_1'], 20)
df4 = pd.DataFrame(common_words, columns = ['trigram' , 'count'])

fig = go.Figure([go.Bar(x=df4['trigram'], y=df4['count'])])
# The plotted text is the prev_title_1 column, so the title names job titles;
# get_top_n_trigram drops English stop words.
fig.update_layout(title=go.layout.Title(text="Top 20 trigrams in previous job titles after removing stop words"))
fig.show()

## Description

In [41]:
# Display the cleaned frame. Indexing with an empty-string column name raises
# KeyError because df_clean has no such column.
df_clean

Unnamed: 0,prev_title_1,prev_title_1_lemmatize,prev_title_1_clean
0,customer engagement director,customer engagement director,customer engagement director
1,ibm senior project manager,ibm senior project manager,ibm senior project manager
2,tivoli software sales manager west region,tivoli software sale manager west region,tivoli software sale manager west region
3,manager watson expert cognitive architects,manager watson expert cognitive architect,manager watson expert cognitive architect
4,founder managing director,founder manage director,founder manage director
...,...,...,...
343,global head customer success,global head customer success,global head customer success
344,customer success manager global,customer success manager global,customer success manager global
345,head of partner success south india ubereats,head of partner success south india ubereats,head of partner success south india ubereats
346,specialist public and social sector practice,specialist public and social sector practice,specialist public and social sector practice


In [50]:
# Bag-of-words representation of the cleaned titles.
vectorizer = CountVectorizer(analyzer='word',
                             min_df=3,                         # drop tokens found in fewer than 3 titles
                             stop_words='english',
                             lowercase=True,
                             token_pattern='[a-zA-Z0-9]{3,}',  # alphanumeric tokens of 3+ characters
                             max_features=5000,
                            )

# Read from df_clean, which is built earlier in the notebook from the same
# non-null prev_title_1 values, so this cell also runs on a fresh kernel
# executed top to bottom (df_clean_1 is only created in a later cell).
data_vectorized = vectorizer.fit_transform(df_clean['prev_title_1'])

lda_model = LatentDirichletAllocation(n_components=10, # Number of topics
                                      learning_method='online',
                                      random_state=0,  # fixed seed for repeatable topics
                                      n_jobs = -1  # Use all available CPUs
                                     )
# Per-document topic distribution: one row per title, one column per topic.
lda_output = lda_model.fit_transform(data_vectorized)

# Interactive topic visualisation rendered inline in the notebook.
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')

In [None]:
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    '''Return the highest-weighted vocabulary terms for every LDA topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer exposing its vocabulary through
        ``get_feature_names_out()`` or the older ``get_feature_names()``.
        Defaults to the module-level ``vectorizer`` as it was when this
        function was defined.
    lda_model : fitted model with a ``components_`` array (one row of term
        weights per topic). Defaults to the module-level ``lda_model`` as it
        was when this function was defined.
    n_words : int
        Maximum number of terms to return per topic.

    Returns
    -------
    list of numpy arrays, one per topic, each holding that topic's terms
    ordered from highest to lowest weight.
    '''
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when present and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        keywords = np.asarray(vectorizer.get_feature_names_out())
    else:
        keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Negate so argsort's ascending order yields the largest weights first.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

# Top 20 keywords per topic, laid out as a table with one row per topic.
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20)

df_topic_keywords = pd.DataFrame(topic_keywords)
n_topics, n_keywords = df_topic_keywords.shape
df_topic_keywords.columns = ['Word {}'.format(rank) for rank in range(n_keywords)]
df_topic_keywords.index = ['Topic {}'.format(topic) for topic in range(n_topics)]
df_topic_keywords

In [8]:
# Load the SoftwareEngineer_Apple CSV (path relative to the working directory).
# NOTE(review): apple_sw is not referenced by any other cell visible here.
apple_sw = pd.read_csv('Data/SoftwareEngineer/SoftwareEngineer_Apple.csv')

In [47]:
# Alternative input, currently disabled: clean the 'description' column instead.
#df_clean_1 = column_cleaner(customer_success, 'description')
# Clean and lemmatize prev_title_1, keeping only rows where it is not null
# (clean_text calls .lower() on each value, so nulls must be filtered out first).
df_clean_1 = column_cleaner(customer_success[customer_success['prev_title_1'].notna()], 'prev_title_1')

In [None]:
# Rows at senior level with more than 6 years of prior experience and no master's degree.
# The comparisons must be made on the columns: 'level' == 'senior' compares two
# string literals (always False) and 'total_years_of_prior_exp' > 6 raises TypeError.
# NOTE(review): `df` is not defined in the visible cells — confirm which frame this filters.
df[(df['level'] == 'senior') & (df['total_years_of_prior_exp'] > 6) & df['has_masters'].eq(False)]