### Topic Modeling (LDA)

In [1]:
%matplotlib inline
import pickle
from pprint import pprint
import random
import warnings
import time

# numpy, pandas, matplotlib and regular expressions (data science essentials)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

# tqdm
from tqdm import tqdm

# spacy
import spacy
from spacy.lang.en import English
# import en_core_web_sm

# gensim
import gensim
from gensim import corpora
from gensim.models import CoherenceModel

# nltk
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import words
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem.porter import *
from nltk.stem import LancasterStemmer

# pyLDAvis
import pyLDAvis
import pyLDAvis.gensim

# styling
pd.set_option('display.max_columns',150)
plt.style.use('bmh')
from IPython.display import display

import warnings
warnings.filterwarnings('ignore')
import logging
logging.raiseExceptions = False

#### Read in the scraped data

In [15]:
# Load the scraped tweets and discard the stray index column that was written out with the CSV.
df = pd.read_csv("ceo_tweets_final.csv").drop(columns=['Unnamed: 0'])

In [16]:
## Keep only tweets dated 2017 or later (year > 2016)
df["date"] = pd.to_datetime(df['date'])
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the unfiltered frame.
df = df[df['date'].dt.year > 2016].copy()
# Reduce the timestamps to plain calendar dates.
df["date"] = df["date"].dt.date

In [17]:
warnings.simplefilter('ignore')

import nltk
from nltk.corpus import stopwords

# Fetch every NLTK resource this notebook reads, so a fresh environment can run it:
# the stop-word list (loaded just below) and WordNet (used by the lemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

# NLTK Stop words
stop_words = stopwords.words('english')

# Tokenizer model used by word_tokenize; kept last so the cell still displays its result.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sahana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Functions to remove stop words and punctuation, extract mentions and hashtags from tweets, and remove links and special characters

In [18]:
## Stop-word list: NLTK's English stop words plus the Twitter artefacts 'rt' and 'amp'
stop = stopwords.words('english') + ['rt', 'amp']

In [19]:
def get_mentions(tweet):
    '''
    Return the distinct @-mentions found in a tweet, without the leading "@".

    A mention is recognised only at the start of the text or after a character
    that is not a letter, digit, "-", "_" or "." (so e-mail addresses are not
    picked up). The name must start with a letter and be at least two
    characters long.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        Unique user names in order of first appearance.
    '''
    # Raw string: the pattern contains regex escapes ("\.") that are not valid
    # Python string escapes and trigger a warning in a normal string literal.
    result = re.findall(r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9_]+)", tweet)
    # dict.fromkeys de-duplicates while keeping a deterministic order
    # (a set would return the names in a run-dependent order).
    return list(dict.fromkeys(result))

In [20]:
def get_hashtags(tweet):
    '''
    Return the distinct #hashtags found in a tweet, without the leading "#".

    A hashtag is recognised only at the start of the text or after a character
    that is not a letter, digit, "-", "_" or ".". The tag must start with a
    letter and be at least two characters long.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        Unique hashtags in order of first appearance (case is preserved, so
        "WeAccept" and "weaccept" are distinct entries).
    '''
    # Raw string: the pattern contains regex escapes ("\.") that are not valid
    # Python string escapes and trigger a warning in a normal string literal.
    result = re.findall(r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))#([A-Za-z]+[A-Za-z0-9_]+)", tweet)
    # dict.fromkeys de-duplicates while keeping a deterministic order
    # (a set would return the tags in a run-dependent order).
    return list(dict.fromkeys(result))

In [21]:
def clean_tweet_split(tweet):
    '''
    Strip @-mentions, #hashtags and links from a tweet.

    The three removals run in that order; after each one, runs of whitespace
    are collapsed to a single space and leading/trailing whitespace is dropped.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The tweet text without mentions, hashtags or "scheme://..." links.
    '''
    # Raw strings: the patterns contain regex escapes (\., \w, \/, \S) that are
    # not valid Python string escapes and trigger a warning in normal literals.
    mention_pattern = r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9_]+)"
    hashtag_pattern = r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))#([A-Za-z]+[A-Za-z0-9_]+)"
    link_pattern = r"(\w+:\/\/\S+)"

    p = ' '.join(re.sub(mention_pattern, " ", tweet).split())
    s = ' '.join(re.sub(hashtag_pattern, " ", p).split())
    return ' '.join(re.sub(link_pattern, " ", s).split())

In [22]:
def remove_links(tweet):
    '''
    Remove "scheme://..." links from a tweet and normalise its whitespace.

    Mentions and hashtags are left in place.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The text without links, with runs of whitespace collapsed to single
        spaces and no leading/trailing whitespace.
    '''
    # Raw string: \w, \/ and \S are regex escapes, not valid Python string escapes.
    return ' '.join(re.sub(r"(\w+:\/\/\S+)", " ", tweet).split())

In [23]:
# Shared lemmatizer instance. The class is used through the name imported at the
# top of the notebook rather than through the incidental `nltk.wordnet` attribute.
lemma = WordNetLemmatizer()
def lemmatize(text):
    """Return the WordNet lemma of a single token.

    The token is passed to the shared lemmatizer with no part-of-speech
    argument, so the library's default part of speech applies.
    """
    return lemma.lemmatize(text)

#### Functions to get Bigram and corpus for LDA modeling

In [24]:
def bigrams(words, bi_min=15, tri_min=10):
    """
    Train a gensim bigram detector on tokenised documents.

    words   : iterable of token lists used to train the phrase model
    bi_min  : minimum count passed to gensim.models.Phrases as `min_count`
    tri_min : accepted but not used by this function

    Returns the Phraser built from the trained Phrases model.
    """
    phrase_model = gensim.models.Phrases(words, min_count=bi_min)
    return gensim.models.phrases.Phraser(phrase_model)

In [25]:
def get_corpus(df):
    """
    Build the LDA inputs from the `tweet_tokens_lem` column of `df`.

    Returns a tuple (corpus, id2word, bigram):
      corpus  - bag-of-words representation of every document
      id2word - gensim Dictionary, pruned with filter_extremes(no_below=10, no_above=0.35)
      bigram  - the documents after the bigram model has been applied
    """
    token_lists = df.tweet_tokens_lem

    # Train the phrase detector, then apply it to every document.
    phraser = bigrams(token_lists)
    bigram_docs = [phraser[tokens] for tokens in token_lists]

    # Vocabulary over the bigrammed documents, pruned and re-numbered.
    dictionary = gensim.corpora.Dictionary(bigram_docs)
    dictionary.filter_extremes(no_below=10, no_above=0.35)
    dictionary.compactify()

    bow_corpus = [dictionary.doc2bow(doc) for doc in bigram_docs]
    return bow_corpus, dictionary, bigram_docs

#### Tokenize and clean the tweets

In [26]:
# Distinct @mentions and #hashtags found in each raw tweet.
df["mentions"] = df["tweet"].apply(get_mentions)
df["tags"] = df["tweet"].apply(get_hashtags)

# Tweet text with mentions, hashtags and links stripped.
df["tweet_clean"] = df["tweet"].apply(clean_tweet_split)

# Set lookup: each stop-word test is O(1) instead of a linear scan of the list per token.
stop_lookup = set(stop)

# Lower-case, replace punctuation with spaces, tokenize, then drop stop words.
df["tweet_tokens"] = df["tweet_clean"].apply(lambda each_post: word_tokenize(re.sub(r'[^\w\s]',' ',each_post.lower())))
df["tweet_tokens"] = df["tweet_tokens"].apply(lambda list_of_words: [x for x in list_of_words if x not in stop_lookup])

# Lemmatize every remaining token.
df["tweet_tokens_lem"] = df["tweet_tokens"].apply(lambda list_of_words: [lemmatize(x) for x in list_of_words])

In [27]:
## Token lists that keep mention/hashtag text: only links, punctuation and stop words are removed.
## (The low-frequency filter itself is applied per user inside lda_analysis.)
stop_lookup = set(stop)  # O(1) membership tests instead of scanning the list for every token
df["tweet_new"] = df["tweet"].apply(remove_links)
df["tweet_new"] = df["tweet_new"].apply(lambda each_post: word_tokenize(re.sub(r'[^\w\s]',' ',each_post.lower())))
df["tweet_new"] = df["tweet_new"].apply(lambda list_of_words: [x for x in list_of_words if x not in stop_lookup])

### LDA Functions

In [28]:
def lda_analysis(df, username, num_topics, workers=7):
    """
    Fit an LDA topic model on one user's tweets.

    Parameters
    ----------
    df : DataFrame
        Must contain a 'username' column and a 'tweet_new' column of token lists.
    username : str
        Value of 'username' whose rows are modelled.
    num_topics : int
        Number of LDA topics.
    workers : int, default 7
        Worker processes handed to LdaMulticore (typically CPU cores - 1).

    Returns
    -------
    tuple
        (df_ceo, topics, train_vecs, num_topics) where df_ceo is the user's
        rows with filtered 'tweet_new' and recomputed 'tweet_tokens_lem',
        topics is the result of the model's print_topics(), and train_vecs
        holds one list of `num_topics` topic probabilities per row of df_ceo.

    Raises
    ------
    ValueError
        If `df` has no rows for `username`.

    Side effects
    ------------
    Configures logging to 'lda_model.log', saves the model to
    'lda_train.model' (overwritten on every call) and prints the c_v coherence.
    """
    # .copy() so the column assignments below write to an independent frame.
    df_ceo = df[df['username'] == username].copy()
    if df_ceo.empty:
        raise ValueError("no tweets found for username {!r}".format(username))

    # Token frequencies over all of this user's tweets. Iterating the token lists
    # avoids the quadratic list concatenation of Series.sum().
    freq_dist = nltk.FreqDist(
        word for tokens in df_ceo['tweet_new'] for word in tokens
    )
    # Keep only words seen more than once; a set gives O(1) membership tests.
    relevant_words = {term for term, freq in freq_dist.items() if freq > 1}

    df_ceo["tweet_new"] = df_ceo["tweet_new"].apply(
        lambda list_of_words: [x for x in list_of_words if x in relevant_words])
    df_ceo["tweet_tokens_lem"] = df_ceo["tweet_new"].apply(
        lambda list_of_words: [lemmatize(x) for x in list_of_words])

    train_corpus, train_id2word, bigram_train = get_corpus(df_ceo)

    # basicConfig is a no-op when the root logger is already configured,
    # so repeated calls are harmless.
    logging.basicConfig(filename='lda_model.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        lda_train = gensim.models.ldamulticore.LdaMulticore(
                               corpus=train_corpus,
                               num_topics=num_topics,
                               id2word=train_id2word,
                               chunksize=100,
                               workers=workers,
                               passes=50,
                               eval_every = 1,
                               per_word_topics=True,
                               random_state=11)
        lda_train.save('lda_train.model')

    coherence_model_lda = CoherenceModel(model=lda_train, texts=bigram_train, dictionary=train_id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print("The coherence of the LDA model is",coherence_lda)

    train_vecs = []
    for bow in train_corpus:
        # Look weights up by topic id instead of by list position: gensim can omit
        # topics whose probability falls below its internal floor, which would
        # shift (or shorten) a purely positional read.
        doc_topics = dict(lda_train.get_document_topics(bow, minimum_probability=0.0))
        train_vecs.append([doc_topics.get(topic_id, 0.0) for topic_id in range(num_topics)])

    return df_ceo, lda_train.print_topics(), train_vecs, num_topics

In [29]:
def get_max_topics(values):
    """Return the dominant weight(s) of one topic-weight list.

    When every entry of `values` is identical the list itself is returned
    (all topics tie); otherwise a one-element list holding the maximum.
    """
    if len(set(values)) == 1:
        return values
    return [max(values)]

def assign_topics(col1, col2):
    """Return 1 when `col1` is contained in `col2`, otherwise 0."""
    return int(col1 in col2)

#### Tim Cook - Apple

In [30]:
# Fit an 8-topic LDA model on the rows whose username is '@tim_cook'.
df_ceo, lda_results, train_vecs, num_topics = lda_analysis(df, '@tim_cook', 8)
# Display the topic listing returned by lda_analysis.
lda_results

The coherence of the LDA model is 0.3803925329617725


[(0,
  '0.103*"year" + 0.076*"see" + 0.072*"developer" + 0.058*"world" + 0.054*"time" + 0.045*"week" + 0.038*"ago" + 0.036*"app" + 0.035*"powerful" + 0.034*"today"'),
 (1,
  '0.156*"heart" + 0.085*"community" + 0.079*"family" + 0.078*"one" + 0.046*"today" + 0.046*"victim" + 0.046*"affected" + 0.043*"pro" + 0.042*"violence" + 0.041*"ipad"'),
 (2,
  '0.180*"u" + 0.060*"make" + 0.059*"every" + 0.053*"celebrate" + 0.050*"life" + 0.049*"congratulation" + 0.044*"let" + 0.040*"day" + 0.037*"today" + 0.036*"people"'),
 (3,
  '0.170*"thank" + 0.148*"work" + 0.116*"proud" + 0.090*"team" + 0.072*"great" + 0.050*"friend" + 0.041*"visit" + 0.038*"back" + 0.037*"th" + 0.034*"help"'),
 (4,
  '0.103*"woman" + 0.080*"apple" + 0.075*"never" + 0.065*"story" + 0.064*"country" + 0.062*"right" + 0.058*"men" + 0.058*"enjoy" + 0.057*"like" + 0.054*"place"'),
 (5,
  '0.208*"thanks" + 0.123*"iphone" + 0.102*"new" + 0.076*"love" + 0.058*"student" + 0.047*"forward" + 0.043*"thing" + 0.041*"shotoniphone" + 0.033*"

In [31]:
# One row per tweet, one column per LDA topic; the labels are hand-assigned to topic ids 0-7 in order.
train_vec_df = pd.DataFrame(train_vecs).set_axis(
    ['Technology','Social','People','Gratitude','Women Appreciation','Product','Store Launch','Emotion'],
    axis=1,
)

In [32]:
# Place the tweets and their topic weights side by side; both indexes are reset so rows pair up by position.
tweets_part = df_ceo.reset_index(drop=True)
weights_part = train_vec_df.reset_index(drop=True)
df_tim_cook = pd.concat([tweets_part, weights_part], axis=1)

In [33]:
tim_cook_topics = ['Technology','Social','People','Gratitude','Women Appreciation','Product','Store Launch','Emotion']
# Full weight vector per tweet, then the dominant weight(s) picked by get_max_topics.
df_tim_cook['all_topics'] = df_tim_cook[tim_cook_topics].values.tolist()
df_tim_cook['max_topics'] = df_tim_cook['all_topics'].apply(get_max_topics)

In [34]:
# Replace each topic's weight with a 0/1 flag: 1 when that weight is among the tweet's max_topics.
for topic in ['Technology','Social','People','Gratitude','Women Appreciation','Product','Store Launch','Emotion']:
    df_tim_cook[topic] = df_tim_cook.apply(
        lambda x, topic=topic: assign_topics(x[topic], x['max_topics']), axis=1)

In [36]:
# Column-wise sum of the topic columns (despite the variable name this is a total, not an average).
tim_cook_topic_columns = ['Technology','Social','People','Gratitude','Women Appreciation','Product','Store Launch','Emotion']
average_topic_weights = df_tim_cook.loc[:, tim_cook_topic_columns].sum(axis=0)
average_topic_weights

Technology            117
Social                 95
People                113
Gratitude             102
Women Appreciation     86
Product                93
Store Launch          104
Emotion               124
dtype: int64

In [37]:
# Two-column frame: 'index' holds the topic label, column 0 holds its total.
ceo_topics = pd.DataFrame(average_topic_weights).reset_index()
ceo_topics

Unnamed: 0,index,0
0,Technology,117
1,Social,95
2,People,113
3,Gratitude,102
4,Women Appreciation,86
5,Product,93
6,Store Launch,104
7,Emotion,124


In [38]:
# Relabel the 'Product' and 'Store Launch' topics as one "Products and Services" category.
product_keywords = ["product","store launch"]
ceo_topics['index'] = ceo_topics['index'].apply(
    lambda x: "Products and Services" if any(y in x.lower() for y in product_keywords) else x)

In [39]:
import os

import chart_studio.tools
import chart_studio.plotly as py

# Chart Studio credentials are read from the environment so that no secret is stored
# in the notebook; a missing variable raises KeyError before anything is uploaded.
# NOTE: the API key that used to be written in this cell has been exposed and should be regenerated.
chart_studio.tools.set_credentials_file(
    username=os.environ['PLOTLY_USERNAME'],
    api_key=os.environ['PLOTLY_API_KEY'],
)

import plotly.graph_objs as go
from palettable.colorbrewer.diverging import *
from palettable.cmocean.sequential import Ice_10
colors = Ice_10.hex_colors

# One slice per topic label, sized by its total.
topics_pie = go.Pie(labels=ceo_topics["index"], values=ceo_topics[0], marker=dict(colors=colors
                                                            , line=dict(color='#FFF', width=2)),
                                                            domain={'x': [0.0, .4], 'y': [0.0, 1]}
                                                            , showlegend=False, textinfo='label+percent')

layout = go.Layout(height = 600,
                   width = 1000,
                   autosize = False,
                   title = 'Topic Distribution for Tim Cook')
fig = go.Figure(data = topics_pie, layout = layout)

# A filename unique to this chart: uploading every chart under one shared name
# makes each upload overwrite the previous one.
py.iplot(fig, filename='topic_pie_tim_cook')

#### Hashtag analysis

In [43]:
# Expand every tweet's hashtag list so that each hashtag occupies its own row.
hashtags = df_tim_cook['tags'].apply(pd.Series).stack()

hashtags_df = hashtags.to_frame(name='hashtags')

In [44]:
# Occurrences per hashtag. The columns are named explicitly ('index' = hashtag,
# 'hashtags' = count) because the default names produced by
# value_counts().reset_index() differ between pandas versions, and the next
# cells look the hashtag column up as 'index'.
hashtags_df = hashtags_df['hashtags'].value_counts().rename_axis('index').reset_index(name='hashtags')

In [45]:
# Bucket hashtags by substring match on the lower-cased tag: holiday-like tags first, then Apple product tags.
holiday_keywords = ["easter","day","diwali","july","month","year","thanksgiving","week"]
# NOTE(review): "potrait" looks like a misspelling of "portrait" - confirm which spelling the hashtags use.
apple_keywords = ["apple","airpod","iphone","ipad","potrait"]

hashtags_df['index'] = hashtags_df['index'].apply(
    lambda x: "Holiday Celebration" if any(y in x.lower() for y in holiday_keywords) else x)
hashtags_df['index'] = hashtags_df['index'].apply(
    lambda x: "Apple" if any(y in x.lower() for y in apple_keywords) else x)

In [46]:
# Number of rows carrying each label after the bucketing above.
# NOTE(review): this counts rows (distinct hashtags) per label and drops the per-hashtag
# occurrence counts computed earlier - confirm that is the intended measure.
label_counts = hashtags_df["index"].value_counts()
hashtags_df = pd.DataFrame(label_counts).reset_index()
hashtags_df.columns = ["hashtags","count"]

In [47]:
# Percentage of the total count that falls under 'Holiday Celebration'.
holiday_count = hashtags_df.loc[hashtags_df["hashtags"] == 'Holiday Celebration', "count"].sum()
holiday_count / hashtags_df["count"].sum() * 100

38.613861386138616

In [48]:
# Percentage of the total count that falls under 'Apple'.
apple_count = hashtags_df.loc[hashtags_df["hashtags"] == 'Apple', "count"].sum()
apple_count / hashtags_df["count"].sum() * 100

11.881188118811881

#### Bill Gates - Bill & Melinda Gates Foundation

In [51]:
# Fit a 6-topic LDA model on the rows whose username is '@BillGates'.
df_ceo, lda_results, train_vecs, num_topics = lda_analysis(df, '@BillGates', 6)
# Display the topic listing returned by lda_analysis.
lda_results

The coherence of the LDA model is 0.38945760812188884


[(0,
  '0.157*"one" + 0.077*"book" + 0.054*"year" + 0.053*"lot" + 0.029*"recently" + 0.029*"read" + 0.027*"favorite" + 0.025*"people" + 0.023*"great" + 0.023*"never"'),
 (1,
  '0.058*"need" + 0.047*"vaccine" + 0.035*"alzheimer" + 0.033*"excited" + 0.033*"ever" + 0.033*"world" + 0.031*"new" + 0.030*"disease" + 0.030*"government" + 0.028*"look"'),
 (2,
  '0.057*"melinda" + 0.048*"life" + 0.047*"work" + 0.047*"great" + 0.034*"new" + 0.034*"best" + 0.029*"day" + 0.029*"student" + 0.027*"learn" + 0.026*"every"'),
 (3,
  '0.050*"like" + 0.049*"time" + 0.041*"warrenbuffett" + 0.040*"melindagates" + 0.033*"future" + 0.030*"know" + 0.028*"new" + 0.028*"think" + 0.028*"india" + 0.027*"give"'),
 (4,
  '0.103*"world" + 0.072*"progress" + 0.047*"see" + 0.046*"health" + 0.035*"making" + 0.033*"child" + 0.033*"incredible" + 0.032*"global" + 0.030*"u" + 0.026*"life"'),
 (5,
  '0.097*"people" + 0.057*"help" + 0.045*"today" + 0.039*"world" + 0.038*"energy" + 0.031*"thing" + 0.030*"make" + 0.030*"get" + 

In [52]:
# One row per tweet, one column per LDA topic; the labels are hand-assigned to topic ids 0-5 in order.
train_vec_df = pd.DataFrame(train_vecs).set_axis(
    ['Book Recommendations','Diseases/Vaccines','Education','Warren Buffet','General World Issues','Renewable Energy'],
    axis=1,
)

In [53]:
# Place the tweets and their topic weights side by side; both indexes are reset so rows pair up by position.
df_bill_gates = pd.concat(
    [df_ceo.reset_index(drop=True), train_vec_df.reset_index(drop=True)],
    axis=1,
)

bill_gates_topics = ['Book Recommendations','Diseases/Vaccines','Education','Warren Buffet','General World Issues','Renewable Energy']
# Full weight vector per tweet, then the dominant weight(s) picked by get_max_topics.
df_bill_gates['all_topics'] = df_bill_gates[bill_gates_topics].values.tolist()
df_bill_gates['max_topics'] = df_bill_gates['all_topics'].apply(get_max_topics)

# Replace each topic's weight with a 0/1 flag: 1 when that weight is among the tweet's max_topics.
for topic in bill_gates_topics:
    df_bill_gates[topic] = df_bill_gates.apply(
        lambda x, topic=topic: assign_topics(x[topic], x['max_topics']), axis=1)

In [54]:
# Column-wise sum of the topic columns (a total, despite the variable name), kept as a one-column frame.
bill_gates_topic_columns = ['Book Recommendations','Diseases/Vaccines','Education','Warren Buffet','General World Issues','Renewable Energy']
average_topic_weights = df_bill_gates.loc[:, bill_gates_topic_columns].sum(axis=0)
ceo_topics = average_topic_weights.to_frame()

In [55]:
import plotly.graph_objs as go
from palettable.colorbrewer.diverging import *

from palettable.cmocean.sequential import Ice_10
colors = Ice_10.hex_colors

# One slice per topic label (the frame's index), sized by its total.
slice_marker = dict(colors=colors, line=dict(color='#FFF', width=2))
topics_pie = go.Pie(
    labels=ceo_topics.index,
    values=ceo_topics[0],
    marker=slice_marker,
    domain={'x': [0.0, .4], 'y': [0.0, 1]},
    showlegend=False,
    textinfo='label+percent',
)

layout = go.Layout(height=600, width=1000, autosize=False,
                   title='Topic Distribution for Bill Gates')
fig = go.Figure(data=topics_pie, layout=layout)

# A filename unique to this chart: uploading every chart under one shared name
# makes each upload overwrite the previous one.
py.iplot(fig, filename='topic_pie_bill_gates')

#### Elon Musk - Tesla

In [56]:
# Fit a 15-topic LDA model on the rows whose username is '@elonmusk'.
df_ceo, lda_results, train_vecs, num_topics = lda_analysis(df, '@elonmusk', 15)

The coherence of the LDA model is 0.49431548649757723


In [57]:
# Display the topic listing returned by lda_analysis.
lda_results

[(0,
  '0.229*"erdayastronaut" + 0.064*"martinengwicht" + 0.054*"spacex" + 0.054*"john_gardi" + 0.050*"engine" + 0.037*"13ericralph31" + 0.033*"raptor" + 0.025*"janeidyeve" + 0.024*"djsnm" + 0.020*"spexcast"'),
 (1,
  '0.115*"tesla" + 0.091*"great" + 0.068*"sure" + 0.061*"work" + 0.053*"mode" + 0.044*"robotbeat" + 0.042*"team" + 0.038*"need" + 0.029*"safety" + 0.024*"engineering"'),
 (2,
  '0.185*"spacex" + 0.082*"launch" + 0.066*"starship" + 0.059*"first" + 0.044*"test" + 0.036*"flight" + 0.028*"orbit" + 0.025*"falcon_9" + 0.021*"texas" + 0.020*"fire"'),
 (3,
  '0.111*"like" + 0.078*"true" + 0.062*"tunnel" + 0.062*"would" + 0.058*"teslaownerssv" + 0.049*"look" + 0.043*"speed" + 0.035*"well" + 0.035*"steel" + 0.031*"people"'),
 (4,
  '0.188*"flcnhvy" + 0.108*"good" + 0.068*"erdayastronaut" + 0.051*"make" + 0.049*"worldandscience" + 0.035*"use" + 0.035*"000" + 0.031*"keego73" + 0.025*"since" + 0.025*"harrystoltz1"'),
 (5,
  '0.222*"tesla" + 0.064*"cleantechnica" + 0.043*"model" + 0.038*

In [58]:
# One row per tweet, one column per LDA topic; the labels are hand-assigned to topic ids 0-14 in order.
# Numbered labels (tesla1, spacex2, ...) and 'clean' mark topics that are merged into one theme before plotting.
train_vec_df = pd.DataFrame(train_vecs).set_axis(
    ['spacex1','Tesla','spacex2','Boring Company','tesla1','Clean Energy Initiatives','tesla2','Social','spacex3','tesla3','tesla4','tesla5','tesla6','clean','Upcoming Events'],
    axis=1,
)

In [59]:
# Place the tweets and their topic weights side by side; both indexes are reset so rows pair up by position.
tweets_part = df_ceo.reset_index(drop=True)
weights_part = train_vec_df.reset_index(drop=True)
df_elon_musk = pd.concat([tweets_part, weights_part], axis=1)

In [60]:
elon_musk_topics = ['spacex1','Tesla','spacex2','Boring Company','tesla1','Clean Energy Initiatives','tesla2','Social','spacex3','tesla3','tesla4','tesla5','tesla6','clean','Upcoming Events']
# Full weight vector per tweet, then the dominant weight(s) picked by get_max_topics.
df_elon_musk['all_topics'] = df_elon_musk[elon_musk_topics].values.tolist()
df_elon_musk['max_topics'] = df_elon_musk['all_topics'].apply(get_max_topics)

In [61]:
# Replace each topic's weight with a 0/1 flag: 1 when that weight is among the tweet's max_topics.
for topic in ['spacex1','Tesla','spacex2','Boring Company','tesla1','Clean Energy Initiatives','tesla2','Social','spacex3','tesla3','tesla4','tesla5','tesla6','clean','Upcoming Events']:
    df_elon_musk[topic] = df_elon_musk.apply(
        lambda x, topic=topic: assign_topics(x[topic], x['max_topics']), axis=1)

In [62]:
# Column-wise sum of the topic columns (a total, despite the variable name).
elon_musk_topic_columns = ['spacex1','Tesla','spacex2','Boring Company','tesla1','Clean Energy Initiatives','tesla2','Social','spacex3','tesla3','tesla4','tesla5','tesla6','clean','Upcoming Events']
average_topic_weights = df_elon_musk.loc[:, elon_musk_topic_columns].sum(axis=0)

In [63]:
# Two-column frame: 'index' holds the topic label, column 0 holds its total.
ceo_topics = pd.DataFrame(average_topic_weights).reset_index()

In [64]:
# Collapse the numbered helper labels into one pie label per theme (substring match on the lower-cased label).
ceo_topics['index'] = ceo_topics['index'].apply(lambda x:"Tesla" if any(y in x.lower() for y in ["tesla1","tesla2","tesla3","tesla4","tesla5","tesla6"]) else x)
ceo_topics['index'] = ceo_topics['index'].apply(lambda x:"SpaceX" if any(y in x.lower() for y in ["spacex1","spacex2","spacex3"]) else x)
# The 'clean' topic belongs with 'Clean Energy Initiatives'; labelling it "SpaceX"
# would move both clean-energy topics into the SpaceX slice.
ceo_topics['index'] = ceo_topics['index'].apply(lambda x:"Clean Energy Initiatives" if any(y in x.lower() for y in ["clean"]) else x)

In [65]:
import plotly.graph_objs as go
from palettable.colorbrewer.diverging import *

from palettable.cmocean.sequential import Ice_10
colors = Ice_10.hex_colors

# One slice per topic label, sized by its total.
slice_marker = dict(colors=colors, line=dict(color='#FFF', width=2))
topics_pie = go.Pie(
    labels=ceo_topics["index"],
    values=ceo_topics[0],
    marker=slice_marker,
    domain={'x': [0.0, .4], 'y': [0.0, 1]},
    showlegend=False,
    textinfo='label+percent',
)

layout = go.Layout(height=600, width=1000, autosize=False,
                   title='Topic Distribution for Elon Musk')
fig = go.Figure(data=topics_pie, layout=layout)

# A filename unique to this chart: uploading every chart under one shared name
# makes each upload overwrite the previous one.
py.iplot(fig, filename='topic_pie_elon_musk')

#### Richard Branson - Virgin

In [73]:
# Fit a 7-topic LDA model on the rows whose username is '@richardbranson'.
df_ceo, lda_results, train_vecs, num_topics = lda_analysis(df, '@richardbranson', 7)
# Display the topic listing returned by lda_analysis.
lda_results

The coherence of the LDA model is 0.3823749189697216


[(0,
  '0.057*"first" + 0.046*"great" + 0.033*"time" + 0.029*"virginatlantic" + 0.023*"wonderful" + 0.019*"air" + 0.018*"hollybranson" + 0.018*"day" + 0.017*"back" + 0.015*"ever"'),
 (1,
  '0.059*"virginvoyages" + 0.034*"year" + 0.028*"one" + 0.022*"travel" + 0.020*"new" + 0.019*"take" + 0.019*"spaceshiptwo" + 0.015*"world" + 0.014*"place" + 0.013*"million"'),
 (2,
  '0.032*"love" + 0.029*"new" + 0.024*"need" + 0.023*"always" + 0.020*"show" + 0.018*"people" + 0.017*"would" + 0.017*"business" + 0.017*"challenge" + 0.016*"thought"'),
 (3,
  '0.033*"business" + 0.031*"people" + 0.018*"one" + 0.017*"idea" + 0.017*"oceanunite" + 0.016*"help" + 0.016*"world" + 0.016*"great" + 0.015*"entrepreneur" + 0.015*"way"'),
 (4,
  '0.050*"thanks" + 0.037*"virgin" + 0.034*"work" + 0.028*"really" + 0.021*"many" + 0.018*"u" + 0.017*"wonderful" + 0.017*"hope" + 0.016*"best" + 0.016*"people"'),
 (5,
  '0.065*"team" + 0.047*"virginfamily" + 0.035*"see" + 0.034*"virgingalactic" + 0.032*"much" + 0.022*"virgin"

In [74]:
# One row per tweet, one column per LDA topic; the labels are hand-assigned to topic ids 0-6 in order.
train_vec_df = pd.DataFrame(train_vecs).set_axis(
    ['Gratitude','Virgin Voyages','Employee Appreciation','Social','gratitude2','Team','Virgin Galactic'],
    axis=1,
)

In [75]:
# Place the tweets and their topic weights side by side; both indexes are reset so rows pair up by position.
df_richard_bran = pd.concat(
    [df_ceo.reset_index(drop=True), train_vec_df.reset_index(drop=True)],
    axis=1,
)

richard_branson_topics = ['Gratitude','Virgin Voyages','Employee Appreciation','Social','gratitude2','Team','Virgin Galactic']
# Full weight vector per tweet, then the dominant weight(s) picked by get_max_topics.
df_richard_bran['all_topics'] = df_richard_bran[richard_branson_topics].values.tolist()
df_richard_bran['max_topics'] = df_richard_bran['all_topics'].apply(get_max_topics)

# Replace each topic's weight with a 0/1 flag: 1 when that weight is among the tweet's max_topics.
for topic in richard_branson_topics:
    df_richard_bran[topic] = df_richard_bran.apply(
        lambda x, topic=topic: assign_topics(x[topic], x['max_topics']), axis=1)

In [76]:
# Column-wise sum of the topic columns (a total, despite the variable name).
richard_branson_topic_columns = ['Gratitude','Virgin Voyages','Employee Appreciation','Social','gratitude2','Team','Virgin Galactic']
average_topic_weights = df_richard_bran.loc[:, richard_branson_topic_columns].sum(axis=0)

In [77]:
# Two-column frame: 'index' holds the topic label, column 0 holds its total.
ceo_topics = pd.DataFrame(average_topic_weights).reset_index()

In [78]:
# Merge duplicate themes by substring match on the lower-cased label:
# 'gratitude2' joins 'Gratitude', and 'Employee Appreciation' joins 'Team'.
gratitude_keywords = ["gratitude","gratitude2"]
team_keywords = ["team","employee appreciation"]

ceo_topics['index'] = ceo_topics['index'].apply(
    lambda x: "Gratitude" if any(y in x.lower() for y in gratitude_keywords) else x)
ceo_topics['index'] = ceo_topics['index'].apply(
    lambda x: "Team" if any(y in x.lower() for y in team_keywords) else x)

In [80]:
import plotly.graph_objs as go
from palettable.colorbrewer.diverging import *

from palettable.cmocean.sequential import Ice_10
colors = Ice_10.hex_colors

# One slice per topic label, sized by its total.
slice_marker = dict(colors=colors, line=dict(color='#FFF', width=2))
topics_pie = go.Pie(
    labels=ceo_topics["index"],
    values=ceo_topics[0],
    marker=slice_marker,
    domain={'x': [0.0, .4], 'y': [0.0, 1]},
    showlegend=False,
    textinfo='label+percent',
)

layout = go.Layout(height=600, width=1000, autosize=False,
                   title='Topic Distribution for Richard Branson')
fig = go.Figure(data=topics_pie, layout=layout)

# A filename unique to this chart: uploading every chart under one shared name
# makes each upload overwrite the previous one.
py.iplot(fig, filename='topic_pie_richard_branson')

#### Brian Chesky - Airbnb

In [81]:
# Fit a 4-topic LDA model on the rows whose username is '@bchesky'.
df_ceo, lda_results, train_vecs, num_topics = lda_analysis(df, '@bchesky', 4)
# Display the topic listing returned by lda_analysis.
lda_results

The coherence of the LDA model is 0.4528958594505259


[(0,
  '0.263*"airbnb" + 0.076*"new" + 0.058*"today" + 0.043*"idea" + 0.042*"first" + 0.037*"also" + 0.032*"nathanblec" + 0.032*"yes" + 0.031*"thing" + 0.030*"excited"'),
 (1,
  '0.155*"home" + 0.097*"airbnb" + 0.077*"one" + 0.076*"thank" + 0.071*"people" + 0.068*"need" + 0.047*"housing" + 0.046*"year" + 0.044*"last" + 0.040*"benedictevans"'),
 (2,
  '0.149*"host" + 0.142*"airbnb" + 0.080*"experience" + 0.069*"000" + 0.059*"guest" + 0.052*"u" + 0.041*"city" + 0.039*"many" + 0.036*"1" + 0.035*"year"'),
 (3,
  '0.124*"thanks" + 0.080*"airbnb" + 0.075*"want" + 0.069*"like" + 0.062*"would" + 0.059*"team" + 0.047*"2" + 0.047*"time" + 0.042*"community" + 0.041*"part"')]

In [82]:
# One row per tweet, one column per LDA topic; the labels are hand-assigned to topic ids 0-3 in order.
train_vec_df = pd.DataFrame(train_vecs).set_axis(
    ['Products','Services','Social','Appreciation'],
    axis=1,
)

In [83]:
# Place the tweets and their topic weights side by side; both indexes are reset so rows pair up by position.
df_bchesky = pd.concat(
    [df_ceo.reset_index(drop=True), train_vec_df.reset_index(drop=True)],
    axis=1,
)

bchesky_topics = ['Products','Services','Social','Appreciation']
# Full weight vector per tweet, then the dominant weight(s) picked by get_max_topics.
df_bchesky['all_topics'] = df_bchesky[bchesky_topics].values.tolist()
df_bchesky['max_topics'] = df_bchesky['all_topics'].apply(get_max_topics)

# Replace each topic's weight with a 0/1 flag: 1 when that weight is among the tweet's max_topics.
for topic in bchesky_topics:
    df_bchesky[topic] = df_bchesky.apply(
        lambda x, topic=topic: assign_topics(x[topic], x['max_topics']), axis=1)

In [84]:
# Column-wise sum of the topic columns (a total, despite the variable name), kept as a one-column frame.
bchesky_topic_columns = ['Products','Services','Social','Appreciation']
average_topic_weights = df_bchesky.loc[:, bchesky_topic_columns].sum(axis=0)
ceo_topics = average_topic_weights.to_frame()

#### Hashtags and Mentions

In [85]:
# Expand every tweet's hashtag list so that each hashtag occupies its own row.
hashtags = df_bchesky['tags'].apply(pd.Series).stack()

hashtags_df = hashtags.to_frame(name='hashtags')

In [86]:
# Five most frequent hashtags; counting is case-sensitive, so differently-cased spellings are separate rows.
hashtags_df['hashtags'].value_counts().to_frame().head()

Unnamed: 0,hashtags
weaccept,5
Airbnb,4
HurricaneIrma,4
WeAccept,3
DACA,3


In [87]:
# Expand every tweet's mention list so that each mentioned account occupies its own row.
mentions = df_bchesky['mentions'].apply(pd.Series).stack()

mentions_df = mentions.to_frame(name='mentions')

In [88]:
# Occurrences per mentioned account. The columns are named explicitly
# ('index' = account, 'mentions' = count) because the default names produced by
# value_counts().reset_index() differ between pandas versions, and the next
# cell looks the account column up as 'index'.
mentions_df = mentions_df['mentions'].value_counts().rename_axis('index').reset_index(name='mentions')

In [89]:
# Fold every account whose name contains "airbnb" (case-insensitive) into the single label "Airbnb".
mentions_df['index'] = mentions_df['index'].apply(
    lambda x: "Airbnb" if "airbnb" in x.lower() else x)

In [90]:
# Rename the two columns: first the (relabelled) account name, second its occurrence count.
mentions_df.columns= ["mentions","count"]

In [91]:
# Preview the first rows of the mention counts.
mentions_df.head()

Unnamed: 0,mentions,count
0,Airbnb,81
1,jgebbia,18
2,bchesky,11
3,nathanblec,10
4,BenedictEvans,10


In [92]:
# Percentage of all counted mentions that carry the 'Airbnb' label.
airbnb_count = mentions_df.loc[mentions_df["mentions"] == 'Airbnb', "count"].sum()
airbnb_count / mentions_df["count"].sum() * 100

15.350223546944857

#### Marc Benioff - Salesforce

In [97]:
# Fit a 6-topic LDA model on the rows whose username is '@Benioff'.
df_ceo, lda_results, train_vecs, num_topics = lda_analysis(df, '@Benioff', 6)

The coherence of the LDA model is 0.3379418862473504


In [98]:
# Display the topic listing returned by lda_analysis.
lda_results

[(0,
  '0.083*"benioff" + 0.054*"salesforce" + 0.047*"trailblazer" + 0.025*"trailblazerbook" + 0.020*"book" + 0.019*"1" + 0.019*"value" + 0.018*"business" + 0.017*"ucsf" + 0.016*"dreamforce"'),
 (1,
  '0.134*"time" + 0.024*"time_100" + 0.023*"time100" + 0.023*"cover" + 0.023*"summit" + 0.022*"one" + 0.021*"live" + 0.020*"new" + 0.017*"people" + 0.013*"world"'),
 (2,
  '0.070*"salesforce" + 0.031*"trailhead" + 0.029*"homelessness" + 0.020*"df19" + 0.015*"future" + 0.015*"know" + 0.014*"proud" + 0.014*"mkushel" + 0.014*"part" + 0.013*"dreamforce"'),
 (3,
  '0.058*"benioff" + 0.031*"amazing" + 0.030*"san_francisco" + 0.021*"thank" + 0.017*"yoshikiofficial" + 0.017*"great" + 0.016*"friend" + 0.015*"people" + 0.014*"salesforce_tower" + 0.014*"always"'),
 (4,
  '0.042*"salesforce" + 0.031*"new" + 0.023*"need" + 0.021*"benioff" + 0.017*"million" + 0.016*"school" + 0.015*"go" + 0.013*"facebook" + 0.010*"today" + 0.010*"2"'),
 (5,
  '0.032*"ocean" + 0.030*"u" + 0.026*"world" + 0.024*"make" + 0.

In [99]:
# One row per tweet, one column per LDA topic; the labels are hand-assigned to topic ids 0-5 in order.
train_vec_df = pd.DataFrame(train_vecs).set_axis(
    ['Author','Time','Company Roadmap','Social','Education','Ocean Conservation'],
    axis=1,
)

In [100]:
# Place the tweets and their topic weights side by side; both indexes are reset so rows pair up by position.
df_benioff = pd.concat(
    [df_ceo.reset_index(drop=True), train_vec_df.reset_index(drop=True)],
    axis=1,
)

benioff_topics = ['Author','Time','Company Roadmap','Social','Education','Ocean Conservation']
# Full weight vector per tweet, then the dominant weight(s) picked by get_max_topics.
df_benioff['all_topics'] = df_benioff[benioff_topics].values.tolist()
df_benioff['max_topics'] = df_benioff['all_topics'].apply(get_max_topics)

In [101]:
# Replace each topic's weight with a 0/1 flag: 1 when that weight is among the tweet's max_topics.
for topic in ['Author','Time','Company Roadmap','Social','Education','Ocean Conservation']:
    df_benioff[topic] = df_benioff.apply(
        lambda x, topic=topic: assign_topics(x[topic], x['max_topics']), axis=1)

In [102]:
# Column-wise sum of the topic columns (a total, despite the variable name), kept as a one-column frame.
benioff_topic_columns = ['Author','Time','Company Roadmap','Social','Education','Ocean Conservation']
average_topic_weights = df_benioff.loc[:, benioff_topic_columns].sum(axis=0)
ceo_topics = average_topic_weights.to_frame()

In [105]:
import plotly.graph_objs as go
from palettable.colorbrewer.diverging import *

from palettable.cmocean.sequential import Ice_10
colors = Ice_10.hex_colors

# One slice per topic label (the frame's index), sized by its total.
slice_marker = dict(colors=colors, line=dict(color='#FFF', width=2))
topics_pie = go.Pie(
    labels=ceo_topics.index,
    values=ceo_topics[0],
    marker=slice_marker,
    domain={'x': [0.0, .4], 'y': [0.0, 1]},
    showlegend=False,
    textinfo='label+percent',
)

layout = go.Layout(height=600, width=1000, autosize=False,
                   title='Topic Distribution for Benioff')
fig = go.Figure(data=topics_pie, layout=layout)

# A filename unique to this chart: uploading every chart under one shared name
# makes each upload overwrite the previous one.
py.iplot(fig, filename='topic_pie_benioff')

#### Bill Gross - IdeaLab

In [106]:
# Run the notebook's lda_analysis helper for the handle '@Bill_Gross'.
# The last argument (6) matches the number of topics listed in lda_results,
# so it is presumably the requested topic count - confirm against the helper.
df_ceo, lda_results, train_vecs, num_topics = lda_analysis(df, '@Bill_Gross', 6)

The coherence of the LDA model is 0.37085428057221376


In [107]:
# Show each topic id with its weighted top terms; the topic labels used below are chosen by reading this.
lda_results

[(0,
  '0.074*"yes" + 0.059*"thanks" + 0.053*"one" + 0.041*"smile" + 0.029*"great" + 0.023*"think" + 0.022*"even" + 0.021*"amazing" + 0.020*"wow" + 0.019*"pretty"'),
 (1,
  '0.061*"world" + 0.052*"amazon" + 0.051*"look" + 0.044*"amazing" + 0.041*"like" + 0.038*"would" + 0.033*"new" + 0.028*"china" + 0.026*"love" + 0.024*"company"'),
 (2,
  '0.057*"people" + 0.047*"thing" + 0.039*"ted2017" + 0.039*"talk" + 0.038*"great" + 0.035*"company" + 0.032*"see" + 0.029*"think" + 0.027*"way" + 0.022*"power"'),
 (3,
  '0.033*"1" + 0.032*"time" + 0.031*"new" + 0.031*"year" + 0.030*"energy" + 0.023*"technology" + 0.021*"word" + 0.021*"cost" + 0.019*"person" + 0.018*"day"'),
 (4,
  '0.064*"year" + 0.060*"davos" + 0.035*"wef17" + 0.029*"trust" + 0.028*"first" + 0.028*"10" + 0.025*"3" + 0.021*"really" + 0.020*"last" + 0.020*"change"'),
 (5,
  '0.063*"ai" + 0.035*"human" + 0.031*"car" + 0.029*"picture" + 0.024*"make" + 0.023*"ted2017" + 0.023*"say" + 0.022*"robot" + 0.021*"u" + 0.020*"idea"')]

In [108]:
# Wrap the topic vectors returned by lda_analysis in a DataFrame and give each
# topic column a hand-picked label (column order follows the topic ids).
train_vec_df = pd.DataFrame(train_vecs)
train_vec_df.columns = [
    'Gratitude',
    'Appreciation',
    'Company',
    'Transportation',
    'Renewable Energy',
    'Artificial Intelligence',
]

In [109]:
topic_cols = ['Gratitude', 'Appreciation', 'Company', 'Transportation',
              'Renewable Energy', 'Artificial Intelligence']

# Place the tweets and their topic columns side by side; both indexes are reset
# so that rows are paired by position.
df_bill_gross = pd.concat(
    [df_ceo.reset_index(drop=True), train_vec_df.reset_index(drop=True)],
    axis=1,
)
# One list per row holding every topic value, then reduced by get_max_topics.
df_bill_gross['all_topics'] = df_bill_gross[topic_cols].values.tolist()
df_bill_gross['max_topics'] = df_bill_gross['all_topics'].apply(get_max_topics)

# Rewrite every topic column through assign_topics, which receives the row's
# value for that topic together with the row's 'max_topics'.
for topic_col in topic_cols:
    df_bill_gross[topic_col] = df_bill_gross.apply(
        lambda row: assign_topics(row[topic_col], row['max_topics']),
        axis=1,
    )


In [111]:
# Column-wise total per topic (despite the variable name this is a sum, not a mean).
topic_cols = ['Gratitude', 'Appreciation', 'Company', 'Transportation',
              'Renewable Energy', 'Artificial Intelligence']
average_topic_weights = df_bill_gross[topic_cols].sum(axis=0)
# Single-column frame (column 0) indexed by topic label, as read by the pie cell.
ceo_topics = pd.DataFrame(average_topic_weights)

In [113]:
import plotly.graph_objs as go
from palettable.colorbrewer.diverging import *
from palettable.cmocean.sequential import Ice_10

# Slice colours come from the cmocean "Ice" palette.
colors = Ice_10.hex_colors

# One slice per topic label, sized by the summed topic values in column 0.
topics_pie = go.Pie(
    labels=ceo_topics.index,
    values=ceo_topics[0],
    marker=dict(colors=colors, line=dict(color='#FFF', width=2)),
    domain={'x': [0.0, .4], 'y': [0.0, 1]},
    showlegend=False,
    textinfo='label+percent',
)

layout = go.Layout(
    height=600,
    width=1000,
    autosize=False,
    title='Topic Distribution for Bill Gross',
)
fig = go.Figure(data=topics_pie, layout=layout)

# NOTE(review): `py` is not imported in this cell; presumably a plotly plotting
# module bound elsewhere in the notebook - confirm it survives Restart & Run All.
py.iplot(fig, filename='basic_pie_chart')

#### Dennis Muilenburg - Boeing

In [114]:
# Run the notebook's lda_analysis helper for the handle '@BoeingCEO'.
# The last argument (4) matches the number of topics listed in lda_results,
# so it is presumably the requested topic count - confirm against the helper.
df_ceo, lda_results, train_vecs, num_topics = lda_analysis(df, '@BoeingCEO', 4)

The coherence of the LDA model is 0.40265589348117425


In [115]:
# Show each topic id with its weighted top terms; the topic labels used below are chosen by reading this.
lda_results

[(0,
  '0.118*"great" + 0.084*"work" + 0.079*"future" + 0.072*"today" + 0.068*"team" + 0.042*"space" + 0.037*"air" + 0.036*"safety" + 0.032*"see" + 0.029*"enjoyed"'),
 (1,
  '0.099*"first" + 0.072*"team" + 0.048*"looking" + 0.047*"flight" + 0.047*"737_max" + 0.041*"congratulation" + 0.039*"honor" + 0.038*"forward" + 0.036*"test" + 0.036*"week"'),
 (2,
  '0.141*"thanks" + 0.089*"team" + 0.075*"new" + 0.058*"year" + 0.042*"customer" + 0.040*"day" + 0.039*"well" + 0.036*"member" + 0.032*"innovation" + 0.029*"many"'),
 (3,
  '0.083*"proud" + 0.069*"support" + 0.060*"service" + 0.057*"team" + 0.044*"work" + 0.042*"boeingdefense" + 0.039*"day" + 0.038*"life" + 0.038*"best" + 0.036*"one"')]

In [116]:
# Wrap the topic vectors returned by lda_analysis in a DataFrame and give each
# topic column a hand-picked label (column order follows the topic ids).
train_vec_df = pd.DataFrame(train_vecs)
train_vec_df.columns = [
    'Company roadmap',
    'Award',
    'Team/Innovation',
    'Product',
]

In [117]:
topic_cols = ['Company roadmap', 'Award', 'Team/Innovation', 'Product']

# Place the tweets and their topic columns side by side; both indexes are reset
# so that rows are paired by position.
df_boeing = pd.concat(
    [df_ceo.reset_index(drop=True), train_vec_df.reset_index(drop=True)],
    axis=1,
)
# One list per row holding every topic value, then reduced by get_max_topics.
df_boeing['all_topics'] = df_boeing[topic_cols].values.tolist()
df_boeing['max_topics'] = df_boeing['all_topics'].apply(get_max_topics)

# Rewrite every topic column through assign_topics, which receives the row's
# value for that topic together with the row's 'max_topics'.
for topic_col in topic_cols:
    df_boeing[topic_col] = df_boeing.apply(
        lambda row: assign_topics(row[topic_col], row['max_topics']),
        axis=1,
    )

In [118]:
# Column-wise total per topic (despite the variable name this is a sum, not a mean).
topic_cols = ['Company roadmap', 'Award', 'Team/Innovation', 'Product']
average_topic_weights = df_boeing[topic_cols].sum(axis=0)
# Single-column frame (column 0) indexed by topic label, as read by the pie cell.
ceo_topics = pd.DataFrame(average_topic_weights)

In [120]:
import plotly.graph_objs as go
from palettable.colorbrewer.diverging import *
from palettable.cmocean.sequential import Ice_10

# Slice colours come from the cmocean "Ice" palette.
colors = Ice_10.hex_colors

# One slice per topic label, sized by the summed topic values in column 0.
topics_pie = go.Pie(
    labels=ceo_topics.index,
    values=ceo_topics[0],
    marker=dict(colors=colors, line=dict(color='#FFF', width=2)),
    domain={'x': [0.0, .4], 'y': [0.0, 1]},
    showlegend=False,
    textinfo='label+percent',
)

layout = go.Layout(
    height=600,
    width=1000,
    autosize=False,
    title='Topic Distribution for Dennis Muilenburg',
)
fig = go.Figure(data=topics_pie, layout=layout)

# NOTE(review): `py` is not imported in this cell; presumably a plotly plotting
# module bound elsewhere in the notebook - confirm it survives Restart & Run All.
py.iplot(fig, filename='basic_pie_chart')

#### John Legere - T-Mobile

In [121]:
# Run the notebook's lda_analysis helper for the handle '@JohnLegere'.
# The last argument (6) is presumably the requested topic count (topic ids 0-5
# appear in lda_results) - confirm against the helper.
df_ceo, lda_results, train_vecs, num_topics = lda_analysis(df, '@JohnLegere', 6)

The coherence of the LDA model is 0.4430009671319855


In [122]:
# Show each topic id with its weighted top terms; the topic labels used below are chosen by reading this.
lda_results

[(0,
  '0.518*"tmobile" + 0.057*"welcome" + 0.037*"great" + 0.037*"look" + 0.026*"nice" + 0.026*"tmobilehelp" + 0.026*"woo_hoo" + 0.022*"sievertmike" + 0.019*"good" + 0.019*"thanks"'),
 (1,
  '0.062*"get" + 0.048*"slowcookersunday" + 0.033*"day" + 0.029*"live" + 0.029*"yes" + 0.028*"let" + 0.026*"go" + 0.023*"congrats" + 0.022*"happy" + 0.022*"time"'),
 (2,
  '0.073*"enjoy" + 0.039*"jonfreier" + 0.036*"better" + 0.033*"think" + 0.033*"would" + 0.028*"always" + 0.026*"summer" + 0.026*"could" + 0.026*"make" + 0.026*"need"'),
 (3,
  '0.043*"year" + 0.039*"love" + 0.030*"good" + 0.029*"today" + 0.028*"team" + 0.026*"nevilleray" + 0.025*"help" + 0.023*"back" + 0.022*"well" + 0.022*"happy_birthday"'),
 (4,
  '0.068*"magenta" + 0.068*"one" + 0.050*"best" + 0.050*"new" + 0.032*"favorite" + 0.030*"way" + 0.028*"know" + 0.025*"amazing" + 0.024*"love" + 0.024*"really"'),
 (5,
  '0.133*"tmobile" + 0.105*"verizon" + 0.080*"att" + 0.055*"customer" + 0.051*"thank" + 0.044*"mobile" + 0.034*"com" + 0.0

In [123]:
# Wrap the topic vectors returned by lda_analysis in a DataFrame and give each
# topic column a hand-picked label (column order follows the topic ids).
train_vec_df = pd.DataFrame(train_vecs)
train_vec_df.columns = [
    'Business',
    'Cooking',
    'Team',
    'Throwback',
    'Company Image',
    'Mocking Competition',
]

In [124]:
# Single source of truth for the topic labels, so the column that is read and
# the column that is written back can never drift apart.
topic_cols = ['Business', 'Cooking', 'Team', 'Throwback', 'Company Image', 'Mocking Competition']

# Place the tweets and their topic columns side by side; both indexes are reset
# so that rows are paired by position.
df_legere = pd.concat(
    [df_ceo.reset_index(drop=True), train_vec_df.reset_index(drop=True)],
    axis=1,
)
# One list per row holding every topic value, then reduced by get_max_topics.
df_legere['all_topics'] = df_legere[topic_cols].values.tolist()
df_legere['max_topics'] = df_legere['all_topics'].apply(get_max_topics)

# Rewrite every topic column through assign_topics, which receives the row's
# value for that topic together with the row's 'max_topics'.
# Fix: the result for 'Company Image' used to be stored under a new column
# 'Company image' (lowercase "i"), so the real 'Company Image' column - the one
# summed for the pie chart - was never processed by assign_topics.
for topic_col in topic_cols:
    df_legere[topic_col] = df_legere.apply(
        lambda row: assign_topics(row[topic_col], row['max_topics']),
        axis=1,
    )

In [126]:
# Column-wise total per topic (despite the variable name this is a sum, not a mean).
topic_cols = ['Business', 'Cooking', 'Team', 'Throwback', 'Company Image', 'Mocking Competition']
average_topic_weights = df_legere[topic_cols].sum(axis=0)
# Single-column frame (column 0) indexed by topic label, as read by the pie cell.
ceo_topics = pd.DataFrame(average_topic_weights)

In [127]:
import plotly.graph_objs as go
from palettable.colorbrewer.diverging import *
from palettable.cmocean.sequential import Ice_10

# Slice colours come from the cmocean "Ice" palette.
colors = Ice_10.hex_colors

# One slice per topic label, sized by the summed topic values in column 0.
topics_pie = go.Pie(
    labels=ceo_topics.index,
    values=ceo_topics[0],
    marker=dict(colors=colors, line=dict(color='#FFF', width=2)),
    domain={'x': [0.0, .4], 'y': [0.0, 1]},
    showlegend=False,
    textinfo='label+percent',
)

layout = go.Layout(
    height=600,
    width=1000,
    autosize=False,
    title='Topic Distribution for John Legere',
)
fig = go.Figure(data=topics_pie, layout=layout)

# NOTE(review): `py` is not imported in this cell; presumably a plotly plotting
# module bound elsewhere in the notebook - confirm it survives Restart & Run All.
py.iplot(fig, filename='basic_pie_chart')

#### Sundar Pichai - Google

In [128]:
# Run the notebook's lda_analysis helper for the handle '@sundarpichai'.
# The last argument (7) is presumably the requested topic count (seven labels
# are assigned to the topic columns below) - confirm against the helper.
df_ceo, lda_results, train_vecs, num_topics = lda_analysis(df, '@sundarpichai', 7)
# Show each topic id with its weighted top terms so the topics can be labelled by hand.
lda_results

The coherence of the LDA model is 0.4519016178973097


[(0,
  '0.206*"youtube" + 0.182*"go" + 0.158*"android" + 0.158*"story" + 0.134*"susanwojcicki" + 0.080*"today" + 0.014*"learning" + 0.004*"like" + 0.003*"new" + 0.002*"proud"'),
 (1,
  '0.204*"excited" + 0.128*"working" + 0.111*"forward" + 0.094*"search" + 0.092*"day" + 0.081*"first" + 0.080*"today" + 0.079*"look" + 0.027*"time" + 0.019*"thanks"'),
 (2,
  '0.163*"year" + 0.115*"today" + 0.104*"help" + 0.082*"machine" + 0.082*"learning" + 0.076*"world" + 0.063*"effort" + 0.063*"million" + 0.056*"first" + 0.052*"people"'),
 (3,
  '0.628*"google" + 0.051*"making" + 0.047*"help" + 0.045*"thank" + 0.045*"today" + 0.040*"u" + 0.038*"new" + 0.034*"happy" + 0.025*"ai" + 0.007*"great"'),
 (4,
  '0.208*"great" + 0.144*"team" + 0.135*"live" + 0.131*"u" + 0.083*"today" + 0.079*"amazing" + 0.067*"look" + 0.063*"like" + 0.026*"time" + 0.018*"congrats"'),
 (5,
  '0.202*"thanks" + 0.187*"support" + 0.100*"thank" + 0.096*"everyone" + 0.090*"proud" + 0.073*"happy" + 0.068*"one" + 0.055*"w" + 0.040*"time

In [129]:
# Wrap the topic vectors returned by lda_analysis in a DataFrame and give each
# topic column a hand-picked label (column order follows the topic ids).
train_vec_df = pd.DataFrame(train_vecs)
train_vec_df.columns = [
    'YouTube',
    'New Ventures',
    'Machine Learning',
    'Social',
    'Employee Appreciation',
    'Gratitude',
    'Products',
]

In [130]:
topic_cols = ['YouTube', 'New Ventures', 'Machine Learning', 'Social',
              'Employee Appreciation', 'Gratitude', 'Products']

# Place the tweets and their topic columns side by side; both indexes are reset
# so that rows are paired by position.
df_sundar_pichai = pd.concat(
    [df_ceo.reset_index(drop=True), train_vec_df.reset_index(drop=True)],
    axis=1,
)
# One list per row holding every topic value, then reduced by get_max_topics.
df_sundar_pichai['all_topics'] = df_sundar_pichai[topic_cols].values.tolist()
df_sundar_pichai['max_topics'] = df_sundar_pichai['all_topics'].apply(get_max_topics)

# Rewrite every topic column through assign_topics, which receives the row's
# value for that topic together with the row's 'max_topics'.
for topic_col in topic_cols:
    df_sundar_pichai[topic_col] = df_sundar_pichai.apply(
        lambda row: assign_topics(row[topic_col], row['max_topics']),
        axis=1,
    )

In [131]:
# Column-wise total per topic (despite the variable name this is a sum, not a mean).
topic_cols = ['YouTube', 'New Ventures', 'Machine Learning', 'Social',
              'Employee Appreciation', 'Gratitude', 'Products']
average_topic_weights = df_sundar_pichai[topic_cols].sum(axis=0)
ceo_topics = pd.DataFrame(average_topic_weights)
# Move the topic labels out of the index into an 'index' column; totals stay in column 0.
ceo_topics = ceo_topics.reset_index()

In [132]:
# Relabel both 'New Ventures' and 'Products' as "Products and Services" so the
# two topics share a single label in the chart; every other label is kept as is.
ceo_topics['index'] = ceo_topics['index'].apply(
    lambda label: "Products and Services"
    if ("new ventures" in label.lower() or "products" in label.lower())
    else label
)

In [133]:
import plotly.graph_objs as go
from palettable.colorbrewer.diverging import *
from palettable.cmocean.sequential import Ice_10

# Slice colours come from the cmocean "Ice" palette.
colors = Ice_10.hex_colors

# Slices are labelled from the 'index' column and sized by the totals in column 0.
topics_pie = go.Pie(
    labels=ceo_topics["index"],
    values=ceo_topics[0],
    marker=dict(colors=colors, line=dict(color='#FFF', width=2)),
    domain={'x': [0.0, .4], 'y': [0.0, 1]},
    showlegend=False,
    textinfo='label+percent',
)

layout = go.Layout(
    height=600,
    width=1000,
    autosize=False,
    title='Topic Distribution for Sundar Pichai',
)
fig = go.Figure(data=topics_pie, layout=layout)

# NOTE(review): `py` is not imported in this cell; presumably a plotly plotting
# module bound elsewhere in the notebook - confirm it survives Restart & Run All.
py.iplot(fig, filename='basic_pie_chart')

#### Aaron Levie - Box

In [134]:
# Run the notebook's lda_analysis helper for the handle '@levie'.
# The last argument (4) matches the number of topics listed in lda_results,
# so it is presumably the requested topic count - confirm against the helper.
df_ceo, lda_results, train_vecs, num_topics = lda_analysis(df, '@levie', 4)
# Show each topic id with its weighted top terms so the topics can be labelled by hand.
lda_results

The coherence of the LDA model is 0.4643821278187853


[(0,
  '0.170*"new" + 0.099*"get" + 0.078*"work" + 0.072*"day" + 0.072*"know" + 0.060*"amazon" + 0.060*"people" + 0.053*"go" + 0.052*"congrats" + 0.049*"world"'),
 (1,
  '0.091*"boxworks" + 0.084*"time" + 0.080*"need" + 0.069*"america" + 0.062*"best" + 0.054*"future" + 0.052*"work" + 0.049*"box" + 0.044*"ever" + 0.044*"business"'),
 (2,
  '0.246*"box" + 0.116*"boxhq" + 0.101*"year" + 0.053*"excited" + 0.051*"today" + 0.051*"u" + 0.038*"great" + 0.028*"working" + 0.028*"moment" + 0.028*"going"'),
 (3,
  '0.104*"one" + 0.064*"software" + 0.059*"startup" + 0.055*"like" + 0.053*"enterprise" + 0.050*"company" + 0.050*"1" + 0.044*"2" + 0.041*"every" + 0.038*"never"')]

In [135]:
# Wrap the topic vectors returned by lda_analysis in a DataFrame and give each
# topic column a hand-picked label (column order follows the topic ids).
train_vec_df = pd.DataFrame(train_vecs)
train_vec_df.columns = [
    'Supporting Competitors',
    'Culture',
    'Company',
    'Innovation/Launch',
]

In [136]:
topic_cols = ['Supporting Competitors', 'Culture', 'Company', 'Innovation/Launch']

# Place the tweets and their topic columns side by side; both indexes are reset
# so that rows are paired by position.
df_levie = pd.concat(
    [df_ceo.reset_index(drop=True), train_vec_df.reset_index(drop=True)],
    axis=1,
)
# One list per row holding every topic value, then reduced by get_max_topics.
df_levie['all_topics'] = df_levie[topic_cols].values.tolist()
df_levie['max_topics'] = df_levie['all_topics'].apply(get_max_topics)

# Rewrite every topic column through assign_topics, which receives the row's
# value for that topic together with the row's 'max_topics'.
for topic_col in topic_cols:
    df_levie[topic_col] = df_levie.apply(
        lambda row: assign_topics(row[topic_col], row['max_topics']),
        axis=1,
    )

In [137]:
# Column-wise total per topic (despite the variable name this is a sum, not a mean).
topic_cols = ['Supporting Competitors', 'Culture', 'Company', 'Innovation/Launch']
average_topic_weights = df_levie[topic_cols].sum(axis=0)
ceo_topics = pd.DataFrame(average_topic_weights)
# Move the topic labels out of the index into an 'index' column; totals stay in column 0.
ceo_topics = ceo_topics.reset_index()

In [138]:
import plotly.graph_objs as go
from palettable.colorbrewer.diverging import *
from palettable.cmocean.sequential import Ice_10

# Slice colours come from the cmocean "Ice" palette.
colors = Ice_10.hex_colors

# Slices are labelled from the 'index' column and sized by the totals in column 0.
topics_pie = go.Pie(
    labels=ceo_topics["index"],
    values=ceo_topics[0],
    marker=dict(colors=colors, line=dict(color='#FFF', width=2)),
    domain={'x': [0.0, .4], 'y': [0.0, 1]},
    showlegend=False,
    textinfo='label+percent',
)

layout = go.Layout(
    height=600,
    width=1000,
    autosize=False,
    title='Topic Distribution for Aaron Levie',
)
fig = go.Figure(data=topics_pie, layout=layout)

# NOTE(review): `py` is not imported in this cell; presumably a plotly plotting
# module bound elsewhere in the notebook - confirm it survives Restart & Run All.
py.iplot(fig, filename='basic_pie_chart')

#### Michael Dell - Dell

In [141]:
# Run the notebook's lda_analysis helper for the handle '@MichaelDell'.
# The last argument (6) is presumably the requested topic count (six labels
# are assigned to the topic columns below) - confirm against the helper.
df_ceo, lda_results, train_vecs, num_topics = lda_analysis(df, '@MichaelDell', 6)
# Show each topic id with its weighted top terms so the topics can be labelled by hand.
lda_results

The coherence of the LDA model is 0.3929482944080737


[(0,
  '0.134*"delltech" + 0.128*"vmware" + 0.115*"pivotal" + 0.075*"cloud" + 0.074*"dellemc" + 0.050*"pivotalcf" + 0.040*"vmwarensx" + 0.037*"virtustream" + 0.025*"together" + 0.022*"secureworks"'),
 (1,
  '0.135*"thanks" + 0.082*"1" + 0.052*"year" + 0.044*"customer" + 0.044*"delltech" + 0.040*"dellemc" + 0.039*"msdf_foundation" + 0.038*"customer_partner" + 0.036*"making" + 0.033*"texas"'),
 (2,
  '0.093*"data" + 0.072*"delltech" + 0.050*"digital" + 0.049*"world" + 0.045*"help" + 0.040*"join" + 0.038*"transformation" + 0.034*"share" + 0.033*"company" + 0.033*"make"'),
 (3,
  '0.150*"dell" + 0.096*"great" + 0.083*"delltech" + 0.060*"see" + 0.038*"dellemc" + 0.031*"technology" + 0.026*"always" + 0.023*"rsasecurity" + 0.021*"proud" + 0.017*"part"'),
 (4,
  '0.101*"vmware" + 0.078*"pgelsinger" + 0.057*"congratulation" + 0.040*"team" + 0.038*"vmworld" + 0.038*"spoonen" + 0.035*"agree" + 0.033*"great" + 0.033*"delltechworld" + 0.025*"looking_forward"'),
 (5,
  '0.090*"team" + 0.089*"delltec

In [143]:
# Wrap the topic vectors returned by lda_analysis in a DataFrame and give each
# topic column a hand-picked label (column order follows the topic ids).
# NOTE(review): 'VMVare' looks like a typo for "VMware"; it is kept because the
# following cells (and possibly later ones) select the column by this exact name.
train_vec_df = pd.DataFrame(train_vecs)
train_vec_df.columns = [
    'VMVare',
    'Gratitude',
    'Motto',
    'RSA',
    'Team',
    'Pride',
]

In [144]:
topic_cols = ['VMVare', 'Gratitude', 'Motto', 'RSA', 'Team', 'Pride']

# Place the tweets and their topic columns side by side; both indexes are reset
# so that rows are paired by position.
df_dell = pd.concat(
    [df_ceo.reset_index(drop=True), train_vec_df.reset_index(drop=True)],
    axis=1,
)
# One list per row holding every topic value, then reduced by get_max_topics.
df_dell['all_topics'] = df_dell[topic_cols].values.tolist()
df_dell['max_topics'] = df_dell['all_topics'].apply(get_max_topics)

# Rewrite every topic column through assign_topics, which receives the row's
# value for that topic together with the row's 'max_topics'.
for topic_col in topic_cols:
    df_dell[topic_col] = df_dell.apply(
        lambda row: assign_topics(row[topic_col], row['max_topics']),
        axis=1,
    )

In [146]:
# Column-wise total per topic (despite the variable name this is a sum, not a mean).
topic_cols = ['VMVare', 'Gratitude', 'Motto', 'RSA', 'Team', 'Pride']
average_topic_weights = df_dell[topic_cols].sum(axis=0)
ceo_topics = pd.DataFrame(average_topic_weights)
# Move the topic labels out of the index into an 'index' column; totals stay in column 0.
ceo_topics = ceo_topics.reset_index()

In [147]:
import plotly.graph_objs as go
from palettable.colorbrewer.diverging import *
from palettable.cmocean.sequential import Ice_10

# Slice colours come from the cmocean "Ice" palette.
colors = Ice_10.hex_colors

# Slices are labelled from the 'index' column and sized by the totals in column 0.
topics_pie = go.Pie(
    labels=ceo_topics["index"],
    values=ceo_topics[0],
    marker=dict(colors=colors, line=dict(color='#FFF', width=2)),
    domain={'x': [0.0, .4], 'y': [0.0, 1]},
    showlegend=False,
    textinfo='label+percent',
)

layout = go.Layout(
    height=600,
    width=1000,
    autosize=False,
    title='Topic Distribution for Michael Dell',
)
fig = go.Figure(data=topics_pie, layout=layout)

# NOTE(review): `py` is not imported in this cell; presumably a plotly plotting
# module bound elsewhere in the notebook - confirm it survives Restart & Run All.
py.iplot(fig, filename='basic_pie_chart')

#### Daniel Ek (@eldsjal) - Spotify

In [148]:
# Run the notebook's lda_analysis helper for the handle '@eldsjal'.
# The last argument (4) matches the number of topics listed in lda_results,
# so it is presumably the requested topic count - confirm against the helper.
df_ceo, lda_results, train_vecs, num_topics = lda_analysis(df, '@eldsjal', 4)

The coherence of the LDA model is 0.5502476442199072


In [149]:
# Show each topic id with its weighted top terms; the topic labels used below are chosen by reading this.
lda_results

[(0,
  '0.199*"spotify" + 0.154*"sure" + 0.153*"team" + 0.123*"padmasree" + 0.123*"excited" + 0.113*"back" + 0.043*"time" + 0.023*"thank" + 0.009*"spotifyartists" + 0.003*"love"'),
 (1,
  '0.453*"spotify" + 0.081*"mpawlo" + 0.074*"great" + 0.060*"time" + 0.046*"du" + 0.046*"eldsjal" + 0.046*"thank" + 0.039*"playlist" + 0.036*"say" + 0.035*"podcast"'),
 (2,
  '0.118*"new" + 0.117*"today" + 0.110*"good" + 0.110*"year" + 0.090*"like" + 0.090*"yes" + 0.090*"u" + 0.083*"know" + 0.073*"music" + 0.050*"spotifyartists"'),
 (3,
  '0.178*"one" + 0.166*"love" + 0.143*"mootron" + 0.143*"get" + 0.119*"fulhack" + 0.119*"vulfpeck" + 0.057*"take" + 0.004*"team" + 0.003*"spotify" + 0.003*"new"')]

In [151]:
topic_cols = ['Award', 'Colleague', 'Future', 'Artist']

# Wrap the topic vectors returned by lda_analysis in a DataFrame and give each
# topic column a hand-picked label (column order follows the topic ids).
train_vec_df = pd.DataFrame(train_vecs)
train_vec_df.columns = ['Award', 'Colleague', 'Future', 'Artist']

# Place the tweets and their topic columns side by side; both indexes are reset
# so that rows are paired by position.
df_spotify = pd.concat(
    [df_ceo.reset_index(drop=True), train_vec_df.reset_index(drop=True)],
    axis=1,
)
# One list per row holding every topic value, then reduced by get_max_topics.
df_spotify['all_topics'] = df_spotify[topic_cols].values.tolist()
df_spotify['max_topics'] = df_spotify['all_topics'].apply(get_max_topics)

# Rewrite every topic column through assign_topics, which receives the row's
# value for that topic together with the row's 'max_topics'.
for topic_col in topic_cols:
    df_spotify[topic_col] = df_spotify.apply(
        lambda row: assign_topics(row[topic_col], row['max_topics']),
        axis=1,
    )

In [152]:
# Column-wise total per topic (despite the variable name this is a sum, not a mean).
topic_cols = ['Award', 'Colleague', 'Future', 'Artist']
average_topic_weights = df_spotify[topic_cols].sum(axis=0)
# Single-column frame (column 0) indexed by topic label.
ceo_topics = pd.DataFrame(average_topic_weights)

In [154]:
# Move the topic labels out of the index into an 'index' column (read by the pie
# cell as ceo_topics["index"]). Not idempotent: re-running this cell on its own
# output adds a further index column.
ceo_topics = ceo_topics.reset_index()

In [155]:
import plotly.graph_objs as go
from palettable.colorbrewer.diverging import *
from palettable.cmocean.sequential import Ice_10

# Slice colours come from the cmocean "Ice" palette.
colors = Ice_10.hex_colors

# Slices are labelled from the 'index' column and sized by the totals in column 0.
topics_pie = go.Pie(
    labels=ceo_topics["index"],
    values=ceo_topics[0],
    marker=dict(colors=colors, line=dict(color='#FFF', width=2)),
    domain={'x': [0.0, .4], 'y': [0.0, 1]},
    showlegend=False,
    textinfo='label+percent',
)

layout = go.Layout(
    height=600,
    width=1000,
    autosize=False,
    title='Topic Distribution for Spotify',
)
fig = go.Figure(data=topics_pie, layout=layout)

# NOTE(review): `py` is not imported in this cell; presumably a plotly plotting
# module bound elsewhere in the notebook - confirm it survives Restart & Run All.
py.iplot(fig, filename='basic_pie_chart')

#### Anand Mahindra - Mahindra Group

In [156]:
# Run the notebook's lda_analysis helper for the handle '@anandmahindra'.
# The last argument (10) is presumably the requested topic count (ten labels
# are assigned to the topic columns below) - confirm against the helper.
df_ceo, lda_results, train_vecs, num_topics = lda_analysis(df, '@anandmahindra', 10)
# Show each topic id with its weighted top terms so the topics can be labelled by hand.
lda_results

The coherence of the LDA model is 0.37626067254598317


[(0,
  '0.043*"way" + 0.037*"go" + 0.035*"thank" + 0.028*"pic" + 0.024*"week" + 0.024*"take" + 0.022*"2" + 0.021*"1" + 0.021*"tech_mahindra" + 0.019*"mahindraracing"'),
 (1,
  '0.064*"epicchannelin" + 0.063*"day" + 0.046*"well" + 0.043*"never" + 0.040*"happy" + 0.035*"team" + 0.029*"give" + 0.028*"work" + 0.026*"support_passioneering" + 0.025*"fanboost_vote"'),
 (2,
  '0.042*"get" + 0.035*"one" + 0.032*"great" + 0.029*"even" + 0.027*"offthebeatentrack" + 0.024*"country" + 0.019*"much" + 0.019*"always" + 0.018*"list_entry" + 0.018*"today_bucket"'),
 (3,
  '0.028*"mahindra" + 0.027*"design" + 0.027*"name" + 0.022*"new" + 0.020*"anandmahindra" + 0.020*"car" + 0.020*"big" + 0.020*"award" + 0.019*"one" + 0.018*"pininfarina"'),
 (4,
  '0.038*"year" + 0.031*"right" + 0.024*"true" + 0.022*"mahindraracing" + 0.020*"jawa" + 0.020*"anandmahindra" + 0.019*"many" + 0.019*"mumbai" + 0.019*"looking" + 0.018*"jawamotorcycles"'),
 (5,
  '0.063*"today" + 0.036*"yes" + 0.034*"keep" + 0.032*"good" + 0.032

In [157]:
# Wrap the topic vectors returned by lda_analysis in a DataFrame and give each
# topic column a hand-picked label (column order follows the topic ids).
train_vec_df = pd.DataFrame(train_vecs)
train_vec_df.columns = [
    'Race',
    'Formula1',
    'Travel/Tourism',
    'Launch/Innovation',
    'Product',
    'Personal Story',
    'Business',
    'Appreciation',
    'Electric Car',
    'Entertainment',
]

In [158]:
topic_cols = ['Race', 'Formula1', 'Travel/Tourism', 'Launch/Innovation', 'Product',
              'Personal Story', 'Business', 'Appreciation', 'Electric Car', 'Entertainment']

# Place the tweets and their topic columns side by side; both indexes are reset
# so that rows are paired by position.
df_mahindra = pd.concat(
    [df_ceo.reset_index(drop=True), train_vec_df.reset_index(drop=True)],
    axis=1,
)
# One list per row holding every topic value, then reduced by get_max_topics.
df_mahindra['all_topics'] = df_mahindra[topic_cols].values.tolist()
df_mahindra['max_topics'] = df_mahindra['all_topics'].apply(get_max_topics)

# Rewrite every topic column through assign_topics, which receives the row's
# value for that topic together with the row's 'max_topics'.
for topic_col in topic_cols:
    df_mahindra[topic_col] = df_mahindra.apply(
        lambda row: assign_topics(row[topic_col], row['max_topics']),
        axis=1,
    )


In [159]:
# Column-wise total per topic (despite the variable name this is a sum, not a mean).
topic_cols = ['Race', 'Formula1', 'Travel/Tourism', 'Launch/Innovation', 'Product',
              'Personal Story', 'Business', 'Appreciation', 'Electric Car', 'Entertainment']
average_topic_weights = df_mahindra[topic_cols].sum(axis=0)
# Single-column frame (column 0) indexed by topic label.
ceo_topics = pd.DataFrame(average_topic_weights)

In [160]:
# Move the topic labels out of the index into an 'index' column (read by the pie
# cell as ceo_topics["index"]). Not idempotent: re-running this cell on its own
# output adds a further index column.
ceo_topics = ceo_topics.reset_index()

In [161]:
import plotly.graph_objs as go
from palettable.colorbrewer.diverging import *
from palettable.cmocean.sequential import Ice_10

# Slice colours come from the cmocean "Ice" palette.
colors = Ice_10.hex_colors

# Slices are labelled from the 'index' column and sized by the totals in column 0.
topics_pie = go.Pie(
    labels=ceo_topics["index"],
    values=ceo_topics[0],
    marker=dict(colors=colors, line=dict(color='#FFF', width=2)),
    domain={'x': [0.0, .4], 'y': [0.0, 1]},
    showlegend=False,
    textinfo='label+percent',
)

# Fix: the title was copy-pasted from the Spotify section; this chart is built
# from the '@anandmahindra' analysis.
layout = go.Layout(
    height=600,
    width=1000,
    autosize=False,
    title='Topic Distribution for Anand Mahindra',
)
fig = go.Figure(data=topics_pie, layout=layout)

# NOTE(review): `py` is not imported in this cell; presumably a plotly plotting
# module bound elsewhere in the notebook - confirm it survives Restart & Run All.
py.iplot(fig, filename='basic_pie_chart')

#### Mary Barra - GM

In [162]:
# Run the notebook's lda_analysis helper for the handle '@mtbarra'.
# The last argument (3) matches the number of topics listed in lda_results,
# so it is presumably the requested topic count - confirm against the helper.
df_ceo, lda_results, train_vecs, num_topics = lda_analysis(df, '@mtbarra', 3)
# Show each topic id with its weighted top terms so the topics can be labelled by hand.
lda_results

The coherence of the LDA model is 0.43600020608260975


[(0,
  '0.563*"gm" + 0.107*"future" + 0.083*"proud" + 0.077*"vision" + 0.072*"work" + 0.060*"congratulation" + 0.003*"world" + 0.003*"today" + 0.003*"thanks" + 0.003*"stem"'),
 (1,
  '0.174*"year" + 0.153*"car" + 0.152*"thank" + 0.131*"business" + 0.121*"driving" + 0.110*"self" + 0.064*"great" + 0.050*"congratulation" + 0.004*"woman" + 0.004*"time"'),
 (2,
  '0.175*"today" + 0.141*"great" + 0.124*"world" + 0.117*"team" + 0.103*"stem" + 0.096*"thanks" + 0.088*"woman" + 0.081*"time" + 0.040*"proud" + 0.007*"work"')]

In [163]:
# Wrap the topic vectors returned by lda_analysis in a DataFrame and give each
# topic column a hand-picked label (column order follows the topic ids).
train_vec_df = pd.DataFrame(train_vecs)
train_vec_df.columns = [
    'Company Vision',
    'Products',
    'Social/Appreciation',
]

In [164]:
topic_cols = ['Company Vision', 'Products', 'Social/Appreciation']

# Place the tweets and their topic columns side by side; both indexes are reset
# so that rows are paired by position.
df_mtbarra = pd.concat(
    [df_ceo.reset_index(drop=True), train_vec_df.reset_index(drop=True)],
    axis=1,
)
# One list per row holding every topic value, then reduced by get_max_topics.
df_mtbarra['all_topics'] = df_mtbarra[topic_cols].values.tolist()
df_mtbarra['max_topics'] = df_mtbarra['all_topics'].apply(get_max_topics)

# Rewrite every topic column through assign_topics, which receives the row's
# value for that topic together with the row's 'max_topics'.
for topic_col in topic_cols:
    df_mtbarra[topic_col] = df_mtbarra.apply(
        lambda row: assign_topics(row[topic_col], row['max_topics']),
        axis=1,
    )

In [165]:
# Column-wise total per topic (despite the variable name this is a sum, not a mean).
topic_cols = ['Company Vision', 'Products', 'Social/Appreciation']
average_topic_weights = df_mtbarra[topic_cols].sum(axis=0)
# Single-column frame (column 0) indexed by topic label.
ceo_topics = pd.DataFrame(average_topic_weights)

In [166]:
# Move the topic labels out of the index into an 'index' column (read by the pie
# cell as ceo_topics["index"]). Not idempotent: re-running this cell on its own
# output adds a further index column.
ceo_topics = ceo_topics.reset_index()

In [167]:
import plotly.graph_objs as go
from palettable.colorbrewer.diverging import *
from palettable.cmocean.sequential import Ice_10

# Slice colours come from the cmocean "Ice" palette.
colors = Ice_10.hex_colors

# Slices are labelled from the 'index' column and sized by the totals in column 0.
topics_pie = go.Pie(
    labels=ceo_topics["index"],
    values=ceo_topics[0],
    marker=dict(colors=colors, line=dict(color='#FFF', width=2)),
    domain={'x': [0.0, .4], 'y': [0.0, 1]},
    showlegend=False,
    textinfo='label+percent',
)

layout = go.Layout(
    height=600,
    width=1000,
    autosize=False,
    title='Topic Distribution for Mary Barra',
)
fig = go.Figure(data=topics_pie, layout=layout)

# NOTE(review): `py` is not imported in this cell; presumably a plotly plotting
# module bound elsewhere in the notebook - confirm it survives Restart & Run All.
py.iplot(fig, filename='basic_pie_chart')