# Strategic Analysis of Trump Rallies with NLP

- Initially obtained a dataset of 35 rallies from Kaggle, but that was too few for a time series analysis.
- Web-scraped about 100 Trump rallies from https://factba.se/, from the first rally in 2017 to the first rally in 2020 after Trump recovered from Covid-19.
- The plan is to do topic modelling on all Trump rallies and develop a time series analysis of the topics. Where possible, I also make time series predictions of his trending topics.
- It is also possible to cluster the Trump rallies based on their topics.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF  
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import mean_absolute_error,silhouette_score

from gensim.parsing.preprocessing import remove_stopwords
from wordcloud import WordCloud
from nltk.sentiment import vader
from nltk.tokenize import RegexpTokenizer
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('maxent_ne_chunker')
#nltk.download('words')
#nltk.download('stopwords')

from statsmodels.tsa.statespace.sarimax import SARIMAX

from plotly.offline import iplot

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 2000)
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [None]:
df = pd.read_csv('/kaggle/input/donald-trump-political-rallies-transcripts/Trump_Rallies_Dataset.csv',parse_dates=['Date'])

In [None]:
df = df.drop('Unnamed: 0',axis=1)
df = df.drop(97, axis=0)
df = df.reset_index().drop('index',axis=1)

## Text Pre-processing
- Regular Expression
- Tokenization using NLTK Regex Tokenizer
- Stopwords removal using Gensim

In [None]:
df['Place'] = df['Place'].apply(lambda x: re.sub(r'^.*(?=Rally)','',x))

In [None]:
df['Transcript'] = df['Transcript'].apply(lambda x: re.sub(r'\[.*?\]','',x))

In [None]:
df['Transcript'] = df['Transcript'].apply(lambda x: x.lower())

In [None]:
tokenizer = RegexpTokenizer('[a-z][a-z]+[a-z]')

In [None]:
df['Transcript'] = df['Transcript'].apply(lambda x: tokenizer.tokenize(x))

In [None]:
df['Transcript'] = df['Transcript'].apply(lambda x: ' '.join(x))

In [None]:
df['Transcript'] = df['Transcript'].apply(lambda x: x.replace('\\',''))

In [None]:
# def nltk_tag_to_wordnet_tag(nltk_tag):
#     if nltk_tag.startswith('J'):
#         return wordnet.ADJ
#     elif nltk_tag.startswith('V'):
#         return wordnet.VERB
#     elif nltk_tag.startswith('N'):
#         return wordnet.NOUN
#     elif nltk_tag.startswith('R'):
#         return wordnet.ADV
#     else:          
#         return None

In [None]:
# def lemmatize_sentence(sentence):
#     #tokenize the sentence and find the POS tag for each token
#     tokenizer = RegexpTokenizer('[a-z][a-z]+[a-z]')
#     nltk_tagged = nltk.pos_tag(tokenizer.tokenize(sentence))  
#     #tuple of (token, wordnet_tag)
#     wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
#     lemmatized_sentence = []
#     for word, tag in wordnet_tagged:
#         if tag is None:
#             #if there is no available tag, append the token as is
#             lemmatized_sentence.append(word)
#         else:        
#             #else use the tag to lemmatize the token
#             lemmatized_sentence.append(WordNetLemmatizer().lemmatize(word, tag))
#     return " ".join(lemmatized_sentence)

In [None]:
#df['Transcript'] = df['Transcript'].apply(lambda x: lemmatize_sentence(x))

In [None]:
df['Transcript'] = df['Transcript'].apply(lambda x: remove_stopwords(x))

In [None]:
df.loc[0,'Transcript']

In [None]:
df

## Exploratory Data Analysis
- Using WordCloud on first 2 and last 2 rallies

In [None]:
wordcloud = WordCloud(max_words=30, background_color='white',colormap='magma',width=800, height=400, random_state=48).generate(df.loc[100,'Transcript'])
plt.figure(figsize=[10,5])
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.savefig('wordcloud1.png',transparent=True, bbox_inches='tight')

In [None]:
wordcloud = WordCloud(max_words=30, background_color='white',colormap='magma',width=800, height=400, random_state=71).generate(df.loc[101,'Transcript'])
plt.figure(figsize=[10,5])
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.savefig('wordcloud2.png',transparent=True, bbox_inches='tight')

In [None]:
wordcloud = WordCloud(max_words=30,background_color='white',colormap='seismic',width=800, height=400,random_state=28).generate(df.loc[0,'Transcript'])
plt.figure(figsize=[10,5])
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.savefig('wordcloud3.png',transparent=True, bbox_inches='tight')

In [None]:
wordcloud = WordCloud(max_words=30, background_color='white',colormap='seismic',width=800, height=400,random_state=48).generate(df.loc[1,'Transcript'])
plt.figure(figsize=[10,5])
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.savefig('wordcloud4.png',transparent=True, bbox_inches='tight')

## Topic Modeling and Exploratory Data Analysis with Heatmap
- Scikit-learn Count Vectorizer
- Scikit-learn Non-negative Matrix Factorization (NMF)
- Scikit-learn Cosine Similarity

In [None]:
# Build the document-term count matrix with English stop words removed.
count_vectorizer = CountVectorizer(stop_words='english')
doc_term = count_vectorizer.fit_transform(list(df['Transcript']))
# fit_transform() above has already fitted the vectorizer, and fit() returns
# the vectorizer itself, so alias it instead of fitting a second time.
countvec = count_vectorizer
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
feature_names = (
    count_vectorizer.get_feature_names_out()
    if hasattr(count_vectorizer, 'get_feature_names_out')
    else count_vectorizer.get_feature_names()
)
# Preview the first 10 rows of the document-term matrix, labelled by place.
dt_matrix = pd.DataFrame(
    doc_term.toarray().round(3),
    index=list(df['Place']),
    columns=feature_names,
).head(10)
dt_matrix

### Try NMF with Countvectorizer (Random = 42)

In [None]:
nmf_model = NMF(n_components = 3,random_state=42)
model = nmf_model.fit(doc_term)
doc_topic = model.transform(doc_term)

In [None]:
topic_word = pd.DataFrame(nmf_model.components_.round(3),
             index = ["component_1","component_2","component_3"],
             columns = count_vectorizer.get_feature_names())
topic_word

In [None]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted words of every topic in ``model``.

    Parameters
    ----------
    model : object with a ``components_`` attribute that yields one weight
        row per topic; each row must provide ``argsort()``.
    feature_names : sequence mapping a column position to its word.
    no_top_words : how many words to print per topic.
    topic_names : optional sequence of topic labels. A missing or falsy
        label makes the topic fall back to its numeric position.
    """
    for topic_no, weights in enumerate(model.components_):
        label = topic_names[topic_no] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", topic_no)
        # Column positions sorted by descending weight, cut to the top N.
        ranked = weights.argsort()[::-1][:no_top_words]
        print(", ".join([feature_names[pos] for pos in ranked]))

In [None]:
display_topics(nmf_model, count_vectorizer.get_feature_names(), 30)

In [None]:
topic_index = []
feature_names = count_vectorizer.get_feature_names()
for ix, topic in enumerate(nmf_model.components_):
    topic_index.append(" ".join([feature_names[i] for i in topic.argsort()]))

In [None]:
H = pd.DataFrame(doc_topic.round(3),
             index = [i for i in df['Place']],
             columns = ["component_1","component_2","component_3"])
H

In [None]:
H = H.reset_index()
H = df.join(H).drop(['Transcript','index'],axis=1)
H

In [None]:
H = H.rename(columns={'component_1':'Achievements','component_2':'Plans and Appealing Support','component_3':'Political Adversaries'})
H

In [None]:
DF = H.copy()
DF = DF.drop(['Place','Date'],axis=1)
DF = DF.iloc[:21]
cos_similar_matrix = pd.DataFrame(cosine_similarity(DF.values),columns=H['Date'].iloc[:21].astype(str),index=H['Date'].iloc[:21].astype(str))
cos_similar_matrix

In [None]:
sns.set(style='white',font_scale=1.4)
fig = plt.figure(figsize=[20,18])
mask = np.triu(np.ones_like(cos_similar_matrix, dtype=bool))
sns.heatmap(cos_similar_matrix,cmap='Blues',linewidth=3,linecolor='white',vmax = 1, vmin=0.1,mask=mask, annot=True,fmt='0.2f')
plt.title('Cosine Similarity Heatmap - Last 20 Rallies', weight='bold',fontsize=25)
plt.xlabel('')
plt.ylabel('')
plt.savefig('heatmap.png',transparent=True, bbox_inches='tight')

## Time Series with SARIMA
- Create Simple Validation manually
- Observe seasonality in topics in Trump rallies
- Time Series is observed to be non-stationary with trend and seasonality => SARIMA
- Manually optimize the period and trend parameter in SARIMA
- MAE is used as a metric for prediction

In [None]:
data = H.drop(['Place','Date'],axis=1)
data.index = H['Date']
data

In [None]:
data = data.iloc[::-1]
data

In [None]:
train = data[:int(0.90*(len(data)))]
train

In [None]:
valid = data[int(0.90*(len(data))):int(0.95*(len(data)))]
valid

In [None]:
test = data[int(0.95*(len(data))):]
test

In [None]:
train_Achievements = train['Achievements']
train_Support = train['Plans and Appealing Support']
train_Adversaries = train['Political Adversaries']
valid_Achievements = valid['Achievements']
valid_Support = valid['Plans and Appealing Support']
valid_Adversaries = valid['Political Adversaries']
test_Achievements = test['Achievements']
test_Support = test['Plans and Appealing Support']
test_Adversaries = test['Political Adversaries']

In [None]:
# mae_vector= []

# for period in np.arange(8,13,1):    
#     for trend in ['n','c','t','ct']:
#         for p in [1]:
#             for d in [0]:
#                 for q in [0]:
#                     for P in [1]:
#                         for D in [1]:
#                             for Q in [1]:
#                                 forecast_Adversaries = SARIMAX(train_Adversaries,order=(p,d,q),seasonal_order=(P,D,Q,period),trend=trend).fit().forecast(steps=len(valid_Adversaries))
#                                 MAE_Adversaries = mean_absolute_error(valid['Political Adversaries'],forecast_Adversaries) 
#                                 mae_vector.append((MAE_Adversaries,[(p,d,q),(P,D,Q,period),trend]))
#     print(period)

# mae, para = zip(*mae_vector)        

# print(f'Best Parameters is {para[np.argmin(mae)]}')    
# print(f'Lowest MAE is {min(mae)}') 

In [None]:
# mae_vector= []

# for period in np.arange(8,13,1):    
#     for trend in ['n','c','t','ct']:
#         for p in [1]:
#             for d in [0]:
#                 for q in [0]:
#                     for P in [1]:
#                         for D in [1]:
#                             for Q in [1]:
#                                 forecast_Support = SARIMAX(train_Support,order=(p,d,q),seasonal_order=(P,D,Q,period),trend=trend).fit().forecast(steps=len(valid_Support))
#                                 MAE_Support = mean_absolute_error(valid['Plans and Appealing Support'],forecast_Support) 
#                                 mae_vector.append((MAE_Support,[(p,d,q),(P,D,Q,period),trend]))
#     print(period)

# mae, para = zip(*mae_vector)        

# print(f'Best Parameters is {para[np.argmin(mae)]}')    
# print(f'Lowest MAE is {min(mae)}') 

In [None]:
# mae_vector= []

# for period in np.arange(8,13,1):    
#     for trend in ['n','c','t','ct']:
#         for p in [1]:
#             for d in [0]:
#                 for q in [0]:
#                     for P in [1]:
#                         for D in [1]:
#                             for Q in [1]:
#                                 forecast_Achievements = SARIMAX(train_Achievements,order=(p,d,q),seasonal_order=(P,D,Q,period),trend=trend).fit().forecast(steps=len(valid_Achievements))
#                                 MAE_Achievements = mean_absolute_error(valid['Achievements'],forecast_Achievements) 
#                                 mae_vector.append((MAE_Achievements,[(p,d,q),(P,D,Q,period),trend]))
#     print(period)

# mae, para = zip(*mae_vector)        

# print(f'Best Parameters is {para[np.argmin(mae)]}')    
# print(f'Lowest MAE is {min(mae)}') 

In [None]:
trainvalid = data[:int(0.95*(len(data)))]
trainvalid

In [None]:
trainvalid_Achievements = trainvalid['Achievements']
trainvalid_Support = trainvalid['Plans and Appealing Support']
trainvalid_Adversaries = trainvalid['Political Adversaries']

In [None]:
forecast_Adversaries = SARIMAX(trainvalid_Adversaries,order=(1,0,0),seasonal_order=(1,1,1,10),trend='ct').fit().forecast(steps=len(test_Adversaries))
MAE_Adversaries = mean_absolute_error(test['Political Adversaries'],forecast_Adversaries)
print(f'MAE_Adversaries: {MAE_Adversaries}')
    
forecast_Support = SARIMAX(trainvalid_Support,order=(1,0,0),seasonal_order=(1,1,1,8),trend='ct').fit().forecast(steps=len(test_Support))
MAE_Support = mean_absolute_error(test['Plans and Appealing Support'],forecast_Support) 
print(f'MAE_Support: {MAE_Support}')
    
forecast_Achievements = SARIMAX(trainvalid_Achievements,order=(1,0,0),seasonal_order=(1,1,1,9),trend='n').fit().forecast(steps=len(test_Achievements))
MAE_Achievements = mean_absolute_error(test['Achievements'],forecast_Achievements)  
print(f'MAE_Achievements: {MAE_Achievements}')

In [None]:
predicted = test.copy()
predicted['Achievements'] = list(forecast_Achievements)
predicted['Plans and Appealing Support'] = list(forecast_Support)
predicted['Political Adversaries'] = list(forecast_Adversaries)
predicted

In [None]:
test

In [None]:
predicted2 = predicted.copy()
predicted2 = predicted2.reset_index()
predicted2['Date'] = predicted2['Date'].astype(str)

In [None]:
H2 = H.copy()
H2['Date'] = H2['Date'].astype(str)

In [None]:
H3 = H2.copy()
H3 = H3.iloc[::-1]

In [None]:
sns.set(style='white', font_scale=5, rc={'lines.markersize': 15})
fig = plt.figure(figsize=(80, 20))
# 'Political Adversaries' weight of every rally against its date string.
ax = sns.lineplot(
    data=H2,
    x='Date',
    y='Political Adversaries',
    style=True,
    markers=True,
    ci=None,
    color='r',
)
# Only every 20th date gets a tick.
plt.xticks(list(H2['Date'])[::20])
#ymin, ymax = plt.ylim()
#plt.vlines(list(H2['Date'])[3::10], ymin, ymax, linestyle='dashed')
axis_label_style = dict(weight='bold', fontsize=60)
plt.xlabel('Dates of Rallies', **axis_label_style)
plt.ylabel('Topic Relevance in Rally', **axis_label_style)
plt.title('Trump Attacking Political Adversaries', weight='bold', fontsize=80)
sns.despine()
plt.savefig('SNSlineplot1_0.png', transparent=True, bbox_inches='tight')

In [None]:
sns.set(style='white', font_scale=5, rc={'lines.markersize': 15})
fig = plt.figure(figsize=[80, 20])
# x is H2's row label in reverse, so row 0 of H2 is drawn at the far right.
# Assumes H2 carries the default 0..n-1 row labels.
ax = sns.lineplot(x=H2.index[::-1], y=H2['Political Adversaries'], style=True, markers=True, ci=None, color='r')
n_rallies = len(H2)
# Label every 10th rally starting from row 3 of H2; in the reversed x
# coordinates those rows sit at n-4, n-14, ...
marked_positions = np.arange(n_rallies - 4, -1, -10)
lst = [''] * n_rallies
lst[-4::-10] = list(H2['Date'][3::10])
# The x values run from 0 to n-1, so the ticks must start at 0 as well;
# starting at 1 put every date label one step to the right of its rally
# and of the dashed guide line.
plt.xticks(ticks=range(n_rallies), labels=lst)
ymin, ymax = plt.ylim()
plt.vlines(marked_positions, ymin, ymax, linestyle='dashed')
plt.xlabel('Dates of Rallies', weight='bold', fontsize=60)
plt.ylabel('Topic Relevance in Rally', weight='bold', fontsize=60)
plt.title('Trump Attacking Political Adversaries', weight='bold', fontsize=80)
sns.despine()
plt.savefig('SNSlineplot1.png', transparent=True, bbox_inches='tight')

In [None]:
Range = np.arange(8,103,10)

In [None]:
sns.set(style='white',font_scale=1.2,rc = {'lines.markersize': 15} )
fig = plt.figure(figsize=[9,6])
#sns.lineplot(x='Date',y='Political Adversaries',data=H2,style=True,markers=True,ci=None,color='r')
plt.plot(H3['Date'],H3['Political Adversaries'],marker='.',markersize=10,linewidth=1.5,linestyle='-',color='r',label='Actual')
plt.plot(predicted2['Date'],predicted2['Political Adversaries'],marker='.',markersize=10,linewidth=2,linestyle='--',color='k',label='Predicted')
plt.xticks(list(H2['Date'])[3::10])
ymin, ymax = plt.ylim()
plt.vlines(list(H2['Date'])[3::10], ymin, ymax, linestyle=':')
plt.xlim(['2019-11-06','2020-10-12'])
plt.xlabel('Dates of Rallies',weight='bold',fontsize=15)
plt.ylabel('Topic Relevance in Rally',weight='bold',fontsize=15)
plt.title('Trump Attacks Political Adversaries',weight='bold',fontsize=15)
plt.legend()
sns.despine()
plt.savefig('PLTlineplot1.png',transparent=True, bbox_inches='tight')

In [None]:
sns.set(style='white', font_scale=5, rc={'lines.markersize': 15})
fig = plt.figure(figsize=(80, 20))
# 'Plans and Appealing Support' weight of every rally against its date string.
ax = sns.lineplot(
    data=H2,
    x='Date',
    y='Plans and Appealing Support',
    style=True,
    markers=True,
    ci=None,
    color='g',
)
# Only every 20th date gets a tick.
plt.xticks(list(H2['Date'])[::20])
plt.ylim([0, 10])
#ymin, ymax = plt.ylim()
#plt.vlines(list(H2['Date'])[5::8], ymin, ymax, linestyle='dashed')
plt.yticks([0, 2, 4, 6, 8, 10])
axis_label_style = dict(weight='bold', fontsize=60)
plt.xlabel('Dates of Rallies', **axis_label_style)
plt.ylabel('Topic Relevance in Rally', **axis_label_style)
plt.title('Trump Talks About Plans and Appeals for Support', weight='bold', fontsize=80)
sns.despine()
plt.savefig('SNSlineplot2_0.png', transparent=True, bbox_inches='tight')

In [None]:
sns.set(style='white', font_scale=5, rc={'lines.markersize': 15})
fig = plt.figure(figsize=[80, 20])
# x is H2's row label in reverse, so row 0 of H2 is drawn at the far right.
# Assumes H2 carries the default 0..n-1 row labels.
ax = sns.lineplot(x=H2.index[::-1], y=H2['Plans and Appealing Support'], style=True, markers=True, ci=None, color='g')
n_rallies = len(H2)
# Label every 8th rally starting from row 5 of H2; in the reversed x
# coordinates those rows sit at n-6, n-14, ...
marked_positions = np.arange(n_rallies - 6, -1, -8)
lst = [''] * n_rallies
lst[-6::-8] = list(H2['Date'][5::8])
# The x values run from 0 to n-1, so the ticks must start at 0 as well;
# starting at 1 put every date label one step to the right of its rally
# and of the dashed guide line.
plt.xticks(ticks=range(n_rallies), labels=lst)
plt.ylim([0, 10])
ymin, ymax = plt.ylim()
plt.vlines(marked_positions, ymin, ymax, linestyle='dashed')
plt.yticks([0, 2, 4, 6, 8, 10])
plt.xlabel('Dates of Rallies', weight='bold', fontsize=60)
plt.ylabel('Topic Relevance in Rally', weight='bold', fontsize=60)
plt.title('Trump Talks About Plans and Appeals for Support', weight='bold', fontsize=80)
sns.despine()
plt.savefig('SNSlineplot2.png', transparent=True, bbox_inches='tight')

In [None]:
sns.set(style='white', font_scale=1.2, rc={'lines.markersize': 15})
fig = plt.figure(figsize=(9, 6))
# Observed topic weights (solid green) with the values in predicted2
# overlaid (dashed black).
plt.plot(
    H3['Date'], H3['Plans and Appealing Support'],
    marker='.', markersize=10, linewidth=1.5, linestyle='-', color='g', label='Actual',
)
plt.plot(
    predicted2['Date'], predicted2['Plans and Appealing Support'],
    marker='.', markersize=10, linewidth=2, linestyle='--', color='k', label='Predicted',
)
# Every 8th date starting at row 5 of H2 gets a tick and a dotted line.
marked_dates = list(H2['Date'])[5::8]
plt.xticks(marked_dates)
plt.ylim([0, 10])
ymin, ymax = plt.ylim()
plt.vlines(marked_dates, ymin, ymax, linestyle=':')
# Restrict the view to the span between these two date labels.
plt.xlim(['2019-11-06', '2020-10-12'])
plt.yticks([0, 2, 4, 6, 8, 10])
plt.xlabel('Dates of Rallies', weight='bold', fontsize=15)
plt.ylabel('Topic Relevance in Rally', weight='bold', fontsize=15)
plt.title('Trump Talks About Plans, Appeals for Support', weight='bold', fontsize=15)
plt.legend()
sns.despine()
plt.savefig('PLTlineplot2.png', transparent=True, bbox_inches='tight')

In [None]:
sns.set(style='white', font_scale=5, rc={'lines.markersize': 15})
fig = plt.figure(figsize=[80, 20])
# 'Achievements' weight of every rally against its date string.
ax = sns.lineplot(x='Date', y='Achievements', data=H2, style=True, markers=True, ci=None)
# Only every 20th date gets a tick.
plt.xticks(list(H2['Date'])[0::20])
#ymin, ymax = plt.ylim()
#plt.vlines(list(H2['Date'])[8::9], ymin, ymax, linestyle='dashed')
plt.xlabel('Dates of Rallies', weight='bold', fontsize=60)
# The earlier plt.ylabel('Achievements and Bragging') call was dropped: it
# was overwritten by this one before the figure was saved.
plt.ylabel('Topic Relevance in Rally', weight='bold', fontsize=60)
plt.title('Trump Brags about Achievement and Progress', weight='bold', fontsize=80)
sns.despine()
plt.savefig('SNSlineplot3_0.png', transparent=True, bbox_inches='tight')

In [None]:
sns.set(style='white', font_scale=5, rc={'lines.markersize': 15})
fig = plt.figure(figsize=[80, 20])
# x is H2's row label in reverse, so row 0 of H2 is drawn at the far right.
# Assumes H2 carries the default 0..n-1 row labels.
ax = sns.lineplot(x=H2.index[::-1], y=H2['Achievements'], style=True, markers=True, ci=None)
n_rallies = len(H2)
# Label every 9th rally starting from row 8 of H2; in the reversed x
# coordinates those rows sit at n-9, n-18, ...
marked_positions = np.arange(n_rallies - 9, -1, -9)
lst = [''] * n_rallies
lst[-9::-9] = list(H2['Date'][8::9])
# The x values run from 0 to n-1, so the ticks must start at 0 as well;
# starting at 1 put every date label one step to the right of its rally
# and of the dashed guide line.
plt.xticks(ticks=range(n_rallies), labels=lst)
ymin, ymax = plt.ylim()
plt.vlines(marked_positions, ymin, ymax, linestyle='dashed')
plt.xlabel('Dates of Rallies', weight='bold', fontsize=60)
# The earlier plt.ylabel('Achievements and Bragging') call was dropped: it
# was overwritten by this one before the figure was saved.
plt.ylabel('Topic Relevance in Rally', weight='bold', fontsize=60)
plt.title('Trump Brags about Achievement and Progress', weight='bold', fontsize=80)
sns.despine()
plt.savefig('SNSlineplot3.png', transparent=True, bbox_inches='tight')

In [None]:
sns.set(style='white', font_scale=1.2, rc={'lines.markersize': 15})
fig = plt.figure(figsize=(9, 6))
# Observed topic weights (solid blue) with the values in predicted2
# overlaid (dashed black).
plt.plot(
    H3['Date'], H3['Achievements'],
    marker='.', markersize=10, linewidth=1.5, linestyle='-', color='b', label='Actual',
)
plt.plot(
    predicted2['Date'], predicted2['Achievements'],
    marker='.', markersize=10, linewidth=2, linestyle='--', color='k', label='Predicted',
)
# Every 9th date starting at row 8 of H2 gets a tick and a dotted line.
marked_dates = list(H2['Date'])[8::9]
plt.xticks(marked_dates)
ymin, ymax = plt.ylim()
plt.vlines(marked_dates, ymin, ymax, linestyle=':')
# Restrict the view to the span between these two date labels.
plt.xlim(['2019-11-06', '2020-10-12'])
plt.xlabel('Dates of Rallies', weight='bold', fontsize=15)
plt.ylabel('Topic Relevance in Rally', weight='bold', fontsize=15)
plt.title('Trump Brags about Achievement and Progress', weight='bold', fontsize=15)
plt.legend()
sns.despine()
plt.savefig('PLTlineplot3.png', transparent=True, bbox_inches='tight')

## Clustering of Trump Rallies
- K-means for Clustering
- Used Elbow method and Silhouette coefficients to determine optimum clusters
- Used 3D plot in Plotly to visualize clusters

In [None]:
sns.set(style='white',font_scale=1)
inertia = []
for num_clusters in range(1,11):
    km = KMeans(n_clusters=num_clusters,random_state=71)
    km.fit(H[['Achievements','Plans and Appealing Support','Political Adversaries']])
    inertia.append(km.inertia_)
    
plt.plot(range(1,11),inertia,marker='x')
plt.ylabel('Sum Inertia',fontsize=15,weight='bold')
plt.xlabel('No. of Clusters',fontsize=15,weight='bold')
sns.despine()
plt.savefig('Elbow.png',transparent=True, bbox_inches='tight')

In [None]:
sns.set(style='white',font_scale=1)
silhouette = []
for num_clusters in range(2,11):
    km = KMeans(n_clusters=num_clusters,random_state=71)
    km.fit(H[['Achievements','Plans and Appealing Support','Political Adversaries']])
    labels = km.labels_
    silhouette.append(silhouette_score(H[['Achievements','Plans and Appealing Support','Political Adversaries']],labels=labels))
    
plt.plot(range(2,11),silhouette,marker='x',color='g')
plt.ylabel('Silhouette Coefficient',fontsize=15,weight='bold')
plt.xlabel('No. of Clusters',fontsize=15,weight='bold')
sns.despine()
plt.savefig('Silhouette.png',transparent=True, bbox_inches='tight')

In [None]:
km = KMeans(n_clusters=2,random_state=71)
km.fit(H[['Achievements','Plans and Appealing Support','Political Adversaries']])
labels = km.labels_
labels

In [None]:
labels = np.where(labels==0,'Trump Mainly Attacking','Trump Mainly Bragging')


In [None]:
data1 = []
clusters = []
colors = ['rgb(228,26,28)', 'rgb(55,126,184)']  # marker colour of each cluster


def _scene_axis(axis_title, title_color):
    """Return the axis styling shared by all three axes of the 3D scene."""
    # NOTE(review): newer plotly releases deprecate `titlefont` in favour of
    # title=dict(text=..., font=...) - confirm against the installed version.
    return dict(
        gridcolor='rgb(255, 255, 255)',
        zerolinecolor='rgb(255, 255, 255)',
        showbackground=True,
        backgroundcolor='rgb(230, 230,230)',
        title=axis_title,
        titlefont=dict(family='Courier New', size=9, color=title_color),
    )


# One scatter3d trace per distinct cluster label, coloured by its position
# in the sorted unique labels.
for idx, name in enumerate(np.unique(labels)):
    members = H[labels == name]
    trace = dict(
        name=name,
        x=members['Achievements'],
        y=members['Plans and Appealing Support'],
        z=members['Political Adversaries'],
        type="scatter3d",
        mode='markers',
        marker=dict(size=2, color=colors[idx], line=dict(width=0)),
    )
    data1.append(trace)

# Canvas size, figure title and the styling of the 3D scene.
layout = dict(
    width=800,
    height=550,
    autosize=True,
    title='Trump Rally Topic Clusters',
    scene=dict(
        xaxis=_scene_axis('Achievements and Bragging', '#2f2f2f'),
        yaxis=_scene_axis('Plans and Appealing Support', '#4f4f4f'),
        zaxis=_scene_axis('Political Adversaries', '#7f7f7f'),
        aspectratio=dict(x=1, y=1, z=1),  # equal scaling on all three axes
        aspectmode='manual',
    ),
)

# Assemble traces and layout into the figure and render it in the notebook.
fig = dict(data=data1, layout=layout)
iplot(fig)