# Cyber Security Tweet Analysis

In [None]:
from IPython.display import Image
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
sns.set(rc={'figure.figsize':(6,8)}) 

import warnings
warnings.simplefilter("ignore")

%matplotlib inline 

from sklearn.preprocessing import LabelEncoder 

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn import metrics

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from sklearn.metrics import SCORERS
import re

import nltk 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.probability import FreqDist

from sklearn.preprocessing import MinMaxScaler

## Functions

In [None]:
# Function that fits KMeans for a single value of K (the number of clusters)
# and returns the inertia value so we can see how tightly the clusters are grouped


def evaluate_k_kmeans_inertia(k, vec):
    """Fit KMeans with ``k`` clusters on ``vec`` and return the fitted inertia.

    A progress line is printed before fitting. The model is seeded with
    ``random_state=42``.
    """
    print(f"running Kmeans with k={k}")
    model = KMeans(n_clusters=k, random_state=42)
    fitted = model.fit(vec)
    return fitted.inertia_

In [None]:
def splitData(data, len_of_frame):
    """Split ``data`` row-wise into consecutive chunks of ``len_of_frame`` rows.

    The number of chunks is ``round(n / len_of_frame)`` where ``n`` is the
    number of rows, but never fewer than one for non-empty input, so no rows
    are dropped when ``n`` is small relative to ``len_of_frame``. The last
    chunk takes every remaining row and may therefore be shorter or longer
    than ``len_of_frame``.

    Parameters
    ----------
    data : object exposing ``.index`` and row slicing (e.g. a DataFrame)
    len_of_frame : positive number of rows per chunk

    Returns
    -------
    list
        The chunks in order; an empty list when ``data`` has no rows.

    Raises
    ------
    ValueError
        If ``len_of_frame`` is not positive.
    """
    if len_of_frame <= 0:
        raise ValueError("len_of_frame must be a positive number")

    n = len(data.index)
    split_n = round(n / len_of_frame, 0)
    print(split_n)

    if n == 0:
        return []

    # round() can yield 0 for a small non-empty frame; keep at least one chunk
    # so the rows are returned instead of being silently discarded.
    n_chunks = max(int(split_n), 1)

    lst = []
    for chunk_no in range(n_chunks):
        start = int(chunk_no * len_of_frame)
        if chunk_no == n_chunks - 1:
            # the final chunk absorbs the remainder
            lst.append(data[start:])
        else:
            lst.append(data[start:int(len_of_frame * (chunk_no + 1))])
    return lst

In [None]:
def cluster_summary(ci, df, categorical_columns=None):
    """Summarise one cluster as a flat dict.

    Categorical columns are summarised by their mode (first mode when tied)
    and numeric columns by their mean. The ``cluster_id`` column takes part
    in the numeric means, so it appears in the result.

    Parameters
    ----------
    ci : value matched against ``df.cluster_id``
    df : DataFrame with a ``cluster_id`` column
    categorical_columns : optional iterable of column labels to summarise by
        mode. Defaults to the columns of the notebook-level
        ``categorical_data`` frame.

    Returns
    -------
    dict
        Column label -> mode (categorical) or mean (numeric).

    Raises
    ------
    ValueError
        If no row of ``df`` belongs to cluster ``ci``.
    """
    if categorical_columns is None:
        categorical_columns = categorical_data.columns

    cluster = df[df.cluster_id == ci]
    if cluster.empty:
        raise ValueError(f"no rows found for cluster_id == {ci!r}")

    summary = cluster[list(categorical_columns)].mode().to_dict(orient="records")[0]
    # numeric_only=True: recent pandas raises when mean() meets non-numeric
    # columns instead of silently skipping them.
    summary.update(cluster.mean(numeric_only=True).to_dict())
    return summary

In [None]:
# We can make a function that puts cluster summaries side by side so it's easier for us to compare them in a data frame

def cluster_comparison(*cluster_ids):
    """Return a frame with one column per requested cluster.

    Each cluster is summarised with ``cluster_summary`` against the
    notebook-level ``df2``; the summaries are stacked, indexed by
    ``cluster_id`` and transposed so clusters sit side by side.
    """
    rows = [cluster_summary(cid, df2) for cid in cluster_ids]
    stacked = pd.DataFrame(rows)
    return stacked.set_index("cluster_id").T

In [None]:
def kmeans_cluster(df, n_clusters=2):
    """Return a copy of ``df`` with a ``Cluster`` column of KMeans labels.

    KMeans is fit on ``df`` with ``n_clusters`` clusters and
    ``random_state=42``; the input frame itself is left untouched.
    """
    labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(df)
    return df.assign(Cluster=labels)

In [None]:
def graph_summary_clustering(results):
    """Return the per-cluster mean of every column, indexed by ``Cluster``.

    Parameters
    ----------
    results : DataFrame with a ``Cluster`` column (as produced by
        ``kmeans_cluster``).

    Returns
    -------
    DataFrame
        One row per cluster, columns sorted by label, values are the column
        means within that cluster.
    """
    # The per-cluster means are the whole result. Computing cluster sizes,
    # merging them in and dropping them again added nothing and failed when
    # the input already had a "Count" column.
    cluster_means = results.groupby("Cluster").mean()
    return cluster_means[sorted(cluster_means.columns)]

## Text Analysis

### Word Frequency 

In [None]:
# Flatten every row of the text column into its space-separated tokens
cs_words = [token for row in text['text'] for token in str(row).split(" ")]

cs_words[:5]

In [None]:
# Total number of tokens collected in cs_words
len(cs_words)

In [None]:
# FreqDist tallies the tokens when given the iterable directly
fdist = FreqDist(cs_words)
fdist

In [None]:
# Column 0 holds the word, column 1 its count, for the 20 most frequent tokens
freq_data = pd.DataFrame(fdist.most_common(20))
sns.barplot(x=1, y=0, data=freq_data);

In [None]:
# Keep just the words (not the counts) of the 25 most frequent tokens
most_common = [word for word, _count in fdist.most_common(25)]

print(most_common)

In [None]:
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
stopwords = set(STOPWORDS)
# NOTE(review): iterating `text` adds its column labels (assuming it is a
# DataFrame) to the stopword set — confirm that this is intended.
new_stopwords = stopwords.union(text)
wc = WordCloud(
    background_color="white", max_words=2000,
    min_font_size=15, max_font_size=40, relative_scaling=0.5,
    stopwords=new_stopwords, normalize_plurals=True)

# Build the corpus from every row of the text column. str(text) only yields
# the printed representation of the frame (index, header and whatever rows and
# characters pandas chooses to display), so the cloud was not based on the
# full set of tweets. The former re.sub(" ", " ", ...) replaced a space with a
# space and has been dropped.
textonly = " ".join(text['text'].astype(str))
wc.generate(textonly)
plt.figure(figsize=(25,25))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")

#Show the wordcloud
plt.show()
# devil is interesting, Intel is possibly the company, 
# Jeanette Manfra is commonly found when searching jeannette
# and cybersecurity
# Microsoft Azure Sentinel is a scalable, cloud-native, security information 
# event management (SIEM) and security orchestration automated response (SOAR) solution. 
# Los Angeles and Pakistan are the only locations, chinese is only nationality

### Sentiment Analysis

In [None]:
# Score every row of the text column with the VADER analyzer
sid = SentimentIntensityAnalyzer()
text['scores'] = text['text'].apply(sid.polarity_scores)
text.head()

In [None]:
# Pull the 'compound' entry out of each score dict
text['compound'] = [scores['compound'] for scores in text['scores']]
text.head()

In [None]:
# Label each compound score: <= -0.05 is 'neg', >= 0.05 is 'pos', otherwise 'neutral'
text['comp_score'] = text['compound'].map(
    lambda c: 'neg' if c <= -0.05 else ('pos' if c >= 0.05 else 'neutral')
)
text.head()

In [None]:
# Share of each sentiment label, drawn on the axes created here
fig, ax = plt.subplots(figsize=(7,7))
comp_score_counts = text['comp_score'].value_counts()
comp_score_counts.plot.pie(ax=ax, autopct="%.1f%%");

### Cluster Analysis

In [None]:
%%time 
range_k = [i for i in range(1, 9)]
results_k = {}
for k in range_k:
    results_k[k] = evaluate_k_kmeans_inertia(k, text_vec)

In [None]:
# Plot inertia against K to find the inflection point, i.e. where adding more
# clusters no longer yields much extra information.

fig, ax = plt.subplots(figsize=(8,6))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally
sns.lineplot(
    x=list(results_k.keys()),
    y=list(results_k.values()),
    label="inertia", color="red", ax=ax)
ax.set_xlabel("K")
ax.set_ylabel("inertia")
ax.set_title("Inertia by K")
ax.legend();

In [None]:
# K is set to 6 for now to see what happens
estimator_kmeans = KMeans(n_clusters=6, random_state=42).fit(text_vec)
estimator_kmeans

In [None]:
# Attach the label of each row from the fitted KMeans model to text_vec.
# NOTE(review): this adds a column to the same frame the model was fit on, so
# re-running the fit cell afterwards would treat cluster_id as a feature.
text_vec["cluster_id"] = estimator_kmeans.labels_
text_vec.head()

In [None]:
# Column totals within each cluster
text_clusters = text_vec.groupby('cluster_id').agg('sum')
text_clusters.head()

In [None]:
# Only focus on the most popular words (24 are listed here; 'u' is oddly not in the data frame)
text_clusters_20 = text_clusters.loc[:, [
    'data', 'new', 'attack', 'infosec', 'ransomware', 'amp',
    'business', 'threat', 'hacker', 'ai', 'hacking', 'cyberattack',
    'company', 'learn', 'risk', '2021', 'ethicalhacking', 'cloud',
    'cybersecuritytips', 'cybersecuritynews', 'iotcybersec24', 'help',
    'system', 'need',
]]
text_clusters_20.head()

In [None]:
# Words as rows, clusters as columns; rows with a lot of variation point to
# the defining features of a cluster
fig, ax = plt.subplots(figsize=(25,15))
sns.heatmap(text_clusters_20.T, annot=False, ax=ax)
ax.set_title("Cluster Analysis based on Twitter Posts using the Top 25 Words", size=30);

## Identifying Variable Analysis

### Word Frequency | Description

In [None]:
# Flatten every row of the description column into its space-separated tokens
desc_words = [token for row in desc['description'] for token in str(row).split(" ")]

desc_words[:5]

In [None]:
# FreqDist tallies the tokens when given the iterable directly
fdist_desc = FreqDist(desc_words)
fdist_desc

In [None]:
# Column 0 holds the word, column 1 its count, for the 20 most frequent tokens
freq_data_desc = pd.DataFrame(fdist_desc.most_common(20))
sns.barplot(x=1, y=0, data=freq_data_desc);

In [None]:
# Keep just the words (not the counts) of the 20 most frequent tokens
most_common_desc = [word for word, _count in fdist_desc.most_common(20)]

most_common_desc

### Cluster Analysis | Description

In [None]:
%%time 
range_k = [i for i in range(1, 9)]
results_k = {}
for k in range_k:
    results_k[k] = evaluate_k_kmeans_inertia(k, desc_vec)

In [None]:
# Inertia against K for the description vectors
fig, ax = plt.subplots(figsize=(8,6))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally
sns.lineplot(
    x=list(results_k.keys()),
    y=list(results_k.values()),
    label="inertia", color="red", ax=ax)
ax.set_xlabel("K")
ax.set_ylabel("inertia")
ax.set_title("Inertia by K")
ax.legend();

In [None]:
# K is set to 5 for now (the elbow curve is janky)
estimator_kmeans = KMeans(n_clusters=5, random_state=42).fit(desc_vec)
estimator_kmeans

In [None]:
# Attach the label of each row from the fitted KMeans model to desc_vec.
# NOTE(review): this adds a column to the same frame the model was fit on, so
# re-running the fit cell afterwards would treat cluster_id as a feature.
desc_vec["cluster_id"] = estimator_kmeans.labels_
desc_vec.head()

In [None]:
# Column totals within each cluster
desc_clusters = desc_vec.groupby('cluster_id').agg('sum')
desc_clusters.head()

In [None]:
# Restrict the cluster totals to the most frequent description words (19 listed here)
desc_clusters_20 = desc_clusters.loc[:, [
    'business', 'technology', 'service', 'solution', 'news',
    'digital', 'tech', 'data', 'cloud', 'company',
    'help', 'tweet', 'information', 'world', 'software',
    'management', 'global', 'ai', 'professional',
]]

desc_clusters_20

In [None]:
# Words as rows, clusters as columns; rows with a lot of variation point to
# the defining features of a cluster
fig, ax = plt.subplots(figsize=(25,15))
sns.heatmap(desc_clusters_20.T, annot=False, ax=ax)
ax.set_title("Cluster Analysis based on Twitter Description using the Top 20 Words", size=30);

### Cluster Analysis | Other Columns

#### Processing Data

In [None]:
# First 1000 rows of the identifying columns.
# is_retweet and protected are left out: they show no variance.
ident = data.loc[:, [
    "created_at", "is_quote", "retweet_count", "location",
    "followers_count", "friends_count", "listed_count",
    "account_created_at", "verified",
]].iloc[:1000]
ident.head()

In [None]:
# Numeric columns on one side, every remaining column on the other
numerical_data = ident.select_dtypes(include=np.number)
categorical_data = ident.drop(columns=list(numerical_data))

In [None]:
# Min-max scale the numeric columns and wrap the result back into a data
# frame carrying the original column labels
numerical_data_normalized = pd.DataFrame(
    MinMaxScaler().fit_transform(numerical_data),
    columns=numerical_data.columns,
)

numerical_data_normalized.head()

In [None]:
categorical_data_codified = pd.get_dummies(
                                    categorical_data, 
                                    drop_first=True,
                                    dtype="int64"
).reset_index()
categorical_data_codified.head()

In [None]:
df2_processed = pd.concat([
                            numerical_data_normalized,
                            categorical_data_codified
                        ], axis=1
).reset_index()

In [None]:
# Preview the combined feature frame that is passed to KMeans below
df2_processed.head()

#### Analysis

In [None]:
%%time 
range_k = [2, 3, 4, 5, 8, 10, 15, 20, 25, 30]
results_k = {}
for k in range_k:
    results_k[k] = evaluate_k_kmeans_inertia(k, df2_processed)

In [None]:
# Inertia against K for the processed identifying columns
fig, ax = plt.subplots(figsize=(8,6))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally
sns.lineplot(
    x=list(results_k.keys()),
    y=list(results_k.values()),
    label="inertia", color="red", ax=ax)
ax.set_xlabel("K")
ax.set_ylabel("inertia")
ax.set_title("Inertia by K")
ax.legend();

In [None]:
# K is set to 10 for now (the elbow curve is janky)
estimator_kmeans = KMeans(n_clusters=10, random_state=42).fit(df2_processed)
estimator_kmeans

In [None]:
# Attach the labels from the KMeans fit on df2_processed to df2.
# NOTE(review): df2 is not defined in the visible cells — presumably it holds
# the same rows, in the same order, as df2_processed; confirm.
df2["cluster_id"] = estimator_kmeans.labels_
df2.head()

In [None]:
# Number of rows assigned to each cluster
df2["cluster_id"].value_counts()

In [None]:
# Compare clusters 0 through 9 side by side
cluster_comparison(*range(10))

In [None]:
# Re-cluster the processed features into 5 groups, then take per-cluster means
cluster_results = kmeans_cluster(df2_processed, n_clusters=5)
cluster_summary2 = graph_summary_clustering(results=cluster_results)

In [None]:
# Display the per-cluster means; the optional drop of a 'cluster_id' column
# below is left disabled.
#cluster_summary2 = cluster_summary2.drop('cluster_id', axis = 1)
cluster_summary2

In [None]:
# Features as rows, clusters as columns
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(cluster_summary2.T, annot=False, ax=ax);