### Analyzing the graph of relationships between usernames and the Twitter handles mentioned in their tweets, both for tweets about ISIS and for tweets by the ISIS fanboys. We find that the most central Twitter handle among the ISIS fanboys is, by far, 'Uncle_SamCoco', followed by 'RamiAlLolah', 'WarReporter1', 'MaghrabiArabi' and 'mobi_ayubi'.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib
from matplotlib import *
%matplotlib inline
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
import os

# List the input files in pure Python instead of shelling out to `ls`, which
# is not available on every platform. Dot-files are skipped to mirror the
# default behaviour of plain `ls`.
print("\n".join(sorted(name for name in os.listdir("../input")
                       if not name.startswith("."))))

# Any results you write to the current directory are saved as output.

In [None]:
# Both CSV files live in the same directory and are read with the same encoding.
INPUT_DIR = '../input/'
CSV_ENCODING = "ISO-8859-1"


def _load_tweets(filename):
    """Read one tweet CSV from the input directory into a DataFrame."""
    return pd.read_csv(INPUT_DIR + filename, encoding=CSV_ENCODING)


about_data = _load_tweets('AboutIsis.csv')
fanboy_data = _load_tweets('IsisFanboy.csv')

In [None]:
# Show the column labels of the "about ISIS" frame.
about_data.columns

In [None]:
import re

# A handle is '@' followed by word characters. The look-behind skips an '@'
# that sits inside a word (e.g. an e-mail address), and capturing only the
# word characters drops trailing punctuation so '@user' and '@user:' agree.
_HANDLE_PATTERN = re.compile(r'(?<!\w)@(\w+)')


def _tokenize_tweets(tweets):
    """Split every tweet on whitespace, returning one token list per row.

    Missing tweets become an empty token list rather than the literal token
    'nan', so the result stays row-aligned with the source column.
    """
    return [str(tweet).split() for tweet in tweets.fillna('')]


def _extract_handles(token_lists):
    """Return every mentioned handle, normalised to '@' + word characters."""
    return ['@' + handle
            for tokens in token_lists
            for token in tokens
            for handle in _HANDLE_PATTERN.findall(token)]


fanboy_space_split = _tokenize_tweets(fanboy_data['tweets'])
fanboy_handles = _extract_handles(fanboy_space_split)
about_space_split = _tokenize_tweets(about_data['tweets'])
about_handles = _extract_handles(about_space_split)

In [None]:
def _users_per_handle(usernames, handles):
    """Return the number of distinct usernames divided by distinct handles."""
    distinct_users = set(usernames)
    distinct_handles = set(handles)
    return len(distinct_users) / len(distinct_handles)


# Fanboy ratio first, then the "about" ratio, printed on a single line.
print(_users_per_handle(fanboy_data['username'], fanboy_handles),
      _users_per_handle(about_data['username'], about_handles))

In [None]:
import networkx as nx

In [None]:
import re

# Capture only the word characters after '@'. Slicing off the first character
# of any token that merely *contains* '@' mangles tokens such as '.@user'
# (-> '@user') or '@user:' (-> 'user:'), which creates spurious graph nodes.
_MENTION_PATTERN = re.compile(r'(?<!\w)@(\w+)')


def _mention_edges(usernames, token_lists):
    """Return (username, mentioned_handle) pairs, one per mention.

    `usernames` and `token_lists` are iterated in parallel, so they must be
    row-aligned (one token list per username).
    """
    return [(username, handle)
            for username, tokens in zip(usernames, token_lists)
            for token in tokens
            for handle in _MENTION_PATTERN.findall(token)]


fanboy_edges = _mention_edges(fanboy_data['username'], fanboy_space_split)
about_edges = _mention_edges(about_data['username'], about_space_split)

In [None]:
# Start from two empty undirected graphs; the edges are added in a later cell.
about_graph, fanboy_graph = nx.Graph(), nx.Graph()

In [None]:
# Load each edge list into its matching graph.
for graph, edges in ((about_graph, about_edges), (fanboy_graph, fanboy_edges)):
    graph.add_edges_from(edges)

In [None]:
def _edges_per_node(graph):
    """Return the number of edges divided by the number of nodes.

    Computed directly as size / order instead of 1 / (order / size), which
    raised ZeroDivisionError for a graph without edges. A graph without
    nodes yields 0.0.
    """
    node_count = graph.order()
    if not node_count:
        return 0.0
    return graph.size() / float(node_count)


print(_edges_per_node(fanboy_graph))
print(_edges_per_node(about_graph))

In [None]:
# `nx.connected_component_subgraphs` was removed in NetworkX 2.4, and the
# iteration order of components is not documented to be by size, so taking
# element [0] did not reliably select the main component. Sort the node sets
# by size and analyse the largest one explicitly.
fanboy_cc = sorted(nx.connected_components(fanboy_graph), key=len, reverse=True)
bet_cen = nx.betweenness_centrality(fanboy_graph.subgraph(fanboy_cc[0]))

In [None]:
# `nx.connected_component_subgraphs` was removed in NetworkX 2.4, and the
# iteration order of components is not documented to be by size, so taking
# element [0] did not reliably select the main component. Sort the node sets
# by size and analyse the largest one explicitly.
fanboy_cc = sorted(nx.connected_components(fanboy_graph), key=len, reverse=True)
clo_cen = nx.closeness_centrality(fanboy_graph.subgraph(fanboy_cc[0]))

In [None]:
# Import pyplot explicitly: `import matplotlib` alone does not load the
# pyplot submodule, so `matplotlib.pyplot` only resolved as a side effect of
# the inline-backend magic.
import matplotlib.pyplot as plt

# Axis window for the plot (values carried over unchanged).
CLOSENESS_RANGE = (0.32, 0.45)
BETWEENNESS_RANGE = (0.04, 0.3)

# Pair the two measures by node key instead of relying on both dicts sharing
# the same iteration order; a node missing from `bet_cen` now fails loudly.
nodes = list(clo_cen)
closeness = [clo_cen[node] for node in nodes]
betweenness = [bet_cen[node] for node in nodes]

fig, ax = plt.subplots()
ax.scatter(closeness, betweenness)
ax.set_yscale('log')
ax.set_xlim(*CLOSENESS_RANGE)
ax.set_ylim(*BETWEENNESS_RANGE)
ax.set_xlabel("Closeness Centrality")
ax.set_ylabel("Betweenness Centrality")

# Label only the points that fall inside the displayed window; the values
# are read from the lists built once above rather than rebuilding
# list(dict.values()) on every iteration.
for node, x, y in zip(nodes, closeness, betweenness):
    inside_x = CLOSENESS_RANGE[0] <= x <= CLOSENESS_RANGE[1]
    inside_y = BETWEENNESS_RANGE[0] <= y <= BETWEENNESS_RANGE[1]
    if inside_x and inside_y:
        ax.annotate(node, (x, y))

In [None]:
import re

_NON_LETTERS = re.compile("[^a-zA-Z]")


def _clean_words(token_lists):
    """Flatten the token lists into lower-cased, letters-only strings.

    Tokens containing '@' or '#' are dropped; in every remaining token each
    non-letter character is replaced by a space before lower-casing.
    """
    cleaned = []
    for tokens in token_lists:
        for token in tokens:
            if '@' in token or '#' in token:
                continue
            cleaned.append(_NON_LETTERS.sub(" ", token).lower())
    return cleaned


fanboy_text = _clean_words(fanboy_space_split)
about_text = _clean_words(about_space_split)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


def _fit_count_vectorizer(documents):
    """Fit a CountVectorizer with English stop words; return (vectorizer, counts)."""
    vectorizer = CountVectorizer(stop_words='english')
    return vectorizer, vectorizer.fit_transform(documents)


fc_vectorizer, fanboy_counts = _fit_count_vectorizer(fanboy_text)
ac_vectorizer, about_counts = _fit_count_vectorizer(about_text)

In [None]:
# NOTE(review): the previous code sliced to 1000 in three places and 3000 in
# one, then looped 3000 times and indexed past the end of the 1000-element
# lists. A single limit of 1000 is assumed here -- confirm the intended size.
N_TOP_TERMS = 1000


def _top_term_counts(vectorizer, counts, n_terms):
    """Return (terms, frequencies) for the `n_terms` most frequent terms.

    `vectorizer.vocabulary_` maps each term to its *column index* in
    `counts`, not to a frequency, so the real frequencies are obtained by
    summing each column of the document-term matrix.
    """
    totals = np.asarray(counts.sum(axis=0)).ravel()
    index_to_term = {int(index): term
                     for term, index in vectorizer.vocabulary_.items()}
    # Stable sort on the negated totals: most frequent first, ties kept in
    # column order so the result is deterministic.
    top_columns = np.argsort(-totals, kind='stable')[:n_terms]
    terms = [index_to_term[int(column)] for column in top_columns]
    frequencies = [int(totals[column]) for column in top_columns]
    return terms, frequencies


fc_vocab, fc_freq = _top_term_counts(fc_vectorizer, fanboy_counts, N_TOP_TERMS)
ac_vocab, ac_freq = _top_term_counts(ac_vectorizer, about_counts, N_TOP_TERMS)

# Repeat every term once per occurrence, one single-term entry per repeat.
fanboy_freq_text = [term
                    for term, frequency in zip(fc_vocab, fc_freq)
                    for _ in range(frequency)]
about_freq_text = [term
                   for term, frequency in zip(ac_vocab, ac_freq)
                   for _ in range(frequency)]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


def _fit_tfidf(documents):
    """Fit a TfidfVectorizer (min_df=1); return (vectorizer, tf-idf matrix)."""
    vectorizer = TfidfVectorizer(min_df=1)
    return vectorizer, vectorizer.fit_transform(documents)


fanboy_vectorizer, fanboy_tfidf = _fit_tfidf(fanboy_freq_text)
about_vectorizer, about_tfidf = _fit_tfidf(about_freq_text)

In [None]:
# Show the fanboy TF-IDF matrix produced by the fit_transform call above.
print(fanboy_tfidf)

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted feature names for each component of `model`.

    For every row of `model.components_` a "Topic #<index>:" header is
    printed, followed by one line holding the `n_top_words` feature names
    with the largest weights in descending order. A single blank line is
    printed after all topics.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort is ascending: reverse it, then keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_names = [feature_names[position] for position in ranked]
        print(" ".join(top_names))
    print()

In [None]:
from sklearn.decomposition import NMF

# n_samples and n_features are not referenced by any other visible cell;
# they are kept so the notebook namespace is unchanged.
n_samples = 2000
n_features = 1000
n_topics = 10
n_top_words = 20

# Both corpora are factorised with identical settings.
# NOTE(review): `alpha` is deprecated in newer scikit-learn releases in
# favour of alpha_W / alpha_H -- confirm against the installed version.
nmf_settings = dict(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5)
fanboy_nmf = NMF(**nmf_settings).fit(fanboy_tfidf)
about_nmf = NMF(**nmf_settings).fit(about_tfidf)

In [None]:
# `get_feature_names()` was removed in scikit-learn 1.2; prefer
# `get_feature_names_out()` when the installed version provides it and fall
# back to the old method otherwise.
if hasattr(fanboy_vectorizer, "get_feature_names_out"):
    fanboy_feature_names = list(fanboy_vectorizer.get_feature_names_out())
else:
    fanboy_feature_names = fanboy_vectorizer.get_feature_names()
print_top_words(fanboy_nmf, fanboy_feature_names, n_top_words)

In [None]:
# `get_feature_names()` was removed in scikit-learn 1.2; prefer
# `get_feature_names_out()` when the installed version provides it and fall
# back to the old method otherwise.
if hasattr(about_vectorizer, "get_feature_names_out"):
    about_feature_names = list(about_vectorizer.get_feature_names_out())
else:
    about_feature_names = about_vectorizer.get_feature_names()
print_top_words(about_nmf, about_feature_names, n_top_words)