База данных писем скачана и расположена рядом с файлом ipynb в файле database.sqlite. Посмотрим на раздел Emails.

In [64]:
import sqlite3
import pandas as pd
conn = sqlite3.connect('database.sqlite')
c = conn.cursor()
c.execute("SELECT * FROM Emails")
field_names = [i[0] for i in c.description]
print(field_names)

['Id', 'DocNumber', 'MetadataSubject', 'MetadataTo', 'MetadataFrom', 'SenderPersonId', 'MetadataDateSent', 'MetadataDateReleased', 'MetadataPdfLink', 'MetadataCaseNumber', 'MetadataDocumentClass', 'ExtractedSubject', 'ExtractedTo', 'ExtractedFrom', 'ExtractedCc', 'ExtractedDateSent', 'ExtractedCaseNumber', 'ExtractedDocNumber', 'ExtractedDateReleased', 'ExtractedReleaseInPartOrFull', 'ExtractedBodyText', 'RawText']


Нас интересует поле RawText, в котором хранятся тексты писем. Извлечем эти данные из базы данных.

In [65]:
Emails = c.fetchall()
Emails2 = [tuple(elm,) for elm in Emails]
Emails3 = pd.DataFrame(Emails2, columns = [u'Id', u'DocNumber', u'MetadataSubject', u'MetadataTo', u'MetadataFrom', u'SenderPersonId', u'MetadataDateSent', u'MetadataDateReleased', u'MetadataPdfLink', u'MetadataCaseNumber', u'MetadataDocumentClass', u'ExtractedSubject', u'ExtractedTo', u'ExtractedFrom', u'ExtractedCc', u'ExtractedDateSent', u'ExtractedCaseNumber', u'ExtractedDocNumber', u'ExtractedDateReleased', u'ExtractedReleaseInPartOrFull', u'ExtractedBodyText', u'RawText'])

emails = Emails3.RawText.values.tolist()
text = ' '.join([word for word in emails])

Предобработка текста. Вначале удалим из текста все вспомогательные символы, которые не несут смысловой нагрузки.

In [66]:
from gensim import corpora, models, similarities
import re
import numpy as np
def rmUselessSymbols(texts): 
    ctext = re.sub(r'[[]$$~`*=-_+&#(){}<>,\.!?;:\'"/\\\_|%', '', texts) 
    ctext = ctext.replace("^", " ")
    ctext = ctext.replace("°", " ")
    ctext = ctext.replace("•", " ")
    return ctext
EmailsNonAlpha = rmUselessSymbols(text)

Посмотрев на текст, можно увидеть стандартный текст во многим письмах. Вырежем его также. Уменьшим размер датасета до 1000 для приемлимости времени работы программы.

In [69]:
EmailsNonAlpha = EmailsNonAlpha.split("UNCLASSIFIED")
EmailsList = []
for i in range(1000):
    EmailsList.append(EmailsNonAlpha[i])

stop_words = stopwords.words("english")
lemmatizer = WordNetLemmatizer()

def deleteTechText(text):
    text = re.sub(r"<[\w\.-]+@[\w\.-]+>", "EMAIL", text.lower())
    text = re.sub(r"([0-9]|0[0-9]|1[0-9]|2[0-3])(:|-)[0-5][0-9]", "", text) 
    text = re.sub(r" [0-9]+\.?[0-9]*", " ", text)
    text = re.sub(r"https?:\\/\\/(www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\+.~#?&//=]*)", "", text)
    
    text = re.sub("\d", " ", text)
    text = re.sub(r"\W+", " ", text).strip()
    text = (lemmatizer.lemmatize(word) for word in text.split() if len(word) > 3)
    text = [word for word in text if not (word in stop_words)] # remove stop words
    tags = nltk.pos_tag(text)
    text = " ".join([tag[0] for tag in tags if ("NN" in tag[1]) or ("VB" in tag[1])])
    return text

In [73]:
wordsBigr = " ".join(deleteTechText(fragment) for fragment in EmailsList).strip().split()
msg_temp = (deleteTechText(fragment) for fragment in EmailsList if fragment)
messages = np.array([message for message in msg_temp if len(message.split()) > 3])

In [None]:
Найдем часто встречаемые биграммы в датасете.

In [75]:
from itertools import islice, zip_longest
from collections import Counter

words = re.findall("\w+", str(wordsBigr))
print(Counter(zip(words, islice(words, 1, None))).most_common(10))

[(('department', 'state'), 1001), (('state', 'dept'), 991), (('dept', 'produced'), 991), (('produced', 'house'), 991), (('house', 'select'), 991), (('agreement', 'information'), 991), (('information', 'redaction'), 991), (('redaction', 'foia'), 991), (('case', 'date'), 980), (('select', 'benghazi'), 979)]


Найдем коллокации:

In [76]:
import nltk
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(wordsBigr)
finder.nbest(bigram_measures.pmi, 20)

[('_fibril', 'prolonged'),
 ('abetlin', 'hum'),
 ('adms', 'stavridis'),
 ('africom', 'planner'),
 ('alon', 'olson'),
 ('ambs', 'serbia'),
 ('annoyance', 'qataar'),
 ('anthony', 'cordesinan'),
 ('apache', 'longbow'),
 ('ashamed', 'shalgam'),
 ('assembly', 'terry'),
 ('attract', 'investor'),
 ('baitul', 'mokarram'),
 ('bark', 'ruggles'),
 ('bashing', 'hammer'),
 ('bassador', 'chri'),
 ('belated', 'honoring'),
 ('bianna', 'golodryga'),
 ('blow', 'qaddall'),
 ('bondy', 'tsou')]

In [79]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=500, min_df=10)
matrix = vectorizer.fit_transform(messages).toarray()
print(matrix.shape)

(993, 804)


In [81]:
from sklearn.cluster.hierarchical import AgglomerativeClustering

model = AgglomerativeClustering(n_clusters=10,affinity='euclidean', linkage='complete')
preds = model.fit_predict(matrix)
print(list(preds))

[1, 4, 7, 4, 7, 4, 7, 4, 1, 4, 0, 0, 4, 7, 4, 7, 0, 7, 4, 0, 4, 1, 4, 1, 4, 1, 4, 7, 4, 7, 0, 7, 4, 0, 4, 5, 4, 3, 4, 3, 4, 1, 4, 2, 4, 2, 4, 2, 4, 7, 4, 3, 4, 1, 4, 2, 4, 2, 4, 2, 4, 7, 4, 1, 4, 0, 4, 0, 4, 0, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 2, 4, 2, 4, 2, 4, 7, 4, 1, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 1, 4, 1, 4, 2, 4, 2, 4, 2, 4, 2, 4, 1, 4, 1, 4, 1, 4, 3, 4, 3, 4, 0, 4, 8, 4, 1, 4, 1, 1, 4, 7, 4, 1, 4, 7, 4, 7, 4, 2, 4, 7, 4, 1, 4, 4, 4, 1, 4, 4, 4, 4, 4, 5, 4, 1, 4, 1, 7, 4, 7, 4, 1, 4, 1, 7, 4, 7, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 1, 4, 3, 4, 3, 4, 1, 4, 4, 4, 3, 4, 3, 4, 1, 4, 3, 4, 3, 4, 1, 4, 1, 4, 3, 4, 3, 4, 3, 4, 8, 4, 3, 4, 3, 4, 3, 4, 2, 4, 2, 4, 7, 4, 2, 4, 2, 4, 7, 4, 1, 4, 3, 4, 3, 4, 3, 4, 1, 4, 3, 4, 3, 4, 3, 4, 3, 4, 5, 4, 1, 4, 8, 4, 3, 4, 3, 4, 0, 4, 1, 4, 3, 4, 3, 4, 9, 4, 0, 4, 1, 4, 1, 4, 1, 4, 1, 4, 3, 

In [86]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=800)
features = svd.fit_transform(matrix)
svd.explained_variance_ratio_.sum()

0.99999999999999944

In [87]:
from sklearn.cluster import KMeans

model = KMeans(n_clusters=5, random_state=1)
preds = model.fit_predict(matrix)
print(preds[:25])
print("=========================")
model = KMeans(n_clusters=10, random_state=1)
preds = model.fit_predict(matrix)
print(preds[:25])
print("=========================")
model = KMeans(n_clusters=15, random_state=1)
preds = model.fit_predict(matrix)
print(preds[:25])
print("=========================")
model = KMeans(n_clusters=20, random_state=1)
preds = model.fit_predict(matrix)
print(preds[:25])
print("=========================")
model = KMeans(n_clusters=25, random_state=1)
preds = model.fit_predict(matrix)
print(preds[:25])

[4 0 2 0 2 0 2 0 4 0 4 4 0 0 0 2 0 2 0 0 0 4 0 0 0]
[9 1 5 1 5 1 5 1 9 1 9 1 1 1 1 5 1 5 1 1 1 9 1 0 1]
[ 9  1 13  1 13  1 13  1  9  1  9  1  1 13  1 13  1 13  1  1  1  9  1  0  1]
[16  0  7  0  7  0  7  0 16  0 16  0  0  7  0  7  0  7  0  0  0 16  0  0  0]
[18  0 24  0 24  0 16  0 13  0 13  0  0 24  0 24  0 16  0  0  0  0  0  0  0]


У 10 кластеров наилучший результат распределения.

In [88]:
model = KMeans(n_clusters=10, random_state=1)
preds = model.fit_predict(matrix)
print(preds[:25])

[9 1 5 1 5 1 5 1 9 1 9 1 1 1 1 5 1 5 1 1 1 9 1 0 1]


In [89]:
def topNWords(cluster,n):
    joinedCluster = " ".join(cluster)
    words = joinedCluster.split()
    count = Counter(words)
    top_10 = count.most_common(10)
    top_words = re.findall(r'\b[a-z]\w+\b', str(top_10))
    text_top = " ".join(word for word in top_words)
    
    return text_top

for i in range(10):
    print("cluster size: ", messages[preds == i].shape[0])
    print(topNWords(messages[preds == i], 10))
    print()

cluster size:  116
state said benghazi security attack department stevens house libya romney

cluster size:  602
state department house benghazi case date agreement dept produced select

cluster size:  32
sent september monica hanley state message call king mailto hanleymr

cluster size:  32
state rice president benghazi iran information gregory think government case

cluster size:  4
libya qaddafi egypt part crisis state islamist force source rebel

cluster size:  109
state source government magariaf libya security information benghazi minister force

cluster size:  3
ambassador rice press morning nation sullivan meet transcript state sent

cluster size:  4
death issue state bloomberg say sent october said mill cheryl

cluster size:  20
clinton secretary question yeah state think going people said know

cluster size:  71
state sent sullivan september benghazi jacob department house date information



In [93]:
from sklearn.mixture import GaussianMixture
clustering = GaussianMixture(n_components=10)
clustering.fit(features)
preds = clustering.predict(features)
for i in range(10):
    print("cluster size: ", messages[preds == i].shape[0])
    print(topNWords(messages[preds == i], 10))

cluster size:  3
state david department case date dept produced house select benghazi
cluster size:  867
state benghazi department house information agreement case date produced dept
cluster size:  6
country security effort team need diplomacy state risk crisis event
cluster size:  32
sent september monica hanley state message call king mailto hanleymr
cluster size:  36
magariaf cabinet minister state government president source abushagur according october
cluster size:  8
death leader laden state source libya concerned group aqim information
cluster size:  5
source official attack group magariaf state information benghazi security government
cluster size:  8
libya state source aqim rebel weapon report concerned intended equipment
cluster size:  8
source security magariaf attack state consulate government cover militia benghazi
cluster size:  20
clinton secretary question yeah state think going people said know
