# Load Data and convert to JSON

In [44]:
import os
import pprint

# Directory holding the raw report files; only entries ending in ".txt" are parsed.
dataDir = './dataset'
dataset = os.listdir(dataDir)

# Maps each file's index in the directory listing to its parsed record
# ({"title", "date", "text"}). Keys are not contiguous when the directory
# also contains non-.txt entries, because skipped entries still consume an index.
jsonDataSet = dict()


for count, dataFile in enumerate(dataset):

    # os.path.splitext handles names without any dot (no IndexError) and
    # names with several dots (only the final suffix is the extension).
    if os.path.splitext(dataFile)[1] != ".txt":
        continue

    # The context manager guarantees the handle is closed even if parsing
    # below raises; only the first line of each file is used.
    with open(os.path.join(dataDir, dataFile), 'r') as fileobj:
        firstLine = fileobj.readline()

    # The slicing below implies a first line shaped like
    # "<prefix><date>.<title>-----<text>".
    # NOTE(review): the 12-character prefix length is taken from the original
    # code; confirm it against the actual file format.
    segments = firstLine.split("-----")
    header = segments[0].split(".")

    title = header[1]
    date = header[0][12:]
    text = segments[1]

    file = {
        "title": title,
        "date": date,
        "text": text,
    }

    jsonDataSet.update({count: file})

pprint.pprint(jsonDataSet)

{0: {'date': '13 April, 2003',
     'text': ' A man who gave his name as Wallace Wilson was arrested at '
             '1730hrs on 10 April, 2003 at Camp George West Military '
             'Reservation in Denver, Colorado. Wilson was able to obtain '
             'access to this military reservation using personal '
             "identification in the form of a Colorado driver's license and a "
             'Social security card. He also used identification for the truck '
             'he was driving that is owned by his employer, the Apex Paper '
             'Products Company in Denver, Colorado. This company has a '
             'contract to supply paper products to Camp George West. A man by '
             'the name wallace Wilson is known to be a member of a domestic '
             'militia group known as Aryan Nations. Wilson was arrested by MPs '
             'when he was seen loading boxes on his Apex truck. The boxes were '
             'being taken from an apparently unlock

# Perform Bag of Words

In [78]:
from sklearn.feature_extraction.text import CountVectorizer

# For every document, build a 1- to 4-gram count matrix over its "."-separated
# pieces and attach the results to the document's record in place.
for key, value in jsonDataSet.items():
    sentences = value["text"].split(".")
    vect = CountVectorizer(stop_words='english', ngram_range=(1, 4))
    # Learn the vocabulary and count it in one pass over the same pieces.
    X = vect.fit_transform(sentences)
    # Dense array with one row per piece and one column per n-gram.
    value["matrix"] = X.toarray()
    # Only row 0 is kept: the n-grams present in the first piece.
    value["bow"] = vect.inverse_transform(X)[0]

In [79]:
# Print each record's key followed by its stored "bow" terms and "matrix" array.
for key in jsonDataSet:
    value = jsonDataSet[key]
    print("\n", key, value["bow"], value["matrix"], "\n")


 0 ['10' '10 april' '10 april 2003' '10 april 2003 camp' '1730hrs'
 '1730hrs 10' '1730hrs 10 april' '1730hrs 10 april 2003' '2003'
 '2003 camp' '2003 camp george' '2003 camp george west' 'april'
 'april 2003' 'april 2003 camp' 'april 2003 camp george' 'arrested'
 'arrested 1730hrs' 'arrested 1730hrs 10' 'arrested 1730hrs 10 april'
 'camp' 'camp george' 'camp george west' 'camp george west military'
 'colorado' 'denver' 'denver colorado' 'gave' 'gave wallace'
 'gave wallace wilson' 'gave wallace wilson arrested' 'george'
 'george west' 'george west military' 'george west military reservation'
 'man' 'man gave' 'man gave wallace' 'man gave wallace wilson' 'military'
 'military reservation' 'military reservation denver'
 'military reservation denver colorado' 'reservation' 'reservation denver'
 'reservation denver colorado' 'wallace' 'wallace wilson'
 'wallace wilson arrested' 'wallace wilson arrested 1730hrs' 'west'
 'west military' 'west military reservation'
 'west military reservatio

# TFIDF

In [131]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline

# Kept defined (and empty) exactly as before in case other cells reference it.
dataFileObjects = []

# For every document, compute TF-IDF weights over its "."-separated pieces and
# store them under "tf_idf". Previously the raw CountVectorizer counts were
# stored under that key and the imported TfidfTransformer was never applied.
for key, value in jsonDataSet.items():
    sentences = value["text"].split(".")

    vect = CountVectorizer(ngram_range=(1, 4), stop_words='english')
    counts = vect.fit_transform(sentences)

    # Re-weight the raw counts; the result has the same shape as `counts`.
    X = TfidfTransformer().fit_transform(counts)

    value.update({"tf_idf": X.toarray()})
    # Row 0 only: the n-grams present in the first piece of the text.
    value.update({"bow": vect.inverse_transform(counts)[0]})

# KMeans

In [None]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

# KMeans.fit() needs a feature matrix; the original call passed none and could
# not run. Build one row per document so that every document shares the same
# vocabulary (the per-document matrices stored in jsonDataSet have differing
# column counts and cannot be stacked).
documents = [value["text"] for value in jsonDataSet.values()]
doc_tfidf = TfidfVectorizer(ngram_range=(1, 4),
                            stop_words='english').fit_transform(documents)

# KMeans cannot form more clusters than there are samples.
n_clusters = min(10, doc_tfidf.shape[0])

# Fixed seed so that a re-run reproduces the same clustering.
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(doc_tfidf)

# DBSCAN

In [119]:
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler


# #############################################################################
# Generate sample data
centers = [list().extend([value["tf_idf"], value["bow"]]) for key, value in jsonDataSet.items()]
X, labels_true = make_blobs(n_samples=25, centers=centers, cluster_std=0.4,
                            random_state=0)

X = StandardScaler().fit_transform(X)

# #############################################################################
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels))

# #############################################################################
# Plot result
import matplotlib.pyplot as plt
%matplotlib inline

# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)

    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

ValueError: Expected 2D array, got 1D array instead:
array=[None None None None None None None None None None None None None None
 None None None None None None None None None None None].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.