In [3]:
import os

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer

# Folder path
folder = './run_data/'
folder_traing = './training/'
dest_file = './final/final_content.csv'

%matplotlib inline

In [4]:
entries = os.listdir(folder_traing)
print("Total files: ", len(entries))

# Print out file names
print([entry for entry in entries[:5]])

Total files:  21512
['100634927493693440.txt', '100691512974778368.txt', '100997287899172864.txt', '101321977507229696.txt', '101340793482125312.txt']


In [5]:
# Read file and push to list\n",
list_text = []
# list_text.append([f.read() with open(file,'r') as f for file in entries"]

for entry in entries:
    file = folder_traing + entry
    with open(file, 'r') as f:
        data = f.read()
#         print("Read: ", len(data))
        list_text.append(data)
#         print("====================\n")

In [6]:
# Split data into training & testing
X_train = list_text[:11000]
X_test = list_text[11001::]

### 1. Stopwords & TfidfVectorizer

In [8]:
punc = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}',"%"]
m_stopwords = text.ENGLISH_STOP_WORDS.union(punc)

vectorizer = TfidfVectorizer(stop_words = m_stopwords)
X = vectorizer.fit_transform(X_train)

In [10]:
word_features = vectorizer.get_feature_names()
len(word_features)

81468

In [12]:
stemmer = SnowballStemmer('english')
tokenizer = RegexpTokenizer(r'[a-zA-Z\']+')

def mf_tokenize(text):
    return [stemmer.stem(word) for word in tokenizer.tokenize(text.lower())]

In [13]:
vectorizer2 = TfidfVectorizer(stop_words=m_stopwords,tokenizer=mf_tokenize)
X2 = vectorizer2.fit_transform(X_train)
word_features2 = vectorizer2.get_feature_names()
len(word_features2)

  'stop_words.' % sorted(inconsistent))


57107

In [15]:
vectorizer3 = TfidfVectorizer(stop_words=m_stopwords, tokenizer=mf_tokenize,max_features=2000)
X3 = vectorizer3.fit_transform(X_train)
word_features3 = vectorizer3.get_feature_names()
len(word_features3)

  'stop_words.' % sorted(inconsistent))


2000

### 2. KMeans

In [16]:
kmeans = KMeans(n_clusters = 8, n_init = 5, n_jobs = -1)
kmeans.fit(X3)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=15, n_init=5, n_jobs=-1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [22]:
predicted = kmeans.labels_
centroids = kmeans.cluster_centers_

In [28]:
common_words = centroids.argsort()[:,-1:-11:-1]
for num, centroid in enumerate(common_words):
    print(str(num) + ' : ' + ', '.join(word_features3[word] for word in centroid))

0 : exercis, workout, weight, run, fit, muscl, bodi, minut, say, train
1 : patient, hospit, doctor, care, medic, said, say, surgeri, physician, medicar
2 : vaccin, polio, flu, measl, children, case, outbreak, immun, diseas, said
3 : ebola, virus, outbreak, liberia, said, west, africa, infect, sierra, leon
4 : brain, cell, research, gene, alzheim, studi, say, mice, said, scientist
5 : insur, coverag, health, law, plan, exchang, care, enrol, afford, state
6 : food, eat, obes, diet, weight, calori, fat, sugar, studi, say
7 : drug, compani, fda, patient, use, said, pharmaceut, say, prescript, medic
8 : recip, pepper, oil, chees, dish, oliv, tomato, salt, protein, calori
9 : cancer, breast, women, screen, studi, patient, risk, test, said, research
10 : say, children, women, studi, parent, said, peopl, research, like, babi
11 : virus, infect, mer, case, hiv, cdc, said, flu, peopl, diseas
12 : nhs, said, hospit, care, servic, patient, trust, health, staff, england
13 : smoke, tobacco, cigaret

In [36]:
X3

<15000x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 1887169 stored elements in Compressed Sparse Row format>

In [37]:
type(predicted)

numpy.ndarray

In [40]:
# frame = pd.DataFrame(predicted,columns=['label'])
# frame.to_csv('./final/label.csv',sep='|',index=None)

In [42]:
# frame = pd.DataFrame(centroids)
# frame.to_csv('./final/centroids.csv',sep='|',index=None)