In [1]:
import os, re
import spacy
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.base import TransformerMixin

# Folder path
folder = './run_data/'
folder_traing = './training/'
dest_file = './final/final_content.csv'

%matplotlib inline

In [2]:
entries = os.listdir(folder_traing)
print("Total files: ", len(entries))

# Print out file names
print([entry for entry in entries[:5]])

Total files:  25737
['501510283588685824.txt', '564880695911124992.txt', '328599052881121280.txt', '454427476240769025.txt', '489080940820701184.txt']


In [3]:
# Read file and push to list\n",
list_text = []
# list_text.append([f.read() with open(file,'r') as f for file in entries"]

for entry in entries:
    file = folder_traing + entry
    with open(file, 'r') as f:
        data = f.read()
#         print("Read: ", len(data))
        list_text.append(data)
#         print("====================\n")

In [4]:
# Split data into training & testing
X_train = list_text[:11000]
X_test = list_text[11001::]

### 1. CleanText (Spacy)

In [5]:
nlp = spacy.load("en_core_web_sm")
stemmer = SnowballStemmer('english',ignore_stopwords=True)

In [6]:
punc = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}',"%"]
STOPLIST = text.ENGLISH_STOP_WORDS.union(punc)
# STOPLIST = STOPLIST.union(extra)
SYMBOLS = "?:!.,;(){}[]%"

In [7]:
def cleanText(text):
    text = text.strip().replace("\n", " ").replace("\r", " ")
    text = text.lower()
    return text
def tokenizeText(sample):
    tokens = nlp(sample)
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas
    tokens = [re.sub(r"([0-9]+)","",tok) for tok in tokens]
    tokens = [tok for tok in tokens if tok not in STOPLIST]
    tokens = [tok for tok in tokens if tok not in SYMBOLS]
    return tokens

In [8]:
class CleanTextTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        return [cleanText(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

### 2. Stopwords & TfidfVectorizer

In [None]:
vectorizer = TfidfVectorizer(stop_words = STOPLIST)
X = vectorizer.fit_transform(X_train)

In [None]:
word_features = vectorizer.get_feature_names()
len(word_features)

65330

In [None]:
tokenizer = RegexpTokenizer(r'[a-zA-Z\']+')

def mf_tokenize(text):
    return [stemmer.stem(word) for word in tokenizer.tokenize(text.lower())]

In [None]:
vectorizer2 = TfidfVectorizer(stop_words=STOPLIST,tokenizer=tokenizeText)
X2 = vectorizer2.fit_transform(X_train)
word_features2 = vectorizer2.get_feature_names()
len(word_features2)

  'stop_words.' % sorted(inconsistent))


In [None]:
vectorizer3 = TfidfVectorizer(stop_words=STOPLIST, tokenizer=tokenizeText,max_features=2000)
X3 = vectorizer3.fit_transform(X_train)
word_features3 = vectorizer3.get_feature_names()
len(word_features3)

### 2. KMeans

In [None]:
kmeans = KMeans(n_clusters = 10, n_init = 5, n_jobs = -1)
kmeans.fit(X3)

In [None]:
predicted = kmeans.labels_
centroids = kmeans.cluster_centers_

In [None]:
common_words = centroids.argsort()[:,-1:-11:-1]
for num, centroid in enumerate(common_words):
    print(str(num) + ' : ' + ', '.join(word_features3[word] for word in centroid))

In [None]:
X3

In [None]:
type(predicted)

In [None]:
frame = pd.DataFrame(predicted,columns=['label'])
frame.to_csv('./final/label.csv',sep='|',index=None)

In [None]:
frame = pd.DataFrame(centroids)
frame.to_csv('./final/centroids.csv',sep='|',index=None)