# Machine Learning on Text: Clustering Assignment

In [1]:
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from yellowbrick.cluster import KElbowVisualizer

In [2]:
def preprocess(docs):
    """Normalize raw documents into space-joined strings of stemmed tokens.

    Each document is word-tokenized. Tokens that are not purely alphabetic,
    or whose lowercase form is an English stop word, are dropped. The
    remaining tokens are lowercased, lemmatized, and then stemmed.

    Args:
        docs: Iterable of raw document strings.

    Returns:
        A list with one cleaned, space-separated string per input document,
        in input order.
    """
    lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer('english')
    # Load the stop-word list once. Calling stopwords.words() per token
    # rebuilds the list for every token and makes membership a linear scan;
    # a frozenset gives constant-time lookups.
    stop_words = frozenset(stopwords.words('english'))

    preprocessed = []
    for doc in docs:
        cleaned = []
        for token in word_tokenize(doc):
            lowered = token.lower()
            # Keep alphabetic, non-stop-word tokens only (drops punctuation
            # and numbers as well as stop words).
            if token.isalpha() and lowered not in stop_words:
                cleaned.append(stemmer.stem(lemmatizer.lemmatize(lowered)))
        preprocessed.append(" ".join(cleaned))

    return preprocessed

### Ingest the company_profiles data set into a corpus.

In [3]:
# Corpus location and the regex selecting which files belong to it.
path = 'company_profiles'
DOC_PATTERN = r'.*\.txt'

# Build the plaintext reader, naming each argument explicitly.
corpus = PlaintextCorpusReader(root=path, fileids=DOC_PATTERN)

### Create a list of documents by extracting the raw text for each fileid in the corpus.

In [4]:
# Pull the raw text for each fileid the corpus reports, in that order.
docs = list(map(corpus.raw, corpus.fileids()))

### Preprocess the documents, including the steps below.

- Word tokenize the document.
- Lowercase all tokens.
- Lemmatize and stem the tokens.
- Remove stop words and punctuation.

In [5]:
# Run preprocess() over the raw documents and keep the returned list.
preprocessed = preprocess(docs)


KeyboardInterrupt



### TF-IDF vectorize the preprocessed documents.

In [None]:
# Bare expression: as the last line of a notebook cell this displays `preprocessed`.
# NOTE(review): the heading above asks for TF-IDF vectorization, which this
# cell does not perform yet.
preprocessed

### Determine the optimal number of clusters using the Yellowbrick library's KElbow Visualizer and a KMeans clustering algorithm.

### Perform K-Means Clustering using the optimal number of clusters determined in the previous step.

### Perform Agglomerative Clustering using the same number of clusters.

### Choose one of the three topic modeling approaches covered. Cluster into the optimal number of clusters and extract 5 keywords that represent the documents in each cluster.