# TOC

  __Chapter 6 - Text classification__

1. [Import](#Import)
1. [Text classification](#Text-classification)
    1. [Basic modeling](#Basic-modeling)
        1. [Sampling](#Sampling)
        1. [Naive Bayes](#Naive-Bayes)
1. [Text clustering](#Text-clustering)
    1. [K-means](#K-means)
1. [Topic modeling in text](#Topic-modeling-in-text)
1. [](#)
1. [](#)


# Import

<a id = 'Import'></a>

In [1]:
# Standard library and settings
import os
import sys
import importlib
import itertools
import warnings

warnings.simplefilter("ignore")
from IPython.core.display import display, HTML

display(HTML("<style>.container { width:95% !important; }</style>"))

# Data extensions and settings
import numpy as np

np.set_printoptions(threshold=np.inf, suppress=True)
import pandas as pd

pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.options.display.float_format = "{:,.6f}".format

# Modeling extensions
import nltk

# Visualization extensions and settings
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
sns.set_style("whitegrid")

# Text classification


<a id = 'Text-classification'></a>

In [2]:
#
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import csv


def preprocessing(text):
    """Normalize a raw message into a space-joined string of lemmatized tokens.

    Steps, in order: decode ``bytes`` input as UTF-8 (``str`` input is used
    as-is), tokenize into sentences and then words, lower-case, drop English
    stop words, drop tokens shorter than three characters, and lemmatize
    each remaining token with WordNet.

    Args:
        text: The raw message, either ``str`` or UTF-8 encoded ``bytes``.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        no token survives the filters.
    """
    # Text read through csv.reader on Python 3 is already str; only bytes
    # input has a decode() method.
    if isinstance(text, bytes):
        text = text.decode("utf8")

    # tokenize into words
    tokens = [
        word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)
    ]

    # lower capitalization first, so capitalized stop words ("The", "And")
    # are caught by the stop-word filter below
    tokens = [word.lower() for word in tokens]

    # remove stopwords (a set makes each membership test O(1))
    stop = set(stopwords.words("english"))
    tokens = [token for token in tokens if token not in stop]

    # remove words < 3 letters
    tokens = [word for word in tokens if len(word) >= 3]

    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    return " ".join(tokens)

In [3]:
#
# Load the tab-separated SMS file. For every row, the first column is stored
# as the class label and the second column (the message text) is stored after
# being run through preprocessing().
sms_data = []
sms_labels = []
# The context manager closes the file even if a row fails to parse.
# NOTE(review): if messages contain unbalanced double quotes, csv's default
# quoting may merge rows; consider quoting=csv.QUOTE_NONE - confirm against
# the data file.
with open("SMSSpamCollection", encoding="utf-8", newline="") as sms:
    csv_reader = csv.reader(sms, delimiter="\t")
    for line in csv_reader:
        # add sms label
        sms_labels.append(line[0])

        # call preprocessing function
        sms_data.append(preprocessing(line[1]))

FileNotFoundError: [Errno 2] No such file or directory: 'SMSSpamCollection'

## Basic modeling

<a id = 'Basic-modeling'></a>

### Sampling

Split data into train data and test data


<a id = 'Sampling'></a>

In [None]:
# train/test split: the first 70% of the messages (in file order) are used
# for training and the remaining messages for testing.
trainset_size = int(round(len(sms_data) * 0.70))

# Features come from the preprocessed texts, targets from the labels.
# Both slices meet exactly at trainset_size so no sample is skipped.
X_train = np.array(sms_data[:trainset_size])
y_train = np.array(sms_labels[:trainset_size])
X_test = np.array(sms_data[trainset_size:])
y_test = np.array(sms_labels[trainset_size:])

print(X_train[:5])
print(y_train[:5])

In [None]:
# create term-document matrix using bag of words
from sklearn.feature_extraction.text import CountVectorizer

# Illustrate on a handful of messages only: the dense matrix is printed in
# full below (numpy's print threshold is set to infinity in the import cell).
# sms_data already holds preprocessed text, so it is not preprocessed again.
sms_exp = list(sms_data[:5])
vectorizer = CountVectorizer(min_df=1)
xExp = vectorizer.fit_transform(sms_exp)

# get_feature_names() was removed from current scikit-learn releases;
# get_feature_names_out() is its replacement.
print("||".join(vectorizer.get_feature_names_out()))
print(xExp.toarray())

In [None]:
# Re-weight term counts with TF-IDF so that words occurring in many documents
# contribute less than words concentrated in a few.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_params = {
    "min_df": 2,
    "ngram_range": (1, 2),
    "stop_words": "english",
    "strip_accents": "unicode",
    "norm": "l2",
}
vectorizer = TfidfVectorizer(**tfidf_params)

# The vocabulary is fitted on the training texts only and then applied to the
# test texts. Both names are rebound from raw text to the transformed matrices,
# so this cell should be run once per split.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

### Naive Bayes

Naive Bayes, in a basic sense, assigns a class label to a sample based on the conditional probability of each class given the sample's feature values. The 'naive' aspect of the algorithm stems from its assumption that all features are independent of each other, which is particularly counter-intuitive in the context of text analysis, where words clearly depend on one another. Despite this, Naive Bayes typically performs very well.


<a id = 'Naive-Bayes'></a>

In [None]:
# Train a multinomial Naive Bayes classifier on the training matrix, then
# label the held-out test matrix and show the predicted labels.
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(X_train, y_train)

y_nb_predicted = clf.predict(X_test)
print(y_nb_predicted)

In [None]:
# evaluate results: compare the Naive Bayes predictions with the true test
# labels using a confusion matrix and a per-class classification report.
from sklearn.metrics import classification_report, confusion_matrix

print("\nConfusion matrix\n")
cm = confusion_matrix(y_test, y_nb_predicted)
print(cm)

print("\nClassification report\n")
# Kept in its own name so `cm` still refers to the confusion matrix afterwards.
report = classification_report(y_test, y_nb_predicted)
print(report)

In [None]:
# observe features that contribute to the positive and negative predictions
feature_names = vectorizer.get_feature_names_out()

# MultinomialNB's coef_/intercept_ aliases were deprecated and later removed
# from scikit-learn; feature_log_prob_ holds the same information. Row 1 is
# the per-feature log-probability under the class clf.classes_[1]
# (presumably the spam class for this dataset - confirm via clf.classes_).
class_log_probs = clf.feature_log_prob_[1]

# Sort ascending by log-probability so both ends of the ranking can be read.
coefs_with_fns = sorted(zip(class_log_probs, feature_names))
n = 10

# Pair the n lowest-scoring features with the n highest-scoring ones and
# print them side by side: lowest on the left, highest on the right.
top = zip(coefs_with_fns[:n], coefs_with_fns[: -(n + 1) : -1])
for (coef_1, fn_1), (coef_2, fn_2) in top:
    print("\t{:.4f}\t{:<15}\t\t{:.4f}\t{:<15}".format(coef_1, fn_1, coef_2, fn_2))

# Text clustering

Text clustering is often used to group together a large corpus of documents in an unsupervised fashion.



<a id = 'Text-clustering'></a>

## K-means


<a id = 'K-means'></a>

In [None]:
#
import collections

from sklearn.cluster import KMeans, MiniBatchKMeans

# Cluster the training matrix into true_k groups twice: once with standard
# K-means and once with the mini-batch variant.
true_k = 5
km = KMeans(n_clusters=true_k, init="k-means++", max_iter=100, n_init=1)
kmini = MiniBatchKMeans(
    n_clusters=true_k,
    init="k-means++",
    max_iter=100,
    n_init=1,
    init_size=1000,
    batch_size=1000,
    verbose=False,
)

km_model = km.fit(X_train)
kmini_model = kmini.fit(X_train)

# For each model, group the row indices of X_train by assigned cluster label
# and report how many documents landed in each cluster.
print("K-means clustering (full dataset)")
clustering = collections.defaultdict(list)
for idx, label in enumerate(km_model.labels_):
    clustering[label].append(idx)
for label in sorted(clustering):
    print("Cluster {}: {} documents".format(label, len(clustering[label])))

print("K-means clustering (mini batch)")
clustering = collections.defaultdict(list)
for idx, label in enumerate(kmini_model.labels_):
    clustering[label].append(idx)
for label in sorted(clustering):
    print("Cluster {}: {} documents".format(label, len(clustering[label])))

# Topic modeling in text

In many industries, there are large datasets of unlabeled text documents. Topic modeling is one approach to categorizing such documents. Latent Dirichlet allocation (LDA) and latent semantic indexing (LSI) are two techniques often used to determine the topics of a given document. A Python library called gensim implements these algorithms.


<a id = 'Topic-modeling-in-text'></a>

In [1]:
#
from gensim import corpora, models, similarities
from itertools import chain
from nltk.corpus import stopwords
from operator import itemgetter
import re

# read documents and remove stop words
documents = list(sms_data)
stoplist = stopwords.words("english")
# Set copy of the stop list so each membership test below is O(1).
stop_words = set(stoplist)

# One list of lower-cased, whitespace-split tokens per document, with stop
# words filtered out.
texts = [
    [word for word in document.lower().split() if word not in stop_words]
    for document in documents
]

In [None]:
# Build the token dictionary from the tokenized texts, encode every document
# as a bag of words via doc2bow, then fit a TF-IDF model on that corpus and
# apply it to the same corpus.
dictionary = corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [None]:
# Given a number of topics, the models use all documents in the corpus to
# identify topics.
# LSI model
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)
# print_topics() is not the last expression of the cell, so its return value
# would not be displayed; print each returned topic explicitly.
for topic in lsi.print_topics(20):
    print(topic)

# LDA model
n_topics = 5
lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=n_topics)

# A


<a id = ''></a>

In [None]:
#

# A


<a id = ''></a>

In [None]:
#

# A


<a id = ''></a>

In [None]:
#

# A


<a id = ''></a>

In [None]:
#