# Applications of ML: Natural Language Processing

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

### [Bag of Words](https://en.wikipedia.org/wiki/Bag-of-words_model) / [Vector Space](https://en.wikipedia.org/wiki/Vector_space_model) Representation
<img src="http://uc-r.github.io/public/images/analytics/feature-engineering/bow-image.png" width="20%">

In [None]:
# Toy corpus: two short documents used to illustrate the bag-of-words model.
X = [
    "it is a puppy and it is extremely cute",
    "it is a cat.",
]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Build a CountVectorizer with default settings and fit it on the toy corpus;
# fitting populates `vectorizer.vocabulary_`, which is inspected further below.
vectorizer = CountVectorizer()
# Last expression of the cell, so the fitted estimator's repr is displayed.
vectorizer.fit(X)

In [None]:
# Encode the toy documents with the fitted vectorizer and show the shape
# of the resulting matrix.
X_bag_of_words = vectorizer.transform(X)
X_bag_of_words.shape

In [None]:
# Display the transformed object itself (its repr, not the individual counts;
# the dense values are shown in the next cell via .toarray()).
X_bag_of_words

In [None]:
# Convert to a dense array so the individual entries can be read.
# Fine for this tiny example; avoid on large corpora.
X_bag_of_words.toarray()

In [None]:
# The vocabulary learned during fit (term -> column index mapping).
vectorizer.vocabulary_

`Size of our vocabulary`

In [None]:
# Number of entries in the learned vocabulary.
len(vectorizer.vocabulary_)

In [None]:
# Vocabulary terms ordered by column index.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement (returns an ndarray of strings).
vectorizer.get_feature_names_out()

In [None]:
# Map the encoded matrix back to the terms present in each document.
# Word order and counts are not recoverable from a bag-of-words encoding.
vectorizer.inverse_transform(X_bag_of_words)

## [tf–idf](https://en.wikipedia.org/wiki/Tf–idf) (Excursus)
Term Frequency–Inverse Document Frequency is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.
tf-idf rescales words that occur in many documents to have less weight (see [here](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) for scikit implementation). 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a tf-idf vectorizer (default settings) on the same toy corpus so its
# output can be compared with the raw counts above.
tfidf_vectorizer = TfidfVectorizer()
# Last expression of the cell, so the fitted estimator's repr is displayed.
tfidf_vectorizer.fit(X)

In [None]:
# tf-idf weights for the toy documents, converted to a dense array for display.
tfidf_vectorizer.transform(X).toarray()

## Case Study - Spam Detection

We load the [SMS Spam Collection Data Set](https://archive.ics.uci.edu/ml/datasets/sms+spam+collection) from our `data` directory

and preprocess it by splitting the rows into two parts:

1. `text`: A list of strings, each containing the contents of one document (SMS message)
2. `y`: the labels; 1 representing spam and 0 for a non-spam message. 

In [None]:
# Download the tab-separated SMS spam file (no header row) and convert it to a
# plain list of rows.
# NOTE(review): read_csv's default quoting treats '"' as a quote character; if
# messages contain unbalanced quotes, rows could be merged. Verify the row
# count, and consider quoting=csv.QUOTE_NONE if it is lower than expected.
lines = pd.read_csv(
    "https://raw.githubusercontent.com/gesiscss/WDCNLP/main/data/SMSSpamCollection.tsv",
    sep="\t",
    header=None,
).values.tolist()

In [None]:
# Second field of every row is the message text.
text = [row[1] for row in lines]
# First field is the label string: encode "spam" as 1 and everything else as 0.
y = [1 if row[0] == "spam" else 0 for row in lines]

## Display the first 5 documents 

In [None]:
# %load solutions/l3_1.py
# Slice rather than dumping the whole list of messages.
text[:5]

## Display the first 5 labels 

In [None]:
# %load solutions/l3_2.py
# Labels aligned with text[:5]: 1 = spam, 0 = non-spam.
y[:5]

## Compute the number of spam and non-spam messages:

In [None]:
# %load solutions/l3_3.py
# Class balance: number of messages per label (1 = spam, 0 = non-spam).
# The top-level `pd.value_counts` is deprecated in recent pandas; wrapping the
# list in a Series and calling the method works on old and new versions alike.
pd.Series(y).value_counts()

## Split the Data into a training and test set

In [None]:
# %load solutions/l3_4.py
from sklearn.model_selection import train_test_split

# Hold out 25% of the messages for testing. `stratify=y` is passed so the
# split is stratified on the labels, and the fixed seed makes it reproducible.
text_train, text_test, y_train, y_test = train_test_split(
    text,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

## Use the CountVectorizer to parse the text data into a bag-of-words model

In [None]:
# %load solutions/l3_5.py
# Fit on the training messages only, then encode both splits with that same
# fitted vectorizer.
vectorizer = CountVectorizer().fit(text_train)

X_train, X_test = vectorizer.transform(text_train), vectorizer.transform(text_test)

## What is the size of our vocabulary?

In [None]:
# %load solutions/l3_6.py
# Number of entries in the vocabulary fitted on text_train.
len(vectorizer.vocabulary_)

## What is the dimensionality of X_train?

In [None]:
# %load solutions/l3_7.py
# Shape of the encoded training set.
X_train.shape

## ... and what is the dimensionality of X_test?

In [None]:
# %load solutions/l3_8.py
# Shape of the encoded test set (encoded with the vectorizer fitted on text_train).
X_test.shape

## Train a [LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) model on 'X_train'

In [None]:
# %load solutions/l3_9.py
from sklearn.linear_model import LogisticRegression

# Logistic regression on the bag-of-words features, with the solver named explicitly.
clf = LogisticRegression(solver = "lbfgs")
# Last expression of the cell, so the fitted estimator's repr is displayed.
clf.fit(X_train, y_train)

## Compute the accuracy on the test data

In [None]:
# %load solutions/l3_10.py
from sklearn.metrics import accuracy_score

# Predict labels for the held-out messages and compare them with the true labels.
predicted_test = clf.predict(X_test)
accuracy_score(y_test, predicted_test)

Function to visualize coefficients of a linear model.

In [None]:
def visualize_coefficients(classifier, feature_names, n_top_features=25):
    """Bar-plot the smallest and largest coefficients of a linear model.

    The ``n_top_features`` lowest coefficients are drawn first, followed by
    the ``n_top_features`` highest; negative values are orange, the rest blue.

    Parameters
    ----------
    classifier : fitted estimator exposing ``coef_``
        ``coef_`` is flattened with ``ravel()``.
        NOTE(review): this presumably expects a single coefficient vector
        (binary problem); with several rows the flattened indices would no
        longer line up with ``feature_names`` -- confirm against callers.
    feature_names : sequence of str
        Name of each coefficient, indexable by coefficient position.
    n_top_features : int, default 25
        How many coefficients to show from each end. Values larger than the
        number of coefficients are clamped; values <= 0 draw an empty plot.

    Returns
    -------
    None
        The figure is drawn through the pyplot state machine.
    """
    # see http://github.com/amueller/mglearn/blob/cbae37d906261dad173cbc6696dcef69dfd0cbaf/mglearn/tools.py
    # and http://github.com/amueller/scipy-2018-sklearn
    coef = classifier.coef_.ravel()
    # Clamp so the slices below never select more entries than exist, and so
    # that 0 does not turn the `[-n:]`-style slice into "everything".
    n_shown = max(0, min(n_top_features, coef.size))
    order = np.argsort(coef)  # sort once, slice both ends
    negative_coefficients = order[:n_shown]
    positive_coefficients = order[coef.size - n_shown:]
    interesting_coefficients = np.hstack([negative_coefficients, positive_coefficients])
    # Derive positions from what was actually selected rather than from
    # 2 * n_top_features, so bars, heights and labels always agree in length.
    positions = np.arange(len(interesting_coefficients))
    # plot them
    plt.figure(figsize=(15, 5))
    colors = ["tab:orange" if c < 0 else "tab:blue" for c in coef[interesting_coefficients]]
    plt.bar(positions, coef[interesting_coefficients], color=colors)
    feature_names = np.array(feature_names)
    # Bars are centred on `positions`, so ticks go on the same positions;
    # an offset of +1 would attach each label to the neighbouring bar.
    plt.xticks(positions, feature_names[interesting_coefficients], rotation=60, ha="right")

In [None]:
# Plot the 10 most negative and 10 most positive word coefficients.
# `get_feature_names()` was removed in scikit-learn 1.2; use `get_feature_names_out()`.
visualize_coefficients(clf, vectorizer.get_feature_names_out(), n_top_features = 10)

## Exercise: Only consider words that occur in at least 2 documents

Use the [min_df](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) argument in CountVectorizer.

In [None]:
# %load solutions/l3_11.py
# Rebuild the vocabulary with min_df=2, re-encode both splits with the
# refitted vectorizer and train a fresh classifier on the new features.
vectorizer = CountVectorizer(min_df=2).fit(text_train)

X_train, X_test = (vectorizer.transform(docs) for docs in (text_train, text_test))

clf = LogisticRegression(solver="lbfgs")
clf.fit(X_train, y_train)

## Exercise: What is the size of our vocabulary?

In [None]:
# %load solutions/l3_12.py
# Vocabulary size of the vectorizer refitted with min_df=2.
len(vectorizer.vocabulary_)

## Exercise: Compute the test accuracy using [score](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression.score)

In [None]:
# %load solutions/l3_13.py
# Score the refitted classifier on the held-out split.
clf.score(X_test, y_test)

## Exercise: Visualize the top 15 coefficients

In [None]:
# %load solutions/l3_14.py
# Plot the 15 most negative and 15 most positive word coefficients.
# `get_feature_names()` was removed in scikit-learn 1.2; use `get_feature_names_out()`.
visualize_coefficients(clf, vectorizer.get_feature_names_out(),  n_top_features = 15)

## Bonus Exercise: Use the [TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) and compare your results

In [None]:
# %load solutions/l3_15.py
# Repeat the pipeline with tf-idf features instead of raw counts:
# fit on the training messages only, then encode both splits.
vectorizer = TfidfVectorizer()
vectorizer.fit(text_train)

X_train = vectorizer.transform(text_train)
X_test = vectorizer.transform(text_test)

clf = LogisticRegression(solver = "lbfgs")
clf.fit(X_train, y_train)


# `get_feature_names()` was removed in scikit-learn 1.2; use `get_feature_names_out()`.
visualize_coefficients(clf, vectorizer.get_feature_names_out(), n_top_features = 10)
print("Accuracy: " + str(clf.score(X_test, y_test)))