In [None]:
from IPython.core.display import display

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# 1. Read data

Reading the data and selecting the relevant columns.

In [None]:
# Location of the news-aggregator CSV, relative to the notebook's working directory.
DATA_PATH = "../input/uci-news-aggregator.csv"
data = pd.read_csv(DATA_PATH)
data.head()  # preview the first rows as the cell output

In [None]:
# Model input: the TITLE column; prediction target: the CATEGORY column.
X_raw = data['TITLE']
y = data['CATEGORY']

In [None]:
# Number of rows per category, keyed by category label.
value_counts = dict(y.value_counts())
# Materialise the dict views as lists: matplotlib expects array-like
# sequences, and dict_keys / dict_values views are not reliably converted.
targets_labels = list(value_counts.keys())
ind = range(len(targets_labels))
plt.bar(ind, list(value_counts.values()))
plt.title("Categories count")
plt.xlabel("Category")
plt.ylabel("Number of rows")
plt.xticks(ind, targets_labels)
plt.show()

# 2. Prepare data

## 2.1. Vectorizing the data

We use a vector representation of documents (called a "bag of words" model). Each document is represented as a vector whose dimension is equal to the number of distinct words in the whole corpus. Each word corresponds to one dimension; if the word is present in the document, the vector has a non-zero value on that dimension.
In the basic implementation, this is the number of occurrences of a given word in a document (CountVectorizer in sklearn).
Below I am using an enhanced version of this model (TfidfVectorizer) which, thanks to the TF-IDF factor, also takes into account the relevance of each word. This factor is based on the assumption that words appearing in many different documents are less important than words that appear in only a few.

In [None]:
# Learn the TF-IDF vocabulary from the titles (English stop words removed)
# and convert every title into one row of the feature matrix X.
# NOTE(review): the vectorizer is fitted on all of X_raw before the train/test
# split below, so the IDF statistics also see the test documents — consider
# fitting on the training portion only and just transforming the test portion.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(X_raw)

## 2.2. Splitting data into training and test sets

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# 3. Training model

I'm using LinearSVC, which is a Support Vector Classification model with a linear kernel. I tried other kernels, but they take too long to train, while the linear kernel still gives sufficiently good results. Support Vector Machine algorithms work well with high-dimensional data (like text documents).

We could have used any other popular classification algorithm, such as LogisticRegression, RandomForest, Gaussian Naive Bayes or a Multi-Layer Perceptron. Even a deep learning algorithm could be used; e.g. a CNN with a word-embedding layer tends to get really good results for text classification.

In [None]:
# Seed the estimator: LinearSVC's solver uses pseudo-random numbers, so
# without a fixed random_state the fitted model (and therefore the reported
# metrics) can differ slightly between runs of the notebook.
clf = LinearSVC(random_state=42)
clf.fit(X_train, y_train)

# 4. Results

## 4.1. Classification metrics

In [None]:
# Predict a category for every row of the held-out test split; y_pred is
# reused by the classification report and the confusion matrix below.
y_pred = clf.predict(X_test)

In [None]:
# Text summary of the per-class metrics on the test split; printed (rather
# than left as the cell value) so the line breaks in the report are rendered.
report = classification_report(y_test, y_pred)
print(report)

## 4.2. Confusion matrix

In [None]:
def plot_confusion_matrix(y_true, y_pred, targets_labels=None):
    """Plot a row-normalised confusion matrix as an annotated heatmap.

    Every cell shows the percentage of samples of the true class (row) that
    were predicted as the column class, so each non-empty row sums to 100%.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    targets_labels : iterable, optional
        Labels to include, in display order. When omitted or empty, the
        sorted unique values of ``y_true`` are used.

    Returns
    -------
    None
        The figure is drawn on the current pyplot figure and shown.
    """
    # list(None) raises TypeError, so resolve the default before materialising.
    targets_labels = list(targets_labels) if targets_labels is not None else []
    if not targets_labels:
        # Sorted so the axis order is deterministic (set order is not).
        targets_labels = sorted(set(y_true))
    num_classes = len(targets_labels)

    counts = confusion_matrix(y_true, y_pred, labels=targets_labels).astype(float)
    # keepdims=True keeps the totals as a column vector, so each ROW is divided
    # by its own total; a flat vector would broadcast across columns instead.
    row_totals = counts.sum(axis=1, keepdims=True)
    # Rows with no true samples stay at 0 instead of producing NaN/inf.
    cdata = np.divide(counts, row_totals,
                      out=np.zeros_like(counts), where=row_totals > 0)

    heatmap = plt.pcolor(cdata, cmap="PuBu")
    plt.title("Confusion matrix")
    plt.colorbar(heatmap)
    for row in range(cdata.shape[0]):
        for col in range(cdata.shape[1]):
            # +0.5 centres the annotation inside the unit-sized cell.
            plt.text(col + 0.5, row + 0.5,
                     '{0:.2f}%'.format(cdata[row, col] * 100),
                     horizontalalignment='center',
                     verticalalignment='center',
                     )

    tick_marks = np.arange(num_classes) + 0.5
    plt.xticks(tick_marks, targets_labels)
    plt.yticks(tick_marks, targets_labels)
    # Rows of the matrix are true classes, columns are predicted classes.
    plt.xlabel("Predicted label")
    plt.ylabel("True label")
    plt.show()
    
# Draw the matrix for the test-set predictions, using the category labels
# collected for the bar chart earlier in the notebook.
plot_confusion_matrix(y_true=y_test, y_pred=y_pred, targets_labels=targets_labels)