This notebook is a simple Natural Language Processing example that predicts the category of BBC news articles from their text.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Loading the dataset and checking its head.

In [None]:
# Read the BBC articles CSV from the Kaggle input directory into a DataFrame.
articles = pd.read_csv(
    '/kaggle/input/bbc-fulltext-and-category/bbc-text.csv',
)
# Preview the first rows of the loaded table.
articles.head()

# Exploratory Data Analysis

In [None]:
# Descriptive statistics for the loaded articles table.
articles.describe()

What are the categories in the dataset?

In [None]:
# Distinct values of the 'category' column, i.e. the labels the classifier is later trained to predict.
articles['category'].unique()

In [None]:
# Descriptive statistics per category (run before the 'length' column is added further down).
articles.groupby('category').describe()

Exploring the dataset based on text length.

In [None]:
# Store len() of every article text in a new 'length' column.
articles['length'] = articles['text'].map(len)
# Show the first ten rows, now including the new column.
articles.head(n=10)

Checking the distribution of text length.

In [None]:
# Histogram of article lengths using 50 bins.
articles['length'].plot.hist(bins=50)

In [None]:
# Same histogram with a finer resolution of 100 bins.
articles['length'].plot.hist(bins=100)

Finding the longest article in the dataset and its length

In [None]:
# Largest value in the 'length' column, i.e. the len() of the longest article text.
articles['length'].max()

In [None]:
# Print the text of the longest article (first one, if several share the maximum length).
print(articles.loc[articles['length'].idxmax(), 'text'])

In [None]:
# Print the category of every article whose length equals the maximum length.
print(articles.loc[articles['length'].eq(articles['length'].max()), 'category'])

Checking the distribution of text length within each category.

In [None]:
# One 50-bin histogram of article length per category, drawn with figsize (12, 8).
articles.hist(
    by='category',
    column='length',
    figsize=(12, 8),
    bins=50,
)

In [None]:
# Descriptive statistics per category again; by this point the frame also carries the 'length' column.
articles.groupby('category').describe()

Boxplot of text length for each category.

In [None]:
# Box plot of article length, one box per category.
sns.boxplot(
    data=articles,
    x='category',
    y='length',
    palette='coolwarm',
)

Boxplot of text length for each category, excluding outliers.

In [None]:
# Same box plot with the outlier markers hidden (showfliers=False).
sns.boxplot(
    data=articles,
    x='category',
    y='length',
    palette='coolwarm',
    showfliers=False,
)

# Text preprocessing

In [None]:
import nltk

In [None]:
from nltk.corpus import stopwords

In [None]:
import string

Creating a text processing function

In [None]:
def text_process(art, stop_words=None):
    """
    Tokenise a string of text into a list of cleaned words.

    Steps performed:
    1. Remove every character found in ``string.punctuation``.
    2. Split the remaining text on whitespace.
    3. Drop every word whose lower-cased form is a stopword.

    Parameters
    ----------
    art : str
        The raw text to process.
    stop_words : iterable of str, optional
        Lower-case words to discard. When omitted, the NLTK English
        stopword list (``stopwords.words('english')``) is used.

    Returns
    -------
    list of str
        The surviving words, in their original order and casing.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once per call: the original re-read the stopword list
    # for every single token and then scanned it linearly.
    stop_words = frozenset(stop_words)

    # Delete all punctuation characters in one pass.
    nopunc = art.translate(str.maketrans('', '', string.punctuation))

    return [word for word in nopunc.split() if word.lower() not in stop_words]

# Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Build a bag-of-words vectorizer that tokenises with text_process,
# then learn its vocabulary from all article texts.
bow_transformer = CountVectorizer(analyzer=text_process)
bow_transformer.fit(articles['text'])

# Number of distinct tokens in the learned vocabulary.
print(len(bow_transformer.vocabulary_))

In [None]:
# Turn every article into a row of token counts using the vocabulary learned by bow_transformer.
articles_bow = bow_transformer.transform(articles['text'])

In [None]:
# Report the dimensions of the count matrix and how many of its entries are non-zero (.nnz).
print('Shape of Sparse Matrix: ', articles_bow.shape)
print('Amount of Non-Zero Occurences: ',articles_bow.nnz)

In [None]:
# nnz / (rows * cols) is the share of cells that are NON-zero, i.e. the density
# of the matrix; sparsity is its complement. Both are percentages and are shown
# with two decimals, because rounding to a whole number hides values this small.
n_rows, n_cols = articles_bow.shape
density = 100.0 * articles_bow.nnz / (n_rows * n_cols)
sparsity = 100.0 - density
print('density: {:.2f}%'.format(density))
print('sparsity: {:.2f}%'.format(sparsity))

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Create the TF-IDF transformer and fit it on the bag-of-words counts.
# The trailing semicolon keeps the notebook from echoing the estimator.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(articles_bow);

In [None]:
# Re-weight the raw token counts with the fitted TF-IDF transformer, then print the resulting shape.
articles_tfidf = tfidf_transformer.transform(articles_bow)
print(articles_tfidf.shape)

# Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the TF-IDF features of all
# articles, using the 'category' column as the target.
# The trailing semicolon keeps the notebook from echoing the estimator.
categorize_model = MultinomialNB()
categorize_model.fit(articles_tfidf, articles['category']);

# Data Pipeline

In [None]:
from sklearn.pipeline import Pipeline

# Chain the three stages so raw text goes in and a predicted category comes out:
#   bow        - strings to token integer counts
#   tfidf      - integer counts to weighted TF-IDF scores
#   classifier - Naive Bayes trained on the TF-IDF vectors
pipeline = Pipeline(
    steps=[
        ('bow', CountVectorizer(analyzer=text_process)),
        ('tfidf', TfidfTransformer()),
        ('classifier', MultinomialNB()),
    ]
)

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation (train_test_split's default test size).
# A fixed random_state makes the split, and therefore every score computed from
# it, identical on each Restart & Run All.
art_train, art_test, cat_train, cat_test = train_test_split(
    articles['text'],
    articles['category'],
    random_state=42,
)

# Sanity check: train size, test size, and the total number of labels.
print(len(art_train), len(art_test), len(cat_train) + len(cat_test))

# Training the Model

In [None]:
# Fit every stage of the pipeline on the training articles and their categories.
pipeline.fit(art_train, cat_train)

# Prediction and Evaluation

In [None]:
# Predict a category for each held-out test article.
predictions = pipeline.predict(art_test)

In [None]:
from sklearn.metrics import classification_report

# classification_report expects (y_true, y_pred). With the arguments swapped,
# precision and recall are exchanged and support counts the predicted labels
# rather than the true ones.
print(classification_report(cat_test, predictions))

This simple model does a really good job of predicting the category of each article.