# Text Classification - One-Class Classification

One-class algorithms are recognition-based: their aim is to recognize data from a particular (target) class and reject data from all other classes. This is accomplished by learning a boundary that encloses all the data belonging to the target class. When a new sample arrives, the algorithm only has to check whether it lies inside or outside that boundary, and accordingly classifies the sample as belonging to the target class or as an outlier.

Things we are going to discuss:

1. Data Preparation 
2. Cleaning and Tokenization
3. Feature Extraction
4. Train a one-class classification model
5. Predict with the one-class model on test data

In [None]:
# Load packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import OneClassSVM
from sklearn.utils import shuffle
# ENGLISH_STOP_WORDS is exposed publicly by the `text` module; the
# `sklearn.feature_extraction.stop_words` module path is not available in
# recent scikit-learn releases.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import classification_report 
from nltk.corpus import stopwords
import statistics
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.stem.porter import PorterStemmer
import string
import spacy
from spacy.lang.en import English
import numpy as np
import scipy.stats as stats
import math
# Load the spaCy model registered under the name 'en'. The returned pipeline
# is not assigned to anything, so nothing below uses it directly.
# NOTE(review): the 'en' shortcut name presumably only resolves on spaCy 2.x — confirm against the installed version.
spacy.load('en')
# `parser` is built straight from the English language class, independently
# of the spacy.load() call above.
parser = English()

In [None]:
# Load the BBC text dataset from CSV into a DataFrame.
# Later cells read its 'text' and 'category' columns.
bbc_df = pd.read_csv('../input/bbc-text.csv')

In [None]:
# Preview the first 10 rows of the dataset
bbc_df.head(10)

In [None]:
def wc_count(text):
    """Return the number of whitespace-separated tokens in *text*.

    Anything that is not a ``str`` (e.g. a missing value) counts as 0 words.
    """
    return len(text.split()) if isinstance(text, str) else 0
# Add a 'word_count' column holding the token count of each row's 'text'
bbc_df['word_count'] = bbc_df['text'].apply(wc_count)

In [None]:
# List the column names (now including the added 'word_count' column)
bbc_df.columns

In [None]:
# Corpus summary: number of rows, then the largest and smallest word count
print('Size of corpus',len(bbc_df['word_count']))
print('max word count',max(bbc_df['word_count']))
print('min word count',min(bbc_df['word_count']))

In [None]:
# Category distribution: for each category, the count of non-null values
# in every other column
bbc_df.groupby('category').count()

In [None]:
# Plot the normal distribution fitted to the article word counts.

# Sample statistics of the word-count column
std = statistics.stdev(bbc_df['word_count'])
mean = statistics.mean(bbc_df['word_count'])
variance = statistics.variance(bbc_df['word_count'])
x_min = min(bbc_df['word_count'])
x_max = max(bbc_df['word_count'])

# Parameterise the curve with the measured mean and standard deviation so it
# reflects the actual spread of the data (a fixed variance of 1 would draw
# the same unit-width bell regardless of the corpus).
mu = mean
sigma = std

# Evaluate the density on 100 points covering mu +/- 3 sigma
x = np.linspace(mu - 3*sigma, mu + 3*sigma, 100)
plt.plot(x, stats.norm.pdf(x, mu, sigma))
plt.show()

In [None]:
# Sort the frame by word_count in place (ascending is the default), so the
# longest articles end up in the last rows
bbc_df.sort_values('word_count',inplace = True)

In [None]:
# Inspect the last 15 word counts (the largest ones once the frame is sorted)
bbc_df['word_count'].tail(15)

In [None]:
# Inspect the first 15 word counts (the smallest ones once the frame is sorted)
bbc_df['word_count'].head(15)

In [None]:
# Drop the last 15 rows, i.e. the 15 longest articles, as outliers.
# Relies on bbc_df already being sorted ascending by word_count.
# .copy() makes the result an independent frame, so assigning to its columns
# later does not raise SettingWithCopyWarning.
bbc_df = bbc_df.iloc[:-15].copy()

In [None]:
# Plot the normal distribution fitted to the word counts after removing the outliers.

# Sample statistics of the (trimmed) word-count column
std = statistics.stdev(bbc_df['word_count'])
mean = statistics.mean(bbc_df['word_count'])
variance = statistics.variance(bbc_df['word_count'])
x_min = min(bbc_df['word_count'])
x_max = max(bbc_df['word_count'])

# Parameterise the curve with the measured mean and standard deviation so it
# reflects the actual spread of the data (a fixed variance of 1 would draw
# the same unit-width bell regardless of the corpus).
mu = mean
sigma = std

# Evaluate the density on 100 points covering mu +/- 3 sigma
x = np.linspace(mu - 3*sigma, mu + 3*sigma, 100)
plt.plot(x, stats.norm.pdf(x, mu, sigma))
plt.show()


In [None]:
# Round a value up to the next multiple of 100
import math
def roundup(x):
    """Return *x* rounded up to a multiple of 100, as an int.

    Values that are already a multiple of 100 are returned unchanged.
    """
    hundreds = math.ceil(x / 100.0)
    return int(hundreds) * 100

# Overwrite each word count with its value rounded up to a multiple of 100,
# so the column now holds size bins rather than exact counts
bbc_df['word_count'] = bbc_df['word_count'].apply(roundup)

# Plot the histogram of article word counts
score_india = bbc_df['word_count']
legend = 'Word Count'
# Create the figure before drawing: calling plt.figure() after plt.hist()
# opens a second, empty figure instead of sizing the histogram.
# NOTE(review): figsize is (width, height) in inches — confirm 3x7 is the intended shape.
plt.figure(figsize=(3,7))
# Attach the label to the plotted series and let legend() pick it up;
# passing the bare string to plt.legend() would be read as a sequence of
# one-character labels.
plt.hist(score_india, color='green', label=legend)
plt.xlabel("Article Size")
plt.ylabel("Frequency")
plt.legend()
plt.xticks(range(0, 1500, 200))
plt.yticks(range(0, 1000, 100))
plt.title('Word count for articles in BBC')
plt.show()

In [None]:
# (rows, columns) of the frame at this point
bbc_df.shape

In [None]:
# Print a per-column summary (dtype and non-null count) of the frame
bbc_df.info()

In [None]:
# Distinct labels present in the 'category' column
bbc_df['category'].unique()

In [None]:
# Number of rows per category label
bbc_df['category'].value_counts()

In [None]:
# Bar chart of the number of rows per category.
# The column is passed by keyword: a positional data vector is deprecated or
# rejected by newer seaborn releases.
sns.countplot(x='category', data=bbc_df)