# Count Vector Implementation for Text Classification

In [3]:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [4]:
import os
import glob
import errno
import json
import numpy as np
import pandas as pd
import nltk

filepath = '../data/bbc/'
labels = []

In [5]:
# Accessing the labels names and saving then in "labels_names"
files = os.listdir(filepath)
for name in files:
    if os.path.isdir(filepath+name): 
        labels.append(name)
        
text = []   
texts_aux = [] 
texts_labels = [] 

In [6]:
# Extracting the content of each .txt document and label the content of each article.
for label in labels:
    path = filepath+label+'/*.txt'
    files = glob.glob(path)
    for name in files:
        try:
            with open(name, 'r',encoding='ISO-8859-1') as f:
                texts_aux.append(f.read())
                texts_aux.append(label)
        except IOError as exc:
            if exc.errno != errno.EISDIR:
                raise
        texts_labels.append(texts_aux)
        texts_aux=[]

In [7]:
# Labeled texts stored in numpy array
texts_labels_np = np.array(texts_labels)

In [8]:
# Labeled texts stored in pandas dataframe
df = pd.DataFrame(texts_labels, columns=['text','label'])

### Data Preparation

In [9]:
# First step: Tokenize each text
from nltk.tokenize import RegexpTokenizer

## Load library for removing stopwords
from nltk.corpus import stopwords
##nltk.download('stopwords') --> First time has to be downloaded

# Import libraries for stemming
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize
stemmer_ps = PorterStemmer()

from nltk.stem.cistem import Cistem
stemmer_cs = Cistem()

# Import lemmatization libraries
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 
#nltk.download('wordnet')--> First time has to be downloaded 

# Load stop words 
stop_words = stopwords.words('english')
#print(stop_words[:5])

In [10]:
# create tokenizer and process the raw text
tokenizer = RegexpTokenizer(r'\w+')
texts_clean = []
texts_aux = []
aux = []

for article in texts_labels_np:
        # Text to lower case
        text = article[0].lower()
        # Tokenize and Remove punctuation
        tokens = tokenizer.tokenize(text)
        # Remove stop words
        tokens = [word for word in tokens if word not in stop_words]
        # Stemming
        for token in tokens:
                aux.append(stemmer_cs.stem(token))
        tokens = aux
        
        texts_aux.append(tokens)
        texts_aux.append(article[1])
        texts_clean.append(texts_aux)
        texts_aux = []
        aux=[]

### Data Embedding

In [11]:
# Transforming labels into numbers [business, entertainment, politics, sport, tech] -- [0,1,2,3,4]
for text in texts_clean:
        if text[1]=='business':
                text[1]=0
        if text[1]=='entertainment':
                text[1]=1
        if text[1]=='politics':
                text[1]=2
        if text[1]=='sport':
                text[1]=3
        if text[1]=='tech':
                text[1]=4

text_clean_np = np.array(texts_clean)
text_clean_pd = pd.DataFrame(texts_labels, columns=['text','label'])

tokenized_texts = []
labels = []
for article in texts_clean:
        tokenized_texts.append(article[0])
        labels.append(article[1])

In [12]:
#x = np.load('../data/features/tokenized.npy')
x = np.array(tokenized_texts)
#y = np.load('../data/features/labels.npy')
y = np.array(labels)

### Count Vectorizer

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

def dummy_fun(doc):
    return doc

countvect = CountVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None, 
    min_df=2,
    max_df=0.5,
    ngram_range=(1,1))

X = countvect.fit_transform(x)
Y = y

print ("no of features extracted:", X.shape[1])

no of features extracted: 14850


### Split data into training, validation and test set and prepare them for text classification

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42, stratify=Y)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=42, stratify=y_train)

print ("train size:", X_train.shape)
print ("validation size:", X_val.shape)
print ("test size:", X_test.shape)
print ("class distribution in training set:", pd.Series(y_train).value_counts())
print ("class distribution in validation set:", pd.Series(y_val).value_counts())
print ("class distribution in test set:", pd.Series(y_test).value_counts())

train size: (1089, 14850)
validation size: (468, 14850)
test size: (668, 14850)
class distribution in training set: 3    250
0    250
2    204
4    196
1    189
dtype: int64
class distribution in validation set: 3    107
0    107
2     88
4     85
1     81
dtype: int64
class distribution in test set: 3    154
0    153
2    125
4    120
1    116
dtype: int64


### Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [16]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))

Accuracy for C=0.01: 0.9700854700854701
Accuracy for C=0.05: 0.967948717948718
Accuracy for C=0.25: 0.967948717948718
Accuracy for C=0.5: 0.967948717948718
Accuracy for C=1: 0.967948717948718


In [17]:
model = LogisticRegression(C=0.01)
model.fit(X_train, y_train)
print ("Final Accuracy: %s" 
       % accuracy_score(y_test, model.predict(X_test)))

Final Accuracy: 0.9730538922155688


In [18]:
c_mat = confusion_matrix(y_test,model.predict(X_test))
print ("Confusion Matrix:\n", c_mat)

Confusion Matrix:
 [[147   0   4   1   1]
 [  0 114   2   0   0]
 [  5   1 118   0   1]
 [  1   0   0 153   0]
 [  1   0   0   1 118]]


### SVM

In [19]:
from sklearn.svm import LinearSVC

In [20]:
for c in [0.001, 0.005, 0.01, 0.05, 0.1]:    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))

Accuracy for C=0.001: 0.9658119658119658
Accuracy for C=0.005: 0.9658119658119658
Accuracy for C=0.01: 0.9658119658119658
Accuracy for C=0.05: 0.9615384615384616
Accuracy for C=0.1: 0.9615384615384616


In [21]:
model = LinearSVC(C=0.01)
model.fit(X_train, y_train)
print ("Final Accuracy: %s" 
       % accuracy_score(y_test, model.predict(X_test)))

Final Accuracy: 0.9775449101796407


In [22]:
c_mat = confusion_matrix(y_test,model.predict(X_test))
print ("Confusion Matrix:\n", c_mat)

Confusion Matrix:
 [[151   0   2   0   0]
 [  0 114   2   0   0]
 [  5   1 118   0   1]
 [  1   0   0 153   0]
 [  2   0   0   1 117]]


### Random Forest

In [23]:
from sklearn.ensemble import RandomForestClassifier

In [24]:
model = RandomForestClassifier(n_estimators=300, max_depth=150,n_jobs=1)
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=150, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [25]:
y_pred = model.predict(X_test)
c_mat = confusion_matrix(y_test,y_pred)
acc = accuracy_score(y_test,y_pred)

print ("Confusion Matrix:\n", c_mat)
print ("\nAccuracy: ",acc)

Confusion Matrix:
 [[150   0   3   0   0]
 [  3 112   1   0   0]
 [  9   0 115   0   1]
 [  0   0   0 154   0]
 [  6   3   1   0 110]]

Accuracy:  0.9595808383233533
