<h1>Text Classification</h1>

Classifying articles based on <i>'usefulness'</i> to the domain

In [119]:
#Loading required packages
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import naive_bayes, linear_model, svm

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

In [120]:
#Loading the dataset
#Target variable is labelled with 1 as 'Useful' and 0 as 'Not Useful'

data = pd.read_excel("Data - Text Classification.xlsx", usecols=[0,1,3,5])
data.set_index('number', inplace=True)
data.head()

Unnamed: 0_level_0,title,content,label
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Article 1,Pier 1 appoints interim CFO amid growing finan...,Pier on Wednesday reported that fourth quarter...,1
Article 2,Family Dollar to close nearly 400 stores,Dollar Tree on Wednesday announced that up to ...,0
Article 3,Having to share personal data turns consumers ...,According to a new Harris Poll survey of Ameri...,1
Article 4,Walgreens taps Narvar for online pickup return...,Customer experience platform Narvar and Walgre...,0
Article 5,TechStyle claims more than 5M active members,TechStyle Fashion Group which operates ShoeDaz...,1


In [121]:
import spacy
nlp = spacy.load('en')
from spacy.lang.en.stop_words import STOP_WORDS
nlp.Defaults.stop_words |= {"company", "companies", "companys"}
data['tokenized_content'] = data['content'].apply(nlp)

In [122]:
tokens = []

for doc in nlp.pipe(data['tokenized_content'].astype('unicode').values, batch_size=50, n_threads=3):
    if doc.is_parsed:
        tokens.append([n.lemma_ for n in doc if n.is_stop == False])      
    else:
        tokens.append(None)

data['tokenized_content'] = tokens
data['tokenized_content'] = data['tokenized_content'].apply(','.join)

In [123]:
#Split the dataset into training and test set
X_train, X_test, y_train, y_test = train_test_split(data.drop("label", axis = 1), data['label'], 
                                                    test_size = 0.15, random_state = 99)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(47, 3)
(47,)
(9, 3)
(9,)


# Feature Engineering

### 1. CountVectorizer

In [124]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
cv = CountVectorizer(lowercase=False, stop_words='english')
cv.fit(X_train['tokenized_content'])

X_train_count = cv.transform(X_train['tokenized_content'])
X_test_count = cv.transform(X_test['tokenized_content'])

### 2. TF-IDF Vectorizer

In [125]:
#using TFIDF vectorizer
tfidf = TfidfVectorizer(lowercase=False, ngram_range=(1,2))
tfidf.fit(X_train['tokenized_content'])
                        
X_train_tfidf = tfidf.transform(X_train['tokenized_content'])
X_test_tfidf = tfidf.transform(X_test['tokenized_content'])

### 3. Word Embeddings

In [126]:
vocabulary_size = 10000
tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(X_train['tokenized_content'])

train_sequences = pad_sequences(tokenizer.texts_to_sequences(X_train['tokenized_content']), maxlen=50)
test_sequences = pad_sequences(tokenizer.texts_to_sequences(X_test['tokenized_content']), maxlen=50)

In [127]:
embeddings_index = dict()
f = open('GloVe/glove.6B.100d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
    
f.close()

In [128]:
embedding_matrix = np.zeros((vocabulary_size, 100))
for word, index in tokenizer.word_index.items():
    if index > vocabulary_size - 1:
        break
    else:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

# Model Building

In [129]:
def model(classifier, train_features, train_label, test_features):
    # fit the training dataset on the classifier
    classifier.fit(train_features, train_label)
    
    # predict the labels on test dataset
    predictions = classifier.predict(test_features)

    return predictions, accuracy_score(predictions, y_test)

### 1. Naive Bayes Classifier

In [130]:
prediction, accuracy = model(naive_bayes.MultinomialNB(), X_train_count, y_train, X_test_count)
print("Naive Bayes using CountVectorizer - Prediction:", prediction)
print("Naive Bayes using CountVectorizer- Accuracy:", round(accuracy,4))

prediction, accuracy = model(naive_bayes.MultinomialNB(), X_train_tfidf, y_train, X_test_tfidf)
print("Naive Bayes using TF-IDF Vectorizer - Prediction:", prediction)
print("Naive Bayes using TF-IDF Vectorizer- Accuracy:", round(accuracy,4))

Naive Bayes using CountVectorizer - Prediction: [0 1 0 0 0 1 0 1 1]
Naive Bayes using CountVectorizer- Accuracy: 0.8889
Naive Bayes using TF-IDF Vectorizer - Prediction: [1 1 1 1 1 1 1 1 1]
Naive Bayes using TF-IDF Vectorizer- Accuracy: 0.5556


In [131]:
y_test

number
Article 50    0
Article 17    1
Article 8     0
Article 42    0
Article 7     1
Article 44    1
Article 31    0
Article 15    1
Article 52    1
Name: label, dtype: int64

### 2. Logistic Regression

In [132]:
prediction, accuracy = model(linear_model.LogisticRegression(), X_train_count, y_train, X_test_count)
print("Logistic Regression using CountVectorizer - Prediction:", prediction)
print("Logistic Regression using CountVectorizer - Accuracy:", round(accuracy,4))

prediction, accuracy = model(linear_model.LogisticRegression(), X_train_tfidf, y_train, X_test_tfidf)
print("Logistic Regression using TF-IDF Vectorizer - Prediction:", prediction)
print("Logistic Regression using TF-IDF Vectorizer- Accuracy:", round(accuracy,4))

Logistic Regression using CountVectorizer - Prediction: [0 0 0 0 0 1 1 1 1]
Logistic Regression using CountVectorizer - Accuracy: 0.6667
Logistic Regression using TF-IDF Vectorizer - Prediction: [1 1 1 0 0 1 0 1 1]
Logistic Regression using TF-IDF Vectorizer- Accuracy: 0.6667


### 3. Support Vector Classifier(SVC)

In [133]:
prediction, accuracy = model(svm.SVC(), X_train_count, y_train, X_test_count)
print("SVC using CountVectorizer - Prediction:", prediction)
print("SVC using CountVectorizer - Accuracy:", round(accuracy,4))

prediction, accuracy = model(svm.SVC(), X_train_tfidf, y_train, X_test_tfidf)
print("SVC using TF-IDF Vectorizer - Prediction:", prediction)
print("SVC using TF-IDF Vectorizer- Accuracy:", round(accuracy,4))

SVC using CountVectorizer - Prediction: [1 1 1 1 1 1 1 1 1]
SVC using CountVectorizer - Accuracy: 0.5556
SVC using TF-IDF Vectorizer - Prediction: [1 1 1 1 1 1 1 1 1]
SVC using TF-IDF Vectorizer- Accuracy: 0.5556


# Model Building using Deep Learning

### 4. Recurrent Neural Network - LSTM

In [134]:
model_lstm = Sequential()
model_lstm.add(Embedding(vocabulary_size, 100, input_length=50, weights=[embedding_matrix], trainable=False))
model_lstm.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(1, activation="sigmoid"))

model_lstm.compile(loss = "binary_crossentropy", optimizer='adam', metrics=['accuracy'])

In [135]:
model_lstm.fit(train_sequences, y_train, validation_data=(test_sequences, y_test), epochs=5)

Train on 47 samples, validate on 9 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x283ecd94550>

### 5. LSTM with Convolutional Neural Network

In [136]:
model_conv = Sequential()
model_conv.add(Embedding(vocabulary_size, 100, input_length=50, weights=[embedding_matrix], trainable=False))
model_conv.add(Dropout(0.2))
model_conv.add(Conv1D(64, 5, activation='relu'))
model_conv.add(MaxPooling1D(pool_size=4))
model_conv.add(LSTM(100))
model_conv.add(Dense(1, activation='sigmoid'))

model_conv.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [137]:
model_conv.fit(train_sequences, y_train, validation_data=(test_sequences, y_test), epochs = 5)

Train on 47 samples, validate on 9 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x283ecdc4b00>

<b>Naive Bayes with CountVectorizer has better accuracy compared to other models</b>