
#### In this project, I will pre-process the text and train a text classifier using different feature representation techniques.

In [3]:
import pandas as pd
import re

In [4]:
data = pd.read_csv('FPB.csv',header = None,names = ['sentiments','headlines'],encoding = 'ISO-8859-1')

In [5]:
data.head()

Unnamed: 0,sentiments,headlines
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


### Part 1: Text Pre-Processing

In [4]:
import nltk
nltk.download('punkt') # downloads a model

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
#tokenization
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

In [8]:
word_token = []
for i in range(len(data)):
    sentence = re.sub(r'[^\w\s]', ' ', data.iloc[i,1]) #remove alphanumeric characters
    token = word_tokenize(sentence)
    word_token.append(token)

In [9]:
%pprint

Pretty printing has been turned OFF


In [10]:
#results of tokenization for the first 5 sentences 
word_token[:5]

[['According', 'to', 'Gran', 'the', 'company', 'has', 'no', 'plans', 'to', 'move', 'all', 'production', 'to', 'Russia', 'although', 'that', 'is', 'where', 'the', 'company', 'is', 'growing'], ['Technopolis', 'plans', 'to', 'develop', 'in', 'stages', 'an', 'area', 'of', 'no', 'less', 'than', '100', '000', 'square', 'meters', 'in', 'order', 'to', 'host', 'companies', 'working', 'in', 'computer', 'technologies', 'and', 'telecommunications', 'the', 'statement', 'said'], ['The', 'international', 'electronic', 'industry', 'company', 'Elcoteq', 'has', 'laid', 'off', 'tens', 'of', 'employees', 'from', 'its', 'Tallinn', 'facility', 'contrary', 'to', 'earlier', 'layoffs', 'the', 'company', 'contracted', 'the', 'ranks', 'of', 'its', 'office', 'workers', 'the', 'daily', 'Postimees', 'reported'], ['With', 'the', 'new', 'production', 'plant', 'the', 'company', 'would', 'increase', 'its', 'capacity', 'to', 'meet', 'the', 'expected', 'increase', 'in', 'demand', 'and', 'would', 'improve', 'the', 'use', 

I used below function to process the text. I first removed alphanumeric characters, then I performed stemming of words based on PorterStemmer. I didn't remove stopwords as doing so will decrease the size of the corpus and the model performance. 

In [11]:
#results of stemming for the first 5 sentences 
nltk.download('stopwords') # <--- this is new
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
from nltk.stem import PorterStemmer 

ps = PorterStemmer() 

# return a list of tokens
def pre_processing_by_nltk(doc, stemming = True, need_sent = False):
    # step 1: get sentences 
    sentences = re.sub(r'[^\w\s]', ' ', doc)
    sentences = sent_tokenize(sentences)
    # step 2: get tokens
    tokens = []
    for sent in sentences:
        words = word_tokenize(sent)
        # step 3 (optional): stemming
        if stemming:
            words = [ps.stem(word) for word in words]
        if need_sent:
            tokens.append(words)
        else:
            tokens += words
    return [w.lower() for w in tokens]

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
stem_list = []
for i in range(len(data)):
    token = pre_processing_by_nltk(data.iloc[i,1])
    stem_list.append(token)

In [13]:
#results of tokenization&stemming for the first 5 sentences 
stem_list[:5]

[['accord', 'to', 'gran', 'the', 'compani', 'ha', 'no', 'plan', 'to', 'move', 'all', 'product', 'to', 'russia', 'although', 'that', 'is', 'where', 'the', 'compani', 'is', 'grow'], ['technopoli', 'plan', 'to', 'develop', 'in', 'stage', 'an', 'area', 'of', 'no', 'less', 'than', '100', '000', 'squar', 'meter', 'in', 'order', 'to', 'host', 'compani', 'work', 'in', 'comput', 'technolog', 'and', 'telecommun', 'the', 'statement', 'said'], ['the', 'intern', 'electron', 'industri', 'compani', 'elcoteq', 'ha', 'laid', 'off', 'ten', 'of', 'employe', 'from', 'it', 'tallinn', 'facil', 'contrari', 'to', 'earlier', 'layoff', 'the', 'compani', 'contract', 'the', 'rank', 'of', 'it', 'offic', 'worker', 'the', 'daili', 'postime', 'report'], ['with', 'the', 'new', 'product', 'plant', 'the', 'compani', 'would', 'increas', 'it', 'capac', 'to', 'meet', 'the', 'expect', 'increas', 'in', 'demand', 'and', 'would', 'improv', 'the', 'use', 'of', 'raw', 'materi', 'and', 'therefor', 'increas', 'the', 'product', 'pr

### Part 2: Bag Of Words (20 points)

In [15]:
#split test and train data
from sklearn.model_selection import train_test_split

# splitting the train-test sets
X_train, X_test, y_train, y_test = train_test_split(data.headlines, data.sentiments, random_state=42, test_size=0.2, shuffle=True)

In the following part, I created a word frequency dictionary based on the train dataset. Keys of the freq are words that contained in the train dataset, and values associated with each key are times that word appears in the document.

In [49]:
from collections import defaultdict
freq = defaultdict(int)


corpus = ' '.join(list(X_train))
new_corpus = re.sub(r'[^\w\s]', ' ', corpus)
raw_tokens = new_corpus.lower().split()
raw_tokens = [ps.stem(word) for word in raw_tokens]
raw_tokens_stop = [w.lower() for w in raw_tokens if w.lower() not in stop]

In [50]:
print(len(raw_tokens))
print(len(raw_tokens_stop))

82506
55014


In [17]:
# a dictionary that contains frequency of a certain word
for token in raw_tokens:
    freq[token] += 1

In [18]:
from math import log
IDF, vocab = dict(), dict()
for token in freq:
    vocab[token] = len(vocab) #create a fix index of all words
    IDF[token] = log(1 + len(X_train) / freq[token]) #


In [20]:
IDF['<UNK>'] = 1
vocab['<UNK>'] = len(vocab)

In [21]:
index_list = vocab.keys()

### Train the classifier using method 1
In this method, the feature is represented as a binary-valued vector of dimension equal to the size of the vocabulary. The value at an index is 1 if the word corresponding to that index is present in the document, else 0.
When I first train the model, I received a warning indicating the model doesn't converge. Therefore I increased the number of iteration to solve the problem.

In [26]:
def vocabEXIST(doc,index,freqdic):
    tokens = pre_processing_by_nltk(doc)
    x= []
    for vob in index:
        if vob not in tokens:
            x.append(0)
        else:
            x.append(1)
    return x

In [27]:
X_train_1 = []
X_test_1 = []
for doc in X_train: #create a feature vector 
    X_train_1.append(vocabEXIST(doc, index_list, freq))
for doc in X_test:
    X_test_1.append(vocabEXIST(doc, index_list, freq))
    

In [28]:
from sklearn.linear_model import LogisticRegression
M1 = LogisticRegression(random_state=0,max_iter=1000).fit(X_train_1,y_train)
predict_y1 = M1.predict(X_test_1)

In [30]:
import sklearn
macro_f1_1 = sklearn.metrics.f1_score(y_test, predict_y1,average='macro')
micro_f1_1 = sklearn.metrics.f1_score(y_test, predict_y1,average='micro')
y_predict_prob = M1.predict_proba(X_test_1)
auc_1 = sklearn.metrics.roc_auc_score(y_test,y_predict_prob,multi_class = 'ovr')

print('AUROC is',round(auc_1,2),', macro-f1 score is', round(macro_f1_1,2),', micro-f1 score',round(micro_f1_1,2))


AUROC is 0.89 , macro-f1 score is 0.72 , micro-f1 score 0.77


### Train the classifier using method 2
In this method, the feature is represented by a vector of dimension equal to the size of the vocabulary where the value corresponding to each word is its frequency in the document.


In [32]:
def vocabfreq(doc,index,freqdic):
    tokens = pre_processing_by_nltk(doc)
    x= []
    for vob in index:
        if vob not in tokens:
            x.append(0)
        else:
            x.append(freqdic[vob])
    return x

In [33]:
X_train_2 = []
X_test_2 = []
for doc in X_train:
    X_train_2.append(vocabfreq(doc, index_list, freq))
for doc in X_test:
    X_test_2.append(vocabfreq(doc, index_list, freq))
    

In [39]:
M2 = LogisticRegression(random_state=0,max_iter=2000).fit(X_train_2,y_train)
predict_y2 = M2.predict(X_test_2)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [41]:
macro_f1_2 = sklearn.metrics.f1_score(y_test, predict_y2,average='macro')
micro_f1_2 = sklearn.metrics.f1_score(y_test, predict_y2,average='micro')
y_predict_prob = M2.predict_proba(X_test_2)
auc_2 = sklearn.metrics.roc_auc_score(y_test,y_predict_prob,multi_class = 'ovr')

print('AUROC is',round(auc_2,2),' macro-f1 score is', round(macro_f1_2,2),'micro-f1 score',round(micro_f1_2,2))


AUROC is 0.86  macro-f1 score is 0.69 micro-f1 score 0.74


### Train the classifier using method 3
In this method, the feature is represented by a vector of dimension equal to the size of the vocabulary where the value corresponding to each word is its tf-idf value.

In [42]:
# method 3
def tfidf_feature_extractor(doc, vocab, IDF):
    tokens = pre_processing_by_nltk(doc)
    for i, token in enumerate(tokens):
        if token not in vocab:
            tokens[i] = '<UNK>'
    TF = defaultdict(int)
    for token in tokens:
        TF[token] += 1
    x = [0] * len(vocab)
    for token in set(tokens):
        tfidf = log(TF[token] + 1) * IDF[token]
        token_id = vocab[token]
#         print(token, TF[token], IDF[token])
        x[token_id] = tfidf # this will be a dense matrix
    return x

In [43]:
X_train_3 = []
X_test_3 = []

for doc in X_train:
    X_train_3.append(tfidf_feature_extractor(doc, vocab, IDF))

for doc in X_test:
    X_test_3.append(tfidf_feature_extractor(doc, vocab, IDF))  

In [44]:
M3 = LogisticRegression(random_state=0).fit(X_train_3,y_train)
predict_y3 = M3.predict(X_test_3)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [45]:
macro_f1_3 = sklearn.metrics.f1_score(y_test, predict_y3,average='macro')
micro_f1_3 = sklearn.metrics.f1_score(y_test, predict_y3,average='micro')
y_predict_prob = M3.predict_proba(X_test_3)
auc_3 = sklearn.metrics.roc_auc_score(y_test,y_predict_prob,multi_class = 'ovr')

print('AUROC is',round(auc_3,2),' macro-f1 score is', round(macro_f1_3,2),'micro-f1 score',round(micro_f1_3,2))


AUROC is 0.87  macro-f1 score is 0.72 micro-f1 score 0.76


In [46]:
auc = [auc_1,auc_2,auc_3]
mi_f1 = [micro_f1_1,micro_f1_2,micro_f1_3]
ma_f1 = [macro_f1_1,macro_f1_2,macro_f1_3]
sum_table = pd.DataFrame().assign(auc = auc,macro_f1 =ma_f1 ,micro_f1 =mi_f1 )

In [47]:
sum_table.index = ['Binary','Frequency','TF-IDF']

In [48]:
sum_table

Unnamed: 0,auc,macro_f1,micro_f1
Binary,0.894445,0.724276,0.768041
Frequency,0.862403,0.691797,0.738144
TF-IDF,0.87249,0.723114,0.758763


For this specific text classifier, I will choose the method 1 since it has the highest AUC, macro-f1 score, and micro-f1 score. The first model dominates the other 2 based on the three metric we choose.