#  ML with TF-IDF (movie rating)
- read and prepare training data
- split training data into train and test
- run different classifier
    - naive Bayes
    - random forests
    - support vector machine
- evaluate / compare

### Tasks:
- pre-process features in different ways:  
    - lower case, stemmer (porter, ours)
    - replace NER
    - add n-grams (bi-grams)
    - compute TF-IDF / Frequencies
    - filter most important terms /test different vector size
    - add w2v of n-best terms/document 
- analyse relevance of features
- modify parameters of classifiers
- fold cross validation (using different training/testing) sets

In [2]:
import os
import sys
import os.path
import glob
import numpy as np
import pandas as pd
import subprocess
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import os.path


import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tag import pos_tag


In [3]:
##############################################################
# Read n files from directory and preprocess Documnt collection
# D[doc][words]
# doc: basename of file
# words: list of tokenized words

def readDocumentCollection(dic, n=0, verbose = 0) :
    D = {}
    i = 0

    # sort documents to retrieve th first n documents
    for f in sorted(Path(dic).iterdir()):
        # read n documents
        if (n > 0 and i == n): break
        i += 1
        with f.open('r', encoding='utf-8') as fhin: data = fhin.read()
            
        # get the file basename as index for document
        b = os.path.basename(f).split(".")[0]
        D.setdefault(b, [])
        
        # process document as a flat list of tokens
        D[b].extend(nltk.word_tokenize(data))
        if(verbose == 1): print(b,"\t", f.resolve())
    return D


In [4]:
## Compare reading already tokenized files

In [5]:
def readTokenizedCollection(dic) :
    D = {}

    for f in sorted(Path(dic).iterdir()):            
        with f.open('r', encoding='utf-8') as fhin: data = fhin.read()
        b = os.path.basename(f).split(".")[0]
        D[b] = data.split()
  
    return D


In [6]:
from timeit import default_timer as timer

start = timer()
DTpos = readDocumentCollection("/data/critt/shared/resources/aclImdb/test/pos/")
DTneg = readDocumentCollection("/data/critt/shared/resources/aclImdb/test/neg/")
DNpos = readDocumentCollection("/data/critt/shared/resources/aclImdb/train/pos/")
DNneg = readDocumentCollection("/data/critt/shared/resources/aclImdb/train/neg/")
end = timer()
print(end - start) # Time in seconds, e.g. 5.38091952400282



147.4545726919896


In [7]:
from timeit import default_timer as timer

start = timer()
CTpos = readTokenizedCollection("/data/critt/shared/resources/aclImdb/test/posTokenized/")
CTneg = readTokenizedCollection("/data/critt/shared/resources/aclImdb/test/negTokenized/")
CNpos = readTokenizedCollection("/data/critt/shared/resources/aclImdb/train/posTokenized/")
CNneg = readTokenizedCollection("/data/critt/shared/resources/aclImdb/train/negTokenized/")
end = timer()
print(end - start) # Time in seconds, e.g. 5.38091952400282

6.791869009000948


In [8]:
for i in DTpos: 
    if(DTpos[i] != CTpos[i]) :
        print(i)
print(len(CTpos.keys()), len(DTpos.keys()))

12500 12500


## TF-IDF
- documents collection $D: \{d_1 ... d_n\}$
- terms $t_i$

-----
Term Frequency (TF):
- TF($t,d$) = $\frac{\mbox{Number of times term $t$ appears in document $d$}} {\mbox{Total number of terms in the document $d$}}$

Inverse Document Frequncy (IDF):

- IDF($t,D$) = $log(\frac{\mbox{Number of documents $d$ in the corpus ($|d \in D|$)}} {\mbox{number of documents $d \in D$ where the term $t$ appears}})$

TF-IDF:
- TF-IDF($t, d, D$) = TF($t, d$) * IDF($t,D$)

-----
Functions:
- T = TF_IDF(D): Train TF_IDF structure for document collection D  
- W = TF_IDFvector(D, T): Return vector of tf-idf values
- L = labelVector(W, T, m=1): Return vector of labels, (m highest terms per document)
- B = bestTFIDF(D, n=10): Return union of n highest-ranked terms per document in collection
- nBestIndex(T, nBest): re-assign TF_IDF structure for nBest terms


In [9]:
# Train a TF-IDF structure
# count words in a collection of documents (D)
# where structure D[doc][word]
# return word/document dictionary: 
#   T['///---///'][doc] = number_words_in_doc
#   T[word]['D'] = occurancs_word_in_collection
#   T[word]['d'][doc]['f'] = occurances_word_in_doc
#   T[word]['d'][doc]['tf'] = term frequency : T[word]['d'][doc]['f'] / T["///--///"][doc]


def TF_IDF(D, T={}):
    for d in D: # d: document
        for w in D[d]: # w: words in document
                
            # count frequency of term w in documnt d
            T.setdefault(w, {})
            T[w].setdefault('d', {})
            T[w]['d'].setdefault(d, {})
            T[w]['d'][d].setdefault('f', 0)
            
            # increment word count for document 
            T[w]['d'][d]['f'] += 1
                
            # increment frequency of w only once per document d
            T[w].setdefault('D', 0)
            if(T[w]['d'][d]['f'] == 1):  
                T[w]['D'] += 1
                                  
            
        # count terms in document d
        T.setdefault("///d///", {})
        T["///d///"].setdefault(d, 0)
        T["///d///"][d] += len(D[d])

    # number of documents     
    T["///d///"].setdefault("#docs", 0)
    T["///d///"]["#docs"] += len(D.keys())
    
    
    # set of word indexes
    T["///d///"].setdefault("#include", set())
    
    # indexes of words
    T["///d///"].setdefault("idx", {})
       
    # compute tf-idf
    idx = 0 # index of word 
    for w in T: # d: document
        # Ignore if not a word
        if(w == "///d///"): continue
        
        # unique index of word 
        T[w]['idx'] = idx
        T["///d///"]['#include'] = T["///d///"]['#include'].union({idx})
        T["///d///"]['idx'][idx] = w
        idx +=1 
        
        # number of documents   /  
        idf = np.log(T["///d///"]["#docs"]/T[w]['D'])
        T[w]['idf'] = idf
        
        # compute tfidf
        for d in T[w]['d']: # d: document
            tf = T[w]['d'][d]['f'] / T["///d///"][d]  
            T[w]['d'][d]['tf'] = tf
            T[w]['d'][d]['tfidf'] = tf * idf
            
    # number of documents     
    T["///d///"].setdefault("#words", 0)
    T["///d///"]["#words"] = len(T["///d///"]['#include'])

    return T
 
#############################


# create tokenized word Vectors from documents in a collection  
def TFIDFvector(D, T):
    
    W = {} # dictionary of word vectors with tfidf values 
    for d in D:
        
        V = T["///d///"]["#words"] * [0] # allocate word vector, instantiate with 0
        H = {} # count frequency of words (terms) in Document
        cnt = 0 # count number of indexed terms (w) in document d
        for w in D[d]:
            if(w in T): 
                idx = T[w]['idx']
                if (idx in T["///d///"]['#include']) : 
                    H.setdefault(w, 0)
                    H[w] += 1
                    cnt += 1
        for w in H:
            idx = T[w]['idx']
            tf = H[w] / cnt          
            V[idx] = tf * T[w]['idf']
        W[d] = V
    return W


# generate m labels for word vectore 
def labelVector(W, T, m=1, verbose = 0):
    
    L = []
    for d in W:
        V = W[d]
        
        # rank Vector by value 
        R = [index for element, index in sorted(zip(V, range(len(V))), reverse=True)]

        l = ''
        for i in range(len(R)):
            r = R[i]
            if(V[r] == 0) : break
            
            # number of important keywords
            if (i == m): break
            l = f"{l}-{T['///d///'][r]}" # get the indexed word
            if(verbose) : print(f"{d}  {i}\t{float(V[r]):4.4}\t{r}\t {T['///d///'][r]}")
        L.append(f"{d}-{l}")
    return L


# print TF-IDF 
def printTfIdf(w, T) :
    print(f"{w:<8}\t#d:{T[w]['D']}\tidf:{T[w]['idf']:4.4}\tVln:{len(T.keys())}")
    for d in T[w]['d'] :
        f = T[w]['d'][d]['f']
        tf = T[w]['d'][d]['tf']
        idf = T[w]['idf']
        tfidf = T[w]['d'][d]['tfidf']

        print(f"{d:<8}\tcnt:{f}\ttf:{tf:4.4}\ttfidf:{tfidf:4.4}")              
        
                                 
#####################################################################
# create tokenized word Vectors from documents in a collection  
def bestTFIDF(D, T, n=3):
    W = set() # dictionary of word vectors with tfidf values 
    for d in D:
        V = {(i+1)*-1:"" for i in range(n)}
                  
        H = {} # count frequency of words (terms) in Document
        cnt = 0 # count number of indexed terms (w) in document d
        for w in D[d]:
            if(w in T): 
                idx = T[w]['idx']
                if (idx in T["///d///"]['#include']) : 
                    H.setdefault(w, 0)
                    H[w] += 1
                    cnt += 1
        for w in H:
            idx = T[w]['idx']
            tf = H[w] / cnt
            tfidf = tf * T[w]['idf']
            for i in sorted(V):
                if(tfidf > i): 
                    del V[i]
                    V[tfidf] = w
                    break 
        #print("d", d, V)
        for tfidf in V: W = W.union({V[tfidf]})
    return W

          
          
# create a list of joint n-highest ranking tf-idf values per document 
def nBestValues(W, T, n=10):
    
    idx = set() # set of highest tf-idf words
    for d in W:
        # rank Vector by value 
        V = W[d]
        R = [index for element, index in sorted(zip(V, range(len(V))), reverse=True)]
        
        for i in range(len(R)):
            r = R[i]
            if(V[r] == 0) : break # stop if index 
            if(i == n) : break # stop it index 
            idx = idx.union({T['///d///']['idx'][r]})   
    
    return idx

# create word Vectors from (the first) n documents in a collection  
def nBestIndex(T, nBest):
    
    idx = 0
    T["///d///"]['#include'] = set()
    T["///d///"]["#words"] = len(nBest)
    for w in nBest:
        T[w]['idx'] = idx
        T["///d///"]['#include'] = T["///d///"]['#include'].union({idx})
        idx += 1



## Read the training set

In [10]:
# number of documents
n = 1000

Dpos = readDocumentCollection("/data/critt/shared/resources/aclImdb/test/pos/", n=n)
# produce TF-IDF dictionary structure
T = TF_IDF(Dpos, T={})

# read document collection D
Dneg = readDocumentCollection("/data/critt/shared/resources/aclImdb/test/neg/", n=n)

# add TF-IDF model for pos and neg documents
T = TF_IDF(Dneg, T=T)

print(f"Number of different words: {T['///d///']['#words']} (i.e. length of TFIDF vector)")

Number of different words: 31610 (i.e. length of TFIDF vector)


### Best TF-IDF scores
reduce lengt of vectors: extract n-best TF-IDF


In [11]:
# Set of first highest-ranking words per document

posBest1 = bestTFIDF(Dpos, T, n=1)
negBest1 = bestTFIDF(Dneg, T, n=1)

best1 = posBest1.union(negBest1)
print(len(posBest1), len(negBest1), len(best1), T["///d///"]["#words"])

735 835 1530 31610


In [12]:
posBest1


{'!',
 '&',
 "'",
 "'Oppenheimer",
 "'Radio",
 "'ll",
 "'one",
 "'page",
 '*****',
 '***SPOILERS***',
 '*eye',
 '-',
 '--',
 '...',
 '.you',
 '10th',
 '12',
 '14-year-old',
 '1982',
 '3-D',
 '70s',
 '70s/early',
 '9PM',
 ';',
 '<',
 '=',
 '@',
 'AG',
 'ALi',
 'Alan',
 'America',
 'Ancken',
 'Anderson',
 'Andre',
 'Anna',
 'Antonioni',
 'Auteuil',
 'BABES',
 'BBC',
 'BEST',
 'Babette',
 'Bam',
 'Bams',
 'Bandit',
 'Barker',
 'Barry',
 'Basil',
 'Battleship',
 'Beller',
 'Bellucci',
 'Ben',
 'Bennett',
 'Benton',
 'Best',
 'Billy',
 'Blythe',
 'Bochner',
 'Bogosian',
 'Bollywood',
 'Bond',
 'Bonus',
 'Boone',
 'Bracco',
 'Bronenosets',
 'Bullet',
 'Buster',
 'CAT',
 'CKY',
 'COOLEY',
 'Cache',
 'Cassell',
 'Cassidy',
 'Chan',
 'Charlie',
 'Chatterly',
 'Chiba',
 'Chicago',
 'Chiller',
 'Chrisopher',
 'Christie',
 'Christina',
 'Christmas',
 'Château',
 'Clouds',
 'Cloverfield',
 'Coach',
 'Cobb',
 'Connecticut',
 'Constantine',
 'Cooley',
 'Cora',
 'Corby/White',
 'Coward',
 'Craig',
 'C

In [13]:
# Set of 2 highest-ranking words per document

posBest2 = bestTFIDF(Dpos, T, n=2)
negBest2 = bestTFIDF(Dneg, T, n=2)

best2 = posBest2.union(negBest2)
print(len(posBest2), len(negBest2), len(best2), T["///d///"]["#words"])

1205 1291 2410 31610


In [14]:
# Set of 3 highest-ranking words per document
posBest3 = bestTFIDF(Dpos, T, n=3)
negBest3 = bestTFIDF(Dneg, T, n=3)

best3 = posBest3.union(negBest3)
print(len(posBest3), len(negBest3), len(best3), T["///d///"]["#words"])

1603 1668 3137 31610


In [15]:
# re-index TF-IDF score
nBestIndex(T, best3)


## generate TF-IDF vectors

In [16]:
# compute TFIDF vectors
pos1 = TFIDFvector(Dpos, T)
neg1 = TFIDFvector(Dneg, T)

print(f"{len(pos1.keys())}, {len(neg1.keys())}, {len(pos1['0_10'])}, {T['///d///']['#words']}")

1000, 1000, 3137, 3137


In [17]:
pos2 = TFIDFvector(Dpos, T)
neg2 = TFIDFvector(Dneg, T)


## merge pos and neg datasets

In [18]:
# create training set for pos
TrainVecPos = pd.DataFrame([pos1[d] for d in pos1])
TrainVecPos["Label"] = 1
TrainVecPos["Doc"] = [d for d in Dpos]

# create training set for neg
TrainVecNeg = pd.DataFrame([neg1[d] for d in neg1])
TrainVecNeg["Label"] = 0
TrainVecNeg["Doc"] = [d for d in Dneg]

# merge dataset
TrainVecSet2 = pd.concat([TrainVecPos, TrainVecNeg], axis=0)

TrainVecSet2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3129,3130,3131,3132,3133,3134,3135,3136,Label,Doc
0,0.011778,0.017551,0.001028,0.0,0.001,0.0,0.014692,0.0,0.010347,0.009613,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0_10
1,0.001425,0.0,0.001088,0.0,0.002118,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,10000_7
2,0.001975,0.0,0.000302,0.0,0.000587,0.0,0.0,0.0,0.0,0.016922,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,10001_9
3,0.005504,0.0,0.001121,0.0,0.004362,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,10002_8
4,0.0,0.0,0.00094,0.0,0.000732,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,10003_8


In [19]:
TrainVecSet2.shape

(2000, 3139)

## prepare training and test set

In [20]:
# extracting training and test set 
from sklearn.model_selection import train_test_split

Y = TrainVecSet2[['Label', 'Doc']]
X = TrainVecSet2.drop(['Label', 'Doc'], 1)

trainX, testX, trainY, testY = train_test_split(X,Y, test_size = .25)

print(trainX.shape, testX.shape, testY.shape)
trainX.head()

(1500, 3137) (500, 3137) (500, 2)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3127,3128,3129,3130,3131,3132,3133,3134,3135,3136
427,0.009558,0.0,0.001095,0.0,0.004261,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
483,0.005924,0.0,0.001206,0.0,0.002347,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
437,0.00325,0.0,0.000745,0.0,0.000966,0.0,0.014188,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
184,0.0,0.0,0.001551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
626,0.007056,0.0,0.001832,0.0,0.0,0.0,0.0,0.0,0.004339,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Naive Bayes Classifier 

In [21]:
#Import Gaussian Naive Bayes model
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB

#Create a Gaussian Classifier
model = GaussianNB()

# Train the model using the training sets
model.fit(trainX, trainY["Label"])

#Predict Output
Y_Bayes = model.predict(testX)

target_names = ['Neg', 'Pos']
print(classification_report(testY["Label"], Y_Bayes, target_names=target_names))
print("Confusion Matrix:\n", confusion_matrix(y_true=testY["Label"], y_pred=Y_Bayes))


              precision    recall  f1-score   support

         Neg       0.70      0.68      0.69       253
         Pos       0.68      0.71      0.69       247

    accuracy                           0.69       500
   macro avg       0.69      0.69      0.69       500
weighted avg       0.69      0.69      0.69       500

Confusion Matrix:
 [[171  82]
 [ 72 175]]


# Random Forest

In [22]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

# Instantiate model with 1000 decision trees
rf = RandomForestClassifier(n_estimators = 1000, random_state = 42)

# Train the model on training data
rf.fit(trainX, trainY["Label"]);

In [23]:
# Use the forest's predict method on the test data
Y_rf = rf.predict(testX)

target_names = ['Neg', 'Pos']
print(classification_report(testY["Label"], Y_rf, target_names=target_names))
print("Confusion Matrix:\n", confusion_matrix(y_true=testY["Label"], y_pred=Y_rf))


              precision    recall  f1-score   support

         Neg       0.83      0.89      0.86       253
         Pos       0.88      0.81      0.85       247

    accuracy                           0.85       500
   macro avg       0.86      0.85      0.85       500
weighted avg       0.86      0.85      0.85       500

Confusion Matrix:
 [[226  27]
 [ 46 201]]


# Support Vector Machine (SVM)

In [24]:
#Import svm model
from sklearn import svm

#Create a svm Classifier
#clf = svm.SVC(kernel='linear') # Linear Kernel
clf = svm.SVC(decision_function_shape='ovo')
#clf = svm.LinearSVC()

#Train the model using the training sets
clf.fit(trainX, trainY["Label"])

#Predict the response for test dataset
Y_clf = clf.predict(testX)

target_names = ['Neg', 'Pos']
print(classification_report(testY["Label"], Y_clf, target_names=target_names))
print("Confusion Matrix:\n", confusion_matrix(y_true=testY["Label"], y_pred=Y_clf))


              precision    recall  f1-score   support

         Neg       0.85      0.84      0.85       253
         Pos       0.84      0.85      0.85       247

    accuracy                           0.85       500
   macro avg       0.85      0.85      0.85       500
weighted avg       0.85      0.85      0.85       500

Confusion Matrix:
 [[212  41]
 [ 36 211]]


# K-means

In [36]:
from sklearn.cluster import KMeans
k_means = KMeans(n_clusters=2, init = 'random', n_init = 10, max_iter = 300, tol = 1e-04, random_state = 0)

k_means.fit(trainX, trainY["Label"])

#Predict the response for test dataset
Y_kmeans = k_means.predict(testX)

target_names = ['Neg', 'Pos']
print(classification_report(testY["Label"], Y_kmeans, target_names=target_names))
print("Confusion Matrix:\n", confusion_matrix(y_true=testY["Label"], y_pred=Y_kmeans))




              precision    recall  f1-score   support

         Neg       0.58      0.03      0.05       253
         Pos       0.50      0.98      0.66       247

    accuracy                           0.50       500
   macro avg       0.54      0.50      0.36       500
weighted avg       0.54      0.50      0.35       500

Confusion Matrix:
 [[  7 246]
 [  5 242]]


# Decision Tree

In [43]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(random_state = 100)

dtree.fit(trainX, trainY["Label"])

#Predict the response for test dataset
Y_dtree = dtree.predict(testX)

target_names = ['Neg', 'Pos']
print(classification_report(testY["Label"], Y_dtree, target_names=target_names))
print("Confusion Matrix:\n", confusion_matrix(y_true=testY["Label"], y_pred=Y_dtree))




              precision    recall  f1-score   support

         Neg       0.68      0.68      0.68       253
         Pos       0.67      0.68      0.67       247

    accuracy                           0.68       500
   macro avg       0.68      0.68      0.68       500
weighted avg       0.68      0.68      0.68       500

Confusion Matrix:
 [[172  81]
 [ 80 167]]


# SGD

In [44]:
from sklearn.linear_model import SGDClassifier
SGD = SGDClassifier(max_iter=10000)
SGD.fit(trainX, trainY["Label"])

#Predict the response for test dataset
Y_SGD = SGD.predict(testX)

target_names = ['Neg', 'Pos']
print(classification_report(testY["Label"], Y_SGD, target_names=target_names))
print("Confusion Matrix:\n", confusion_matrix(y_true=testY["Label"], y_pred=Y_SGD))       

              precision    recall  f1-score   support

         Neg       0.89      0.76      0.82       253
         Pos       0.79      0.91      0.84       247

    accuracy                           0.83       500
   macro avg       0.84      0.83      0.83       500
weighted avg       0.84      0.83      0.83       500

Confusion Matrix:
 [[193  60]
 [ 23 224]]


# DBSCAN

In [46]:
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=0.6, min_samples=8, metric='euclidean')
#dbscan.fit(trainX, trainY["Label"])

#Predict the response for test dataset
Y_dbscan = dbscan.fit_predict(testX)

target_names = ['Neg', 'Pos']
print(classification_report(testY["Label"], Y_dbscan, target_names=target_names))
print("Confusion Matrix:\n", confusion_matrix(y_true=testY["Label"], y_pred=Y_dbscan))  

              precision    recall  f1-score   support

         Neg       0.51      1.00      0.67       253
         Pos       0.00      0.00      0.00       247

    accuracy                           0.51       500
   macro avg       0.25      0.50      0.34       500
weighted avg       0.26      0.51      0.34       500

Confusion Matrix:
 [[253   0]
 [247   0]]


  _warn_prf(average, modifier, msg_start, len(result))


# KNN

In [49]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 2)
knn.fit(trainX, trainY["Label"])

#Predict the response for test dataset
Y_knn = knn.predict(testX)

target_names = ['Neg', 'Pos']
print(classification_report(testY["Label"], Y_knn, target_names=target_names))
print("Confusion Matrix:\n", confusion_matrix(y_true=testY["Label"], y_pred=Y_knn))       

              precision    recall  f1-score   support

         Neg       0.56      1.00      0.71       253
         Pos       0.98      0.19      0.32       247

    accuracy                           0.60       500
   macro avg       0.77      0.59      0.52       500
weighted avg       0.77      0.60      0.52       500

Confusion Matrix:
 [[252   1]
 [200  47]]
