In [12]:
import os
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

### Traditional machine learning models on 20 news group data

I trained the data on 3 variants and I discussed the results in report doc:  

1) bag of words as tokenizer and logistic regression 

2) TF-IDF tokenizer and Logistic regression 

3) TF-IDF tokenizer and SVM.

### Read data

In [3]:
# Path to documents.
dir_path = os.getcwd()
rel_path = "data"
data_path = os.path.join(dir_path, rel_path)
news_groups = [f for f in os.listdir(data_path)]

In [4]:
# Preprocess data: Remove all special characters, convert to lower case.
def data_preprocess(cur):
        cur = cur.lower() # convert to lower case
        cur = re.sub(r'[\w\.-]+@[\w\.-]+',' ',cur) # remove email ids
        cur = re.sub("[^a-zA-Z,.']", ' ', cur) # remove special characters and numbers
        cur = re.sub('\.\.+', ' ', cur) # remove unnecessary periods.
        cur = " ".join(cur.split())
        return cur

In [14]:
# Prepare dataset: Read data from docs. 
X = []
y = []

category_indexes = {v:i for i,v in enumerate(news_groups)}

for category in news_groups:
    cls = []
    f_path = os.path.join(data_path,category)
    # Read data from all docs.
    for files in os.listdir(f_path):
        text = ''
        path = os.path.join(f_path,files)
        with open(path,'r',errors='ignore',encoding="utf8") as file:
            cur_doc = data_preprocess(file.read().replace('\n',' '))
            X.append(cur_doc)
        y.append(int(category_indexes[category]))


In [15]:
# train test split
X_train,X_test,y_train,y_test= train_test_split(X,y,stratify=y,test_size=0.2, random_state=9)
print('number of training samples:', len(X_train))
print('number of test samples:', len(X_test))

number of training samples: 16333
number of test samples: 4084


In [16]:
# make train and test dataframes
train_df = pd.DataFrame({'doc':X_train,
                         'labels':y_train})
test_df = pd.DataFrame({'doc':X_test,
                         'labels':y_test})

### Model 1: Bag of Words + Logistic regression

Let's train a model using a simple count vectorizer. The bag-of-words model is a simplified representation of the raw data. In this model, a text (such as a sentence or a document) is represented as the bag (multiset) of its words. Bag-of-words representations discard grammar, order, and structure in the text, but track occurances. To use this model, Sklearn offers a `CountVectorizer` class which returns a sparse matrix and basically does the same specified before, but which has some configurable options. 

#### Transformation and word frequencies

In [19]:
# Setting the vectorizer just like we would set a model

cvec = CountVectorizer()

# Fitting the vectorizer on our training data

cvec.fit(X_train)

CountVectorizer()

In [20]:
# Lets check the length of our data that is in a vectorized state

len(cvec.get_feature_names())

78380

One of the features included by CountVectorizer is the in-built tool of stop_words. In NLP, stop words are words which are filtered out usually after processing our data. Though stop_words in this case refers to the most common words in a language (and more often than not, this is the case). there is no single universal list of stop words used by all natural language processing tools, and indeed not all tools even use such a list.

In [22]:
# Lets use the stop_words argument to remove words like "and, the, a"

cvec = CountVectorizer(stop_words='english')

# Fit our vectorizer using our train data

cvec.fit(X_train)

# Transform training data

X_train_cvec = cvec.transform(X_train)
X_test_cvec = cvec.transform(X_test)

In [25]:
X_test_cvec = cvec.transform(X_test)

#### Fit a logistic regression model

In [29]:
lr = LogisticRegression(solver='lbfgs', multi_class='ovr',max_iter=1000)
lr.fit(X_train_cvec, y_train)
print(lr.score(X_train_cvec, y_train))
print(lr.score(X_test_cvec, y_test))

0.9674891324312741
0.821743388834476


In [30]:

def docm(y_true, y_pred,labels):
    cm = confusion_matrix(y_true, y_pred)
    cols = ['p_'+c for c in labels]
    df = pd.DataFrame(cm, index=labels, columns=cols)
    return df

In [31]:
docm(y_test,lr.predict(X_test_cvec),labels=news_groups)

Unnamed: 0,p_alt.atheism,p_comp.graphics,p_comp.os.ms-windows.misc,p_comp.sys.ibm.pc.hardware,p_comp.sys.mac.hardware,p_comp.windows.x,p_misc.forsale,p_rec.autos,p_rec.motorcycles,p_rec.sport.baseball,p_rec.sport.hockey,p_sci.crypt,p_sci.electronics,p_sci.med,p_sci.space,p_soc.religion.christian,p_talk.politics.guns,p_talk.politics.mideast,p_talk.politics.misc,p_talk.religion.misc
alt.atheism,140,0,0,0,1,0,1,0,0,1,0,0,0,2,1,6,1,3,2,45
comp.graphics,0,161,10,6,2,7,5,1,0,1,0,1,3,2,1,1,0,0,1,0
comp.os.ms-windows.misc,0,9,159,9,7,8,4,0,0,0,0,0,2,0,1,0,0,0,1,0
comp.sys.ibm.pc.hardware,0,11,11,152,9,3,5,2,1,1,0,1,6,0,1,0,0,0,0,0
comp.sys.mac.hardware,0,4,4,12,164,0,7,0,1,1,0,1,6,0,0,0,0,0,0,0
comp.windows.x,0,11,11,6,0,167,4,0,0,0,0,0,2,0,0,0,0,0,0,0
misc.forsale,0,7,2,3,4,0,178,3,0,0,0,1,1,3,0,0,0,0,0,0
rec.autos,1,2,2,1,0,0,7,175,4,1,0,0,4,0,2,0,0,0,1,1
rec.motorcycles,0,1,1,1,1,0,10,1,180,1,1,0,0,0,0,1,2,0,0,0
rec.sport.baseball,1,1,0,0,1,0,2,1,0,189,2,0,0,0,0,3,0,0,0,0


Classification report of cbow and logistic regression model

In [32]:
print(classification_report(y_test,lr.predict(X_test_cvec),target_names=news_groups))

                          precision    recall  f1-score   support

             alt.atheism       0.72      0.69      0.71       203
           comp.graphics       0.71      0.80      0.75       202
 comp.os.ms-windows.misc       0.77      0.80      0.78       200
comp.sys.ibm.pc.hardware       0.75      0.75      0.75       203
   comp.sys.mac.hardware       0.84      0.82      0.83       200
          comp.windows.x       0.87      0.83      0.85       201
            misc.forsale       0.71      0.88      0.79       202
               rec.autos       0.93      0.87      0.90       201
         rec.motorcycles       0.96      0.90      0.93       200
      rec.sport.baseball       0.92      0.94      0.93       200
        rec.sport.hockey       0.97      0.95      0.96       200
               sci.crypt       0.92      0.89      0.90       200
         sci.electronics       0.85      0.81      0.83       202
                 sci.med       0.90      0.94      0.92       200
         

### Model 2:  tf-idf + logistic regression


A tf-idf score tells us which words are most discriminating between documents. Words that occur a lot in one document but don't occur in many documents contain a great deal of discriminating power.

- This weight is a statistical measure used to evaluate how important a word is to a document in a collection (aka corpus).


- The importance increases with the number of times a word appears in the document but is offset by the frequency of the word in the corpus.

The inverse document frequency is a measure of how much information the word provides, that is, whether the term is common or rare across all documents. It is the logarithmically scaled inverse fraction of the documents that contain the word plus one, obtained by dividing the total number of documents by the number of documents containing the term plus one, and then taking the logarithm of that quotient.

This enhances terms that are highly specific of a particular document, while suppressing terms that are common to most documents.


Term frequency `tf` is the frequency of a certain term in a document:

$$
\mathrm{tf}(t,d) = N_\text{term}
$$

Inverse document frequency `idf` is defined as the frequency of documents that contain that term over the whole corpus (logarithmically scaled and adjusted to give only positive results greater or equal to one):

$$
\mathrm{idf}(t, D) = 1+\log\left(\frac{1+N_\text{Documents}}{1+N_\text{Documents that contain term}}\right)
$$

Term frequency - Inverse Document Frequency (`tf-idf`) is calculated as:

$$
\text{tf-idf}(t,d,D) = \mathrm{tf}(t,d) \cdot \mathrm{idf}(t, D)
$$

Usually the obtained numbers are then rescaled in such a way that the tf-idf vector of each document has Euclidean length one.

To apply this method in an easy way, we're gonna use again another Sklearn tool: TfidfVectorizer

In [36]:
piped_model_2 = make_pipeline(TfidfVectorizer(stop_words='english',
                                      #sublinear_tf=True,
                                      max_df=0.3,
                                      max_features=4000),
                      LogisticRegression())
piped_model_2.fit(X_train, y_train)
y_pred = piped_model_2.predict(X_test)
print(accuracy_score(y_test, y_pred))
print("Number of features:", len(piped_model_2.steps[0][1].get_feature_names()))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7904015670910872
Number of features: 4000


Classification report of tf-idf + logistic regression model

In [37]:
print(classification_report(y_test,y_pred,target_names=news_groups))

                          precision    recall  f1-score   support

             alt.atheism       0.67      0.61      0.64       203
           comp.graphics       0.66      0.72      0.69       202
 comp.os.ms-windows.misc       0.72      0.70      0.71       200
comp.sys.ibm.pc.hardware       0.74      0.73      0.73       203
   comp.sys.mac.hardware       0.85      0.77      0.81       200
          comp.windows.x       0.82      0.82      0.82       201
            misc.forsale       0.73      0.84      0.78       202
               rec.autos       0.88      0.87      0.87       201
         rec.motorcycles       0.93      0.88      0.90       200
      rec.sport.baseball       0.90      0.90      0.90       200
        rec.sport.hockey       0.96      0.94      0.95       200
               sci.crypt       0.93      0.86      0.89       200
         sci.electronics       0.74      0.75      0.74       202
                 sci.med       0.89      0.90      0.89       200
         

### Model 3: tf-df + SVM

In [39]:
tvec = TfidfVectorizer(stop_words='english',
                       max_df=0.5,
                       max_features=1000)

In [40]:
tvec.fit(X_train)
X_train_tvec = tvec.transform(X_train)
X_test_tvec = tvec.transform(X_test)

In [59]:
# Parameter tuning
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'linear']}
grid = GridSearchCV(SVC(),param_grid)
grid.fit(X_train_tvec,y_train)
print(grid.best_estimator_)

SVC(C=0.1, gamma=0.01)


In [63]:
# Best fit
model = SVC(gamma=0.01,kernel='rbf',C=0.1)
model.fit(X_train_tvec, y_train)
y_pred = model.predict(X_test_tvec)
score = accuracy_score(y_test, y_pred)
score

0.732615083251714

In [43]:
cm = docm(y_test, y_pred, news_groups)
cm

Unnamed: 0,p_alt.atheism,p_comp.graphics,p_comp.os.ms-windows.misc,p_comp.sys.ibm.pc.hardware,p_comp.sys.mac.hardware,p_comp.windows.x,p_misc.forsale,p_rec.autos,p_rec.motorcycles,p_rec.sport.baseball,p_rec.sport.hockey,p_sci.crypt,p_sci.electronics,p_sci.med,p_sci.space,p_soc.religion.christian,p_talk.politics.guns,p_talk.politics.mideast,p_talk.politics.misc,p_talk.religion.misc
alt.atheism,117,0,1,0,0,0,0,0,2,1,0,0,0,3,1,11,2,2,10,53
comp.graphics,0,137,16,10,0,10,3,2,0,0,0,1,13,6,3,0,0,0,0,1
comp.os.ms-windows.misc,1,17,142,11,3,7,7,1,0,0,0,0,5,3,3,0,0,0,0,0
comp.sys.ibm.pc.hardware,0,9,14,140,16,4,5,0,0,2,0,0,9,4,0,0,0,0,0,0
comp.sys.mac.hardware,0,7,5,27,136,1,8,2,0,0,0,0,10,4,0,0,0,0,0,0
comp.windows.x,2,20,7,4,3,147,5,0,0,0,0,1,5,2,1,1,0,1,1,1
misc.forsale,1,6,2,8,8,0,163,3,0,1,2,0,4,2,0,0,1,0,0,1
rec.autos,2,2,1,1,1,2,6,160,7,2,0,0,5,7,3,0,0,0,2,0
rec.motorcycles,2,1,0,0,1,2,8,9,154,2,0,0,5,8,1,0,2,0,4,1
rec.sport.baseball,4,2,1,0,1,1,2,1,3,155,17,1,3,4,0,1,1,0,2,1


Classification report of tf-idf + SVM model

In [61]:
print(classification_report(y_test,y_pred,target_names=news_groups))

                          precision    recall  f1-score   support

             alt.atheism       0.58      0.58      0.58       203
           comp.graphics       0.60      0.68      0.63       202
 comp.os.ms-windows.misc       0.74      0.71      0.72       200
comp.sys.ibm.pc.hardware       0.64      0.69      0.66       203
   comp.sys.mac.hardware       0.76      0.68      0.72       200
          comp.windows.x       0.79      0.73      0.76       201
            misc.forsale       0.70      0.81      0.75       202
               rec.autos       0.83      0.80      0.81       201
         rec.motorcycles       0.87      0.77      0.82       200
      rec.sport.baseball       0.82      0.78      0.79       200
        rec.sport.hockey       0.89      0.82      0.85       200
               sci.crypt       0.94      0.77      0.85       200
         sci.electronics       0.65      0.69      0.67       202
                 sci.med       0.71      0.87      0.78       200
         