<a href="https://colab.research.google.com/github/praveenpareek11/task_loktra/blob/master/document_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import Dependencies

In [0]:
import re
import os
import string
import Stemmer
import pickle
import numpy as np

# stopwords
from nltk.corpus import stopwords

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

### Text Pre-processing

In [0]:
stemmer = Stemmer.Stemmer('english')
stopWords = stopwords.words('english')

bullet_set = set(['*', '-', '\u2022', '\u2023', '\u25e6', '\u2043', '\u204c', '\u204d', '\u2219'])
punctuation_set = set(string.punctuation)
stopChar = set(list(bullet_set)+list(punctuation_set))
stopCharStr = "".join(stopChar)
translator = str.maketrans(stopCharStr, ' '*len(stopCharStr))

In [0]:
def clean_text_tokenizer(text):
    # remove line break
    text = text.replace("\n", " ").lower()
    # remove non-ascii char
    text = "".join([i if ord(i) < 128 else " " for i in text])
    # remove bullets and punctuation
    text = text.translate(translator)
    # remove mupltiple space 
    text = re.sub(" +", " ", text)
    # remove stopwords
    words = [word for word in text.split() if word not in stopWords]
    # stemming
    return stemmer.stemWords(words)

### Read Training Data

In [0]:
data = open("trainingdata.txt").read().split("\n")[1:-1]

In [0]:
label_list = [i[0] for i in data]
text_list = [i[2:].strip() for i in data]

In [0]:
assert len(label_list)==len(text_list)

In [0]:
print("Total Documents: {}".format(len(text_list)))

Total Documents: 5485


### TF-IDF Vectorizer

In [0]:
# experiment with vectorizer parameters
vectorizer = TfidfVectorizer(tokenizer=clean_text_tokenizer, 
                             ngram_range=(1,3), 
                             min_df=0.02, 
                             max_df=0.98)

In [0]:
X = vectorizer.fit_transform(text_list)

In [0]:
print("Matrix Shape: {}".format(str(X.shape)))

Matrix Shape: (5485, 691)


In [0]:
y = np.array(label_list)

### Train Valid Split

In [0]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)

### Define Gradient Boosting Classifier

In [0]:
# experiement with different classifiers
gbc = GradientBoostingClassifier(verbose=1)

In [0]:
gbc.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1        4083.7320           31.18s
         2        3534.7335           31.36s
         3        3121.7482           31.06s
         4        2784.3658           30.84s
         5        2478.2923           30.53s
         6        2249.8527           30.26s
         7        2041.1772           29.93s
         8        1867.1168           29.58s
         9        1717.6471           29.26s
        10        1578.3760           28.92s
        20         829.6136           25.68s
        30         555.3787           22.45s
        40         414.3608           19.15s
        50         335.8983           15.94s
        60         283.2168           12.72s
        70         244.1038            9.52s
        80         215.3740            6.34s
        90         189.8999            3.17s
       100         168.7094            0.00s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=1,
                           warm_start=False)

### Model Evalution

In [0]:
gbc.score(X_valid, y_valid)

0.9361896080218779

##### Classification Report on Validation Data

In [0]:
y_pred = gbc.predict(X_valid)
print(classification_report(y_valid, y_pred))

              precision    recall  f1-score   support

           1       0.98      0.96      0.97       556
           2       0.89      0.98      0.93       323
           3       0.96      0.91      0.93        54
           4       0.78      0.70      0.74        20
           5       0.50      0.22      0.31         9
           6       0.93      0.81      0.87        53
           7       0.91      0.82      0.86        38
           8       0.88      0.86      0.87        44

    accuracy                           0.94      1097
   macro avg       0.85      0.78      0.81      1097
weighted avg       0.94      0.94      0.93      1097

