In [1]:
#Importing the raw data category wise
import sklearn.datasets as skd
categories = ['FS', 'Notes','Junk']
pages_dict = skd.load_files('/Users/baggu/Downloads/FT_ML_training_Data/Input/', categories= categories, encoding= 'ISO-8859-1')

In [2]:
#Defining the pre-preprocessing steps
import nltk
# Needed only once
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
import re, unidecode, string
from bs4 import BeautifulSoup
from nltk.stem.porter import PorterStemmer 
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

def remove_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text(separator=" ")
    return stripped_text

def remove_accented_chars(text):
    text = unidecode.unidecode(text)
    return text

def remove_numbers(text): 
    result = re.sub(r'\d+', '', text) 
    return result

def remove_slash_with_space(text): 
    return text.replace('\\', " ")

def remove_punctuation(text): 
    translator = str.maketrans('', '', string.punctuation) 
    return text.translate(translator) 

def text_lowercase(text): 
    return text.lower()     

def remove_whitespace(text): 
    return  " ".join(text.split()) 

def remove_stopwords(text): 
    stop_words = set(stopwords.words("english")) 
    word_tokens = word_tokenize(text) 
    filtered_text = [word for word in word_tokens if word not in stop_words] 
    return ' '.join(filtered_text)

def stem_words(text): 
    stemmer = PorterStemmer() 
    word_tokens = word_tokenize(text) 
    stems = [stemmer.stem(word) for word in word_tokens] 
    return ' '.join(stems)

def lemmatize_words(text): 
    lemmatizer = WordNetLemmatizer() 
    word_tokens = word_tokenize(text) 
    # provide context i.e. part-of-speech 
    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in word_tokens] 
    return ' '.join(lemmas) 

In [3]:
# Perform preprocessing
def perform_preprocessing(text):
    text = remove_html_tags(text)
    text = remove_accented_chars(text)
    text = remove_numbers(text)
    text = remove_stopwords(text)
    text = text_lowercase(text)
    text = remove_slash_with_space(text)
    text = remove_punctuation(text)
    text = stem_words(text)
    #text = lemmatize_words(text)
    text = remove_whitespace(text)
    return text

pages_dict.data = list(map(perform_preprocessing, pages_dict.data))

In [4]:
#Splitting the data into train and test set
from sklearn.model_selection import train_test_split
data_train, data_test, target_train, target_test = train_test_split(pages_dict['data'], pages_dict['target'], test_size=0.2, random_state=42)

train_pages_dict = {
    'data' : data_train,
    'target' : target_train
}

test_pages_dict = {
    'data' : data_test,
    'target' : target_test
}

In [8]:
# Assigning vectors to the input data
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_converter = TfidfVectorizer(stop_words='english', min_df=0.03, ngram_range=(1,2))
X_train_tfidf = tfidf_converter.fit_transform(train_pages_dict['data'])
X_train_tfidf.shape

# Saving model to disk
import pickle
pickle.dump(tfidf_converter, open('vectorizer.pkl','wb'))

In [9]:
#Classification using Naive Bayes
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train_tfidf, train_pages_dict['target'])

X_test_tfidf = tfidf_converter.transform(test_pages_dict['data'])
predicted = clf.predict(X_test_tfidf) 

from sklearn import metrics
from sklearn.metrics import accuracy_score
print('Accuracy(Naive Bayes):',accuracy_score(test_pages_dict['target'],predicted))
print(metrics.classification_report(test_pages_dict['target'],predicted,target_names=pages_dict.target_names))
metrics.confusion_matrix(test_pages_dict['target'],predicted)

pickle.dump(clf, open('nbmodel.pkl','wb'))

Accuracy(Naive Bayes): 0.8972766364070712
              precision    recall  f1-score   support

          FS       0.91      0.92      0.91       197
        Junk       0.95      0.88      0.92      1152
       Notes       0.83      0.91      0.87       744

    accuracy                           0.90      2093
   macro avg       0.89      0.91      0.90      2093
weighted avg       0.90      0.90      0.90      2093



array([[ 182,    0,   15],
       [   6, 1018,  128],
       [  13,   53,  678]])

In [10]:
#Classification using Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
classifier = RandomForestClassifier(n_estimators=1200, random_state=1)  #defining 1000 nodes 
rf = classifier.fit(X_train_tfidf, train_pages_dict['target'])  

y_pred = classifier.predict(X_test_tfidf) 

from sklearn import metrics
from sklearn.metrics import accuracy_score
print('Accuracy(RF):',accuracy_score(test_pages_dict['target'],y_pred))
print(metrics.classification_report(test_pages_dict['target'],y_pred,target_names=pages_dict.target_names))
metrics.confusion_matrix(test_pages_dict['target'],y_pred)

pickle.dump(rf, open('rfmodel.pkl','wb'))

Accuracy(RF): 0.9541328236980411
              precision    recall  f1-score   support

          FS       0.99      0.95      0.97       197
        Junk       0.96      0.96      0.96      1152
       Notes       0.93      0.94      0.94       744

    accuracy                           0.95      2093
   macro avg       0.96      0.95      0.96      2093
weighted avg       0.95      0.95      0.95      2093



array([[ 188,    3,    6],
       [   0, 1106,   46],
       [   1,   40,  703]])

In [11]:
pickle.dump(rf, open('rfmodel.pkl','wb'))

In [12]:
#Classification using Random Forest

# import sys
# !{sys.executable} -m pip install xgboost

from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train_tfidf, train_pages_dict['target'])

y_pred_xgb = xgb.predict(X_test_tfidf)

from sklearn import metrics
from sklearn.metrics import accuracy_score
print('Accuracy(RF):',accuracy_score(test_pages_dict['target'],y_pred_xgb))
print(metrics.classification_report(test_pages_dict['target'],y_pred_xgb,target_names=pages_dict.target_names))
metrics.confusion_matrix(test_pages_dict['target'],y_pred_xgb)

pickle.dump(xgb, open('xgbmodel.pkl','wb'))

Accuracy(RF): 0.9617773530817009
              precision    recall  f1-score   support

          FS       0.99      0.96      0.97       197
        Junk       0.96      0.97      0.97      1152
       Notes       0.95      0.95      0.95       744

    accuracy                           0.96      2093
   macro avg       0.97      0.96      0.96      2093
weighted avg       0.96      0.96      0.96      2093

