In [1]:
#Load the labelled text files into a dictionary-like object (keys used later: 'data', 'target')

import sklearn.datasets as skd

#Names of the category sub-folders to load
categories = ['FS', 'Notes', 'Junk']

#Directory that contains the category sub-folders; change this to your own Input folder.
input_dir = '/Users/baggu/Downloads/FT_ML_training_Data/Input/'
pages_dict = skd.load_files(
    input_dir,
    categories=categories,
    encoding='ISO-8859-1',
)

In [2]:
#Hold out 20% of the pages as a test set; the fixed random_state keeps the split repeatable

from sklearn.model_selection import train_test_split
data_train, data_test, target_train, target_test = train_test_split(
    pages_dict['data'],
    pages_dict['target'],
    test_size=0.2,
    random_state=42,
)

In [3]:
#Bundle each split's texts and labels under the same 'data' / 'target' keys as pages_dict

train_pages_dict = dict(data=data_train, target=target_train)

test_pages_dict = dict(data=data_test, target=target_test)

In [4]:
import re
import nltk
# Disabled alternative: WordNet lemmatisation instead of Porter stemming.
#from nltk.stem import WordNetLemmatizer
#lemmatizer = WordNetLemmatizer()
from nltk.stem import PorterStemmer
# Single stemmer instance, created once here and reused by my_cool_preprocessor for every word.
porter_stemmer=PorterStemmer()

def my_cool_preprocessor(text):
    """Normalise one raw document and return it as a string of stemmed words.

    Steps, in order:
      1. lower-case the text;
      2. delete every run of digits (nothing is inserted in its place);
      3. replace each non-word character with a single space;
      4. split on runs of whitespace and stem every piece with the
         module-level ``porter_stemmer``.

    Empty pieces produced by the split (for example when the cleaned text
    starts or ends with whitespace) are passed to the stemmer like any other
    piece.

    Args:
        text: the raw document string.

    Returns:
        The stemmed pieces joined with single spaces.
    """
    lowered = text.lower()
    no_digits = re.sub(r'\d+', '', lowered)
    cleaned = re.sub("\\W", " ", no_digits)  # remove special chars
    # Disabled option: collapse common connector words into one placeholder token.
    #cleaned=re.sub("\\s+(in|the|all|for|and|on)\\s+"," _connector_ ",cleaned)

    stems = []
    for token in re.split("\\s+", cleaned):
        # Disabled alternative: lemmatizer.lemmatize(word=token)
        stems.append(porter_stemmer.stem(word=token))
    return ' '.join(stems)
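# Example usage, kept for reference only (`cat_in_the_hat_docs` is a sample corpus that is
# not defined in the cells shown here):
#cv = CountVectorizer(cat_in_the_hat_docs,preprocessor=my_cool_preprocessor)
#count_vector=cv.fit_transform(cat_in_the_hat_docs)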

In [5]:
#Tokenize the training documents, drop English stop words and ignore infrequent terms
#Result: the document-term count matrix for the training set
#NOTE(review): the original author flagged that the stop words also need to be stemmed -
#the text is stemmed by the preprocessor, so an unstemmed stop-word list may miss stemmed forms.

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(
    preprocessor=my_cool_preprocessor,  # lower-case, strip digits/special chars, stem
    stop_words='english',
    ngram_range=(1,2),                  # single words and two-word sequences
    min_df=0.03,                        # float min_df: minimum share of documents a term must appear in
)
X_train_tf = count_vect.fit_transform(train_pages_dict['data'])
X_train_tf.shape



(8372, 978)

In [6]:
#Turn the raw count matrix into tf-idf features

from sklearn.feature_extraction.text import TfidfTransformer
#Learn the weighting from the training counts, then apply it to those same counts.
tfidf_transformer = TfidfTransformer().fit(X_train_tf)
X_train_tfidf = tfidf_transformer.transform(X_train_tf)
X_train_tfidf.shape

(8372, 978)

In [7]:
#Classification model using Naive Bayes

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

#Project the test pages through the vectorizer and tf-idf transformer fitted on the training set
X_test_tf = count_vect.transform(test_pages_dict['data'])
X_test_tfidf = tfidf_transformer.transform(X_test_tf)

#Fit the Naive Bayes model on the training features and predict the test set
clf = MultinomialNB().fit(X_train_tfidf, train_pages_dict['target'])
predicted = clf.predict(X_test_tfidf)

#Summary: overall accuracy, per-class report, then the confusion matrix as the cell output
print('Accuracy(Naive Bayes):', accuracy_score(test_pages_dict['target'], predicted))
print(metrics.classification_report(
    test_pages_dict['target'],
    predicted,
    target_names=pages_dict.target_names,
))
metrics.confusion_matrix(test_pages_dict['target'], predicted)

Accuracy(Naive Bayes): 0.9001433349259437
              precision    recall  f1-score   support

          FS       0.90      0.93      0.92       197
        Junk       0.95      0.89      0.92      1152
       Notes       0.83      0.91      0.87       744

    accuracy                           0.90      2093
   macro avg       0.89      0.91      0.90      2093
weighted avg       0.90      0.90      0.90      2093



array([[ 184,    0,   13],
       [   8, 1021,  123],
       [  13,   52,  679]])

In [8]:
#Classification model using random forest

from sklearn import metrics, preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

classifier = RandomForestClassifier(
    n_estimators=1200,  # a forest of 1200 trees
    random_state=1,     # fixed seed so repeated runs build the same forest
)
#rf holds the return value of fit(); the prediction below goes through `classifier`
rf = classifier.fit(X_train_tfidf, train_pages_dict['target'])

#Reuses X_test_tfidf computed in the Naive Bayes cell
y_pred = classifier.predict(X_test_tfidf)

#Summary: overall accuracy, per-class report, then the confusion matrix as the cell output
print('Accuracy(RF):', accuracy_score(test_pages_dict['target'], y_pred))
print(metrics.classification_report(
    test_pages_dict['target'],
    y_pred,
    target_names=pages_dict.target_names,
))
metrics.confusion_matrix(test_pages_dict['target'], y_pred)

Accuracy(RF): 0.9550883898709985
              precision    recall  f1-score   support

          FS       0.99      0.96      0.98       197
        Junk       0.96      0.96      0.96      1152
       Notes       0.93      0.94      0.94       744

    accuracy                           0.96      2093
   macro avg       0.96      0.96      0.96      2093
weighted avg       0.96      0.96      0.96      2093



array([[ 189,    3,    5],
       [   0, 1107,   45],
       [   1,   40,  703]])

In [9]:
#Classification model using xgb

import sys
!{sys.executable} -m pip install xgboost

from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train_tfidf, train_pages_dict['target'])

y_pred_xgb = xgb.predict(X_test_tfidf)

#Summary
from sklearn import metrics
from sklearn.metrics import accuracy_score
print('Accuracy(RF):',accuracy_score(test_pages_dict['target'],y_pred_xgb))
print(metrics.classification_report(test_pages_dict['target'],y_pred_xgb,target_names=pages_dict.target_names))
metrics.confusion_matrix(test_pages_dict['target'],y_pred_xgb)

Accuracy(RF): 0.9627329192546584
              precision    recall  f1-score   support

          FS       0.99      0.96      0.97       197
        Junk       0.97      0.97      0.97      1152
       Notes       0.95      0.95      0.95       744

    accuracy                           0.96      2093
   macro avg       0.97      0.96      0.96      2093
weighted avg       0.96      0.96      0.96      2093



array([[ 189,    3,    5],
       [   0, 1118,   34],
       [   2,   34,  708]])

In [10]:
pip install auto-sklearn

Collecting auto-sklearn
  Using cached auto_sklearn-0.14.7-py3-none-any.whl
Collecting pynisher<0.7,>=0.6.3
  Using cached pynisher-0.6.4-py3-none-any.whl
Collecting distro
  Using cached distro-1.7.0-py3-none-any.whl (20 kB)
Collecting liac-arff
  Using cached liac_arff-2.5.0-py3-none-any.whl
Collecting ConfigSpace<0.5,>=0.4.21
  Using cached ConfigSpace-0.4.21-cp39-cp39-macosx_10_9_x86_64.whl (882 kB)
Collecting smac<1.3,>=1.2
  Using cached smac-1.2-py3-none-any.whl
Collecting pyrfr<0.9,>=0.8.1
  Using cached pyrfr-0.8.2.tar.gz (296 kB)
Collecting distributed>=2012.12
  Using cached distributed-2022.5.0-py3-none-any.whl (856 kB)
Collecting emcee>=3.0.0
  Using cached emcee-3.1.2-py2.py3-none-any.whl (46 kB)
Building wheels for collected packages: pyrfr
  Building wheel for pyrfr (setup.py) ... [?25lerror
[31m  ERROR: Command errored out with exit status 1:
   command: /Users/baggu/opt/anaconda3/bin/python -u -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'/private

In [12]:
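# Auto-sklearn model search, left disabled.
# NOTE(review): if this is re-enabled, the metric names used below (average_precision, roc_auc,
# accuracy, f1, precision, recall, log_loss) are not imported in the cells shown here; they
# would need importing first, presumably from autosklearn.metrics - confirm.
# from autosklearn.classification import AutoSklearnClassifier
# model = AutoSklearnClassifier(time_left_for_this_task=600,
#                             max_models_on_disc=5,
#                             metric = average_precision,
#                             scoring_functions=[roc_auc, average_precision, accuracy, f1, precision, recall, log_loss])
# # perform the search
# model.fit(X_train_tfidf, train_pages_dict['target'])
# model.sprint_statistics()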