In [1]:
import sklearn.datasets as skd
import os

# Class sub-folders to load from the data directory. Note that the reports
# further down list the classes as FS, Junk, Notes, so label indices follow
# pages_dict.target_names rather than the order of this list.
categories = ['FS', 'Notes', 'Junk']

# Location of the labelled page dump. Set the FT_ML_DATA_DIR environment
# variable to run this notebook on another machine; when it is unset, the
# original author's local path is used so existing behavior is unchanged.
DATA_DIR = os.environ.get('FT_ML_DATA_DIR', '/Users/baggu/Downloads/FT_ML_training_Data/Input/')

# Files are decoded as ISO-8859-1 so 'data' holds text strings rather than raw bytes.
pages_dict = skd.load_files(DATA_DIR, categories=categories, encoding='ISO-8859-1')

In [2]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the pages for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified, and the class supports reported further
# down are uneven -- consider whether stratify= is wanted here.
data_train, data_test, target_train, target_test = train_test_split(
    pages_dict.data,
    pages_dict.target,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Bundle each split's documents and labels under the same 'data' / 'target'
# keys that pages_dict uses, so later cells can index both splits the same way.
train_pages_dict = dict(data=data_train, target=target_train)

test_pages_dict = dict(data=data_test, target=target_test)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
# Term-count features over unigrams and bigrams, with English stop words removed.
# min_df=0.03 (a float is a document-frequency proportion in scikit-learn):
# the author's tuning note records that 0.03 worked better than 0.02 and 0.04;
# the metric used for that comparison is not recorded in this notebook.
count_vect = CountVectorizer(
    stop_words='english',
    min_df=0.03,
    ngram_range=(1, 2),
)

# The vocabulary is learned from the training split only.
X_train_tf = count_vect.fit_transform(train_pages_dict['data'])

# Last expression of the cell: displays (n_documents, n_features).
X_train_tf.shape

(8372, 1231)

In [5]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() when the installed version
# provides it, and fall back to the legacy method on older installs.
feature_names_getter = getattr(count_vect, 'get_feature_names_out', None) or count_vect.get_feature_names

# Convert every entry to a plain str so the printed list looks the same whether
# the call returned a Python list (legacy) or a NumPy array (new API).
print([str(name) for name in feature_names_getter()])

['00', '000', '000 000', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '11', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '12', '12 months', '120', '121', '122', '123', '125', '129', '13', '130', '132', '133', '134', '135', '138', '139', '14', '140', '143', '148', '149', '15', '150', '16', '160', '17', '170', '173', '174', '18', '19', '20', '200', '2013', '2014', '2015', '2016', '2017', '2018', '2018 19', '2018 2017', '2018 2019', '2018 annual', '2018 rm', '2019', '2019 20', '2019 2018', '2019 2020', '2019 annual', '2019 notes', '2020', '2020 2019', '2021', '21', '22', '23', '24', '25', '250', '26', '27', '28', '29', '30', '30 june', '300', '31', '31 2015', '31 2017', '31 2018', '31 2019', '31 2020', '31 december', '31 march', '31st', '31st march', '32', '33', '34', '35', '36', '37', '38', '39', '40', '400', '41', '42', '43', '44', '441', '442', '45', '46', '47', '476', '48', '4



In [6]:
from sklearn.feature_extraction.text import TfidfTransformer
# Learn the IDF weights from the training counts, then re-weight those same counts.
tfidf_transformer = TfidfTransformer().fit(X_train_tf)
X_train_tfidf = tfidf_transformer.transform(X_train_tf)

# Last expression of the cell: displays the shape of the TF-IDF matrix.
X_train_tfidf.shape

(8372, 1231)

In [7]:
# Disabled alternative: an auto-sklearn classifier fitted on the same inputs.
# Left commented out; the Naive Bayes model below is the one actually trained.
#import autosklearn.classification
#clf = autosklearn.classification.AutoSklearnClassifier()
#clf.fit(X_train_tfidf, train_pages_dict['target'])

# Baseline model: multinomial Naive Bayes with default settings, trained on the
# TF-IDF features of the training split.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
# Fits in place; as the cell's last expression the fitted estimator is also displayed.
clf.fit(X_train_tfidf, train_pages_dict['target'])

In [8]:
# Push the held-out pages through the already-fitted vectorizer and TF-IDF
# transformer. Only transform() is called, so nothing is re-fitted on test data.
X_test_tf = count_vect.transform(test_pages_dict['data'])
X_test_tfidf = tfidf_transformer.transform(X_test_tf)
# Class predictions from the Naive Bayes model; X_test_tfidf is reused by later cells.
predicted = clf.predict(X_test_tfidf) 

In [9]:
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Score the Naive Bayes predictions against the held-out labels.
print(
    'Accuracy(Naive Bayes):',
    accuracy_score(y_true=test_pages_dict['target'], y_pred=predicted),
)

# Per-class precision / recall / F1, labelled with the dataset's class names.
print(
    metrics.classification_report(
        y_true=test_pages_dict['target'],
        y_pred=predicted,
        target_names=pages_dict.target_names,
    )
)

# Left as the final expression so the notebook displays the matrix.
metrics.confusion_matrix(y_true=test_pages_dict['target'], y_pred=predicted)

Accuracy(Naive Bayes): 0.8862876254180602
              precision    recall  f1-score   support

          FS       0.78      0.94      0.85       197
        Junk       0.95      0.87      0.91      1152
       Notes       0.83      0.89      0.86       744

    accuracy                           0.89      2093
   macro avg       0.85      0.90      0.87      2093
weighted avg       0.89      0.89      0.89      2093



array([[ 185,    0,   12],
       [  21, 1007,  124],
       [  32,   49,  663]])

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Random forest of 1200 trees (n_estimators is the number of trees);
# random_state is fixed so repeated runs build the same forest.
classifier = RandomForestClassifier(
    n_estimators=1200,
    random_state=1,
)

# fit() trains in place; its return value is kept under the name `rf` as before.
rf = classifier.fit(X_train_tfidf, train_pages_dict['target'])

# Predict on the TF-IDF features of the held-out pages.
y_pred = classifier.predict(X_test_tfidf)

print(
    'Accuracy(RF):',
    accuracy_score(y_true=test_pages_dict['target'], y_pred=y_pred),
)
print(
    metrics.classification_report(
        y_true=test_pages_dict['target'],
        y_pred=y_pred,
        target_names=pages_dict.target_names,
    )
)

# Left as the final expression so the notebook displays the matrix.
metrics.confusion_matrix(y_true=test_pages_dict['target'], y_pred=y_pred)

Accuracy(RF): 0.9569995222169135
              precision    recall  f1-score   support

          FS       0.99      0.94      0.97       197
        Junk       0.96      0.97      0.96      1152
       Notes       0.94      0.95      0.94       744

    accuracy                           0.96      2093
   macro avg       0.96      0.95      0.96      2093
weighted avg       0.96      0.96      0.96      2093



array([[ 186,    4,    7],
       [   1, 1112,   39],
       [   1,   38,  705]])

In [11]:
import sys
!{sys.executable} -m pip install xgboost

from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train_tfidf, train_pages_dict['target'])

y_pred_xgb = xgb.predict(X_test_tfidf)

from sklearn import metrics
from sklearn.metrics import accuracy_score
print('Accuracy(RF):',accuracy_score(test_pages_dict['target'],y_pred_xgb))
print(metrics.classification_report(test_pages_dict['target'],y_pred_xgb,target_names=pages_dict.target_names))
metrics.confusion_matrix(test_pages_dict['target'],y_pred_xgb)

Accuracy(RF): 0.9646440516005733
              precision    recall  f1-score   support

          FS       0.98      0.96      0.97       197
        Junk       0.97      0.97      0.97      1152
       Notes       0.95      0.95      0.95       744

    accuracy                           0.96      2093
   macro avg       0.97      0.96      0.97      2093
weighted avg       0.96      0.96      0.96      2093



array([[ 190,    2,    5],
       [   2, 1119,   31],
       [   1,   33,  710]])