In [111]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the dataset paths defined later are relative
# ('drive/My Drive/...'), so they presumably rely on the working directory
# being /content - confirm before running outside a default Colab session.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [0]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [0]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB

import seaborn as sns
import matplotlib.pyplot as plt

In [115]:
# PubMed stop words, stored as ONE comma-separated string inside a
# single-element list. The trailing backslashes continue the string literal
# across source lines, so no newline characters end up in the string.
# Entries carry surrounding spaces, and the ", ," right after "a" leaves an
# empty field when the string is split on commas.
text = ['a, ,abbreviations, about, again, all, almost, also, although, always, among, an, and, another, any, are, as, at\
,be, because, been, before, being, between, both, but, by\
,can, could\
,did, do, does, done, due, during\
,each, either, enough, especially, etc\
,for, found, from, further\
,had, has, have, having, here, how, however\
,i, if, in, into, is, it, its, iv, itself\
,just\
,kg, km\
,made, mainly, make, may, mg, might, ml, mm, most, mostly, must\
,nearly, neither, new, no, nor\
,obtained, of, often, on, our, overall\
,perhaps, pmid, previously\
,quite\
,rather, really, regarding, result\
,seem, seen, several, should, show, showed, shown, shows, significantly, since, so, some, such\
,than, that, the, their, theirs, them, then, there, therefore, these, they, this, those, through, thus, to\
,upon, use, used, using\
,various, very\
,was, we, were, what, when, which, while, with, within, without, would']

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


# Split the comma-separated PubMed list into individual words.
pwords = text[0].split(',')
# Strip the padding around each entry and drop blank fields: the literal
# contains an empty field ("a, ,abbreviations") that would otherwise put the
# empty string '' into the stop-word collection.
pubmed_words = [w.strip() for w in pwords if w.strip()]
# NLTK's English stop-word list, as a set for cheap unions/lookups.
stop_words = set(stopwords.words("english"))


def get_all_stopwords():
    """Return the combined stop-word set.

    Returns:
        set[str]: a new set holding every NLTK English stop word plus every
        non-empty PubMed stop word. A fresh set is built on each call, so
        callers may mutate the result without affecting the module-level
        collections.
    """
    return stop_words.union(pubmed_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
# Both CSV files sit in the same Drive folder, so each path is derived from it.
DATA_DIR = 'drive/My Drive/Year5/csc4006 Final Project/data'
ab_path = DATA_DIR + '/abstract_id_text_label.csv'      # abstract dataset (per file name)
main_path = DATA_DIR + '/full_body_id_text_label.csv'   # full-body dataset (per file name)

In [117]:
# Load the tab-separated full-body dataset.
# (To run on abstracts instead, read ab_path with the same separator.)
raw_df = pd.read_csv(main_path, sep='\t')
# Discard every row that has a missing value in any column.
df = raw_df.dropna()
# Show the frame dimensions before and after the clean-up.
for frame in (raw_df, df):
    print(frame.shape)

(22681, 2)
(22672, 2)


In [0]:
# Encoder used in the next cell to turn the 'label' column into integer class ids.
le = LabelEncoder()

In [0]:
# Fit the encoder on the labels and store the resulting integer ids on the frame.
class_ids = le.fit_transform(df['label'])
df['class_label'] = class_ids
# Model inputs: raw document text as features, encoded label as target.
X = df['text']
y = df['class_label']
# (An earlier variant split df['text'] / df['class_label'] right here with
#  random_state=10 and test_size=0.3; the split now happens after vectorizing.)

In [120]:
# Sanity check: number of documents left after the dropna() clean-up.
len(X)

22672

In [121]:
from nltk.stem import WordNetLemmatizer
import re
# Download the WordNet corpus via NLTK (needed by the lemmatizer created below).
nltk.download('wordnet')

# NOTE: despite its name this object is a lemmatizer, not a stemmer.
# preprocess_text reads it through this module-level name.
stemmer = WordNetLemmatizer()
def preprocess_text(X, lemmatizer=None):
  """Clean and lemmatize a collection of raw documents.

  For every item of ``X`` (converted with ``str``) the function:

  1. replaces each non-word character with a space;
  2. removes every stand-alone single ASCII letter, wherever it occurs
     (start, middle, end, or several in a row);
  3. lower-cases the text and splits it on whitespace;
  4. lemmatizes each token and joins the tokens with single spaces.

  The earlier implementation tried to do step 2 with three patterns, two of
  which were defective: ``\\^[a-zA-Z]\\s+`` matched a literal caret (already
  stripped by step 1) instead of the start of the string, and
  ``\\s+[a-zA-Z]\\s+`` consumed the separating whitespace, so it skipped every
  other letter in runs such as ``"a b c"`` and never matched at the string
  edges. A single word-boundary pattern covers all of those cases, including
  the leading ``b`` left behind by stringified bytes literals.

  Args:
    X: iterable of documents (any objects; each is passed through ``str``).
    lemmatizer: optional object exposing ``lemmatize(word) -> str``. When
      omitted, the module-level ``stemmer`` is used.

  Returns:
    list[str]: one cleaned, space-separated string per input document, in
    input order. An empty input yields an empty list.
  """
  if lemmatizer is None:
    # Resolved at call time so the function can be defined before the global.
    lemmatizer = stemmer

  documents = []
  for sen in X:
    # Replace punctuation and other special characters with spaces.
    document = re.sub(r'\W', ' ', str(sen))

    # Drop isolated single letters; \b also matches at the string edges and
    # does not consume neighbouring whitespace, so consecutive letters go too.
    document = re.sub(r'\b[a-zA-Z]\b', ' ', document)

    # Lower-case, then tokenize; split() also collapses whitespace runs.
    words = document.lower().split()

    documents.append(' '.join(lemmatizer.lemmatize(word) for word in words))
  return documents
    

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
# Clean and lemmatize every document. X is still the raw 'text' column at this
# point; a later cell rebinds X to the count matrix, so do not re-run this cell
# after vectorizing without first re-creating X.
documents  = preprocess_text(X)

In [0]:
from sklearn.feature_extraction.text import CountVectorizer  

# Bag-of-words features: keep at most 2000 terms, ignoring terms that appear
# in fewer than 5 documents or in more than 70% of them.
# stop_words is passed as a list because recent scikit-learn releases validate
# the parameter type and reject a set; sorting keeps the list deterministic.
vectorizer = CountVectorizer(
    max_features=2000,
    min_df=5,
    max_df=0.7,
    stop_words=sorted(get_all_stopwords()),
)
# NOTE: X is rebound here from the raw text column to a dense count matrix.
# NOTE(review): the vocabulary is fitted on all documents, i.e. before the
# train/test split performed further down.
X = vectorizer.fit_transform(documents).toarray()  

In [124]:
# pickle is imported here because this cell runs before the notebook's only
# other `import pickle`; without it a fresh-kernel run fails with NameError.
import pickle

print(vectorizer.vocabulary_)
# Persist the term -> column-index mapping learned by the vectorizer.
with open('main_vocab.pkl', 'wb') as f:
  pickle.dump(vectorizer.vocabulary_, f)


{'neural': 1205, 'ensemble': 592, 'dynamic': 546, 'dorsal': 532, 'motor': 1171, 'cortex': 397, 'speech': 1710, 'people': 1327, 'edu': 557, 'contributed': 381, 'equally': 603, 'ca': 215, 'usa': 1920, 'engineering': 587, 'western': 1971, 'medical': 1113, 'center': 250, 'hospital': 835, 'school': 1610, 'institute': 923, 'brain': 206, 'science': 1612, 'brown': 211, 'general': 762, 'neuroscience': 1209, 'bio': 181, 'program': 1427, 'behavior': 166, 'whose': 1975, 'basis': 155, 'difficult': 496, 'resolution': 1557, 'neuron': 1206, 'human': 841, 'measurement': 1106, 'lack': 989, 'animal': 86, 'recorded': 1496, 'electrode': 570, 'array': 114, 'hand': 805, 'area': 111, 'implicated': 871, 'movement': 1173, 'challenge': 258, 'division': 523, 'major': 1071, 'body': 197, 'scale': 1605, 'word': 1983, 'trial': 1890, 'recording': 1497, 'interface': 938, 'population': 1374, 'feature': 685, 'reported': 1539, 'arm': 112, 'initial': 910, 'condition': 343, 'signal': 1666, 'followed': 726, 'suggests': 1778,

In [0]:
#from sklearn.feature_extraction.text import TfidfTransformer  
#tfidfconverter = TfidfTransformer()  
#X = tfidfconverter.fit_transform(X).toarray()  

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for evaluation; the fixed seed keeps the
# partition repeatable across runs.
split_parts = train_test_split(X, y, random_state=0, test_size=0.3)
X_train, X_test, y_train, y_test = split_parts

In [127]:
# Random forest with 100 trees and a fixed seed for repeatable fits.
forest_settings = dict(n_estimators=100, random_state=0)
classifier = RandomForestClassifier(**forest_settings)
# fit() stays the last expression so the notebook echoes the fitted estimator.
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [0]:
# Per-class probabilities for the held-out samples (used for log loss).
y_pred_prob = classifier.predict_proba(X_test)
# Hard class predictions for the same samples (used for report/accuracy).
y_pred = classifier.predict(X_test)

In [129]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, log_loss

#print(confusion_matrix(y_test,y_pred))  
# Per-class precision/recall/F1, then overall accuracy on the held-out split.
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))  
# The probability columns follow classifier.classes_. Passing those labels
# explicitly keeps log_loss aligned (and prevents it from raising) when the
# held-out split happens to lack one of the classes seen during training.
print(log_loss(y_test, y_pred_prob, labels=classifier.classes_))

              precision    recall  f1-score   support

           0       0.72      0.11      0.19       118
           1       0.51      0.41      0.46       201
           2       0.71      0.05      0.09       109
           3       0.60      0.84      0.70       657
           4       0.62      0.58      0.60       233
           5       0.65      0.64      0.64       253
           6       0.50      0.60      0.54       367
           7       0.00      0.00      0.00        20
           8       0.64      0.60      0.62       225
           9       0.64      0.73      0.68       268
          10       0.61      0.68      0.64       197
          11       0.60      0.77      0.67       388
          12       0.57      0.42      0.49       384
          13       0.43      0.33      0.38       466
          14       0.70      0.45      0.55       184
          15       0.57      0.86      0.68       638
          16       0.39      0.12      0.18       222
          17       0.73    

  'precision', 'predicted', average, warn_for)


In [0]:
##from sklearn.externals import joblib
##filename = 'finalized_model.sav'
##joblib.dump(classifier, filename)  

In [0]:
import pickle

# Serialize the fitted classifier with the newest pickle protocol available
# (equivalent to passing -1 as the protocol argument).
with open('bagofwords_classifier_maintext.pkl', 'wb') as picklefile:
    pickle.dump(classifier, picklefile, protocol=pickle.HIGHEST_PROTOCOL)