## Declaration of pre-processing functions

In [6]:
import nltk
from nltk.corpus import stopwords
from stemming.porter2 import stem
import re

def splitText(text):
    """Tokenise ``text`` and mask every number-bearing chunk.

    Tokens are maximal runs of word characters and apostrophes; any other
    character acts as a separator and is discarded. Within each token, every
    run of word characters that contains at least one digit is replaced by
    the placeholder ``'Numxyzabcd'``.

    Returns the list of (possibly masked) tokens in their original order.
    """
    number_chunk = r"\w*[\d]+\w*"
    tokens = []
    for raw_token in re.findall(r"[\w']+", text):
        tokens.append(re.sub(number_chunk, 'Numxyzabcd', raw_token))
    return tokens

_ENGLISH_STOP_WORDS = None  # populated on the first removeStopWords() call


def removeStopWords(word_list):
    """Remove English stop words from a list of tokens.

    Parameters
    ----------
    word_list : list of str
        Tokens to filter.

    Returns
    -------
    list of str
        The tokens of ``word_list`` that are not in NLTK's English stop-word
        list, in their original order.

    Notes
    -----
    The stop-word set is built once and reused; previously it was re-read
    from the NLTK corpus on every call, i.e. once per abstract.

    NOTE(review): matching is exact and therefore case-sensitive. If the
    NLTK list is lowercase-only (believed so -- confirm), capitalised stop
    words such as 'The' are kept. Behaviour is left unchanged here.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
    return [w for w in word_list if w not in _ENGLISH_STOP_WORDS]

def stemWords(word_list):
    """Return a new list holding ``stem(token)`` for each token, in order."""
    stemmed = []
    for token in word_list:
        stemmed.append(stem(token))
    return stemmed

def preProcessData(abstract):
    """Run the full pre-processing pipeline on one abstract.

    The text is tokenised with ``splitText``, filtered with
    ``removeStopWords`` and stemmed with ``stemWords``; the surviving tokens
    are returned as a single space-separated string.
    """
    tokens = splitText(abstract)
    kept_tokens = removeStopWords(tokens)
    stemmed_tokens = stemWords(kept_tokens)
    return ' '.join(stemmed_tokens)


In [5]:
# Smoke test for the pre-processing pipeline: each digit token should come
# back as the 'Numxyzabcd' placeholder, and the bare symbols (+ - , /) should
# not appear in the result because the tokeniser never matches them.
a = 'test that numbers like 1 2 3 + - 2 3 1 + , hi /'
preProcessData(a)

'test number like Numxyzabcd Numxyzabcd Numxyzabcd Numxyzabcd Numxyzabcd Numxyzabcd hi'

## Step 1: Remove stop words and stem training set.

In [4]:
import pickle
import pandas as pd

# Import the raw training data and pre-process every abstract.
df = pd.read_csv('../../data/train_in.csv')
# `.values` replaces `Series.as_matrix()`, which pandas 1.0 removed; both
# return the column as a NumPy array.
abstracts = df['abstract'].values
processedAbstracts = [preProcessData(a) for a in abstracts]

# Persist the pre-processed abstracts. The `with` block closes the handle, so
# the pickle is fully flushed to disk before any later cell reads it back.
with open('../../data/preProcessedListOfAbstracts.pkl', "wb") as fh:
    pickle.dump(processedAbstracts, fh)


## Split into training and test set.

In [30]:
import numpy as np
from sklearn.model_selection import train_test_split

# Load the pre-processed abstracts and their category labels.
with open('../../data/preProcessedListOfAbstracts.pkl', "rb") as fh:
    processedAbstracts = pickle.load(fh)
processedAbstracts = np.asarray(processedAbstracts)  # convert to array.
labels = pd.read_csv('../../data/train_out.csv')
# `.values` replaces `Series.as_matrix()`, which pandas 1.0 removed.
labels = labels['category'].values

# Split into abstract_train, labels_train, abstract_test, labels_test:
# 20% held out for testing, fixed seed so the split is reproducible.
abstract_train, abstract_test, labels_train, labels_test = train_test_split(
    processedAbstracts, labels, test_size=0.2, random_state=42)

# Save each part. `with` closes every handle so the files are flushed before
# the vectorisation cells load them again.
for _part, _path in (
    (abstract_train, '../../data/abstract_train.pkl'),
    (abstract_test, '../../data/abstract_test.pkl'),
    (labels_train, '../../data/labels_train.pkl'),
    (labels_test, '../../data/labels_test.pkl'),
):
    with open(_path, "wb") as fh:
        pickle.dump(_part, fh)

In [10]:
# Leftover scratch cell: a bare literal that only echoes 1 as the cell output
# and defines no names.
1

1

In [34]:
# Sanity check on the training split: a 1-D array with one entry per abstract.
abstract_train.shape

(70911,)

## Step 2a: Vectorise (freq)

In [32]:
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing


# Vectorise the text: fit a bag-of-words vocabulary on the training split and
# store per-document term counts (binary=False). The custom token_pattern
# also keeps single-character tokens, which scikit-learn's default pattern
# (two or more word characters) would drop.
with open('../../data/abstract_train.pkl', "rb") as fh:
    processedAbstracts = pickle.load(fh)
vectoriser = CountVectorizer(min_df=1, token_pattern=r'\b\w+\b', binary=False, encoding="utf-8")
x_train = vectoriser.fit_transform(processedAbstracts)

# Load the training labels and encode them as integers.
with open('../../data/labels_train.pkl', "rb") as fh:
    labels = pickle.load(fh)
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(labels)

# Save the fitted encoder and vectoriser along with the encoded data. `with`
# closes every handle so each pickle is flushed to disk.
for _obj, _path in (
    (le, '../../data/labelEncoder.pkl'),
    (vectoriser, '../../data/vectoriser_freq.pkl'),
    (x_train, '../../data/x_train_freq.pkl'),
    (y_train, '../../data/y_train.pkl'),
):
    with open(_path, "wb") as fh:
        pickle.dump(_obj, fh)

## Step 2b: Vectorise (binary)

In [39]:

# Vectorise the text again, this time recording only the presence/absence of
# each term (binary=True) instead of its count.
with open('../../data/abstract_train.pkl', "rb") as fh:
    processedAbstracts = pickle.load(fh)
vectoriser = CountVectorizer(min_df=1, token_pattern=r'\b\w+\b', binary=True, encoding="utf-8")
x_train = vectoriser.fit_transform(processedAbstracts)

# Save the fitted vectoriser and the binary matrix; `with` closes the handles
# so both pickles are flushed to disk.
with open('../../data/vectoriser_binary.pkl', "wb") as fh:
    pickle.dump(vectoriser, fh)
with open('../../data/x_train_binary.pkl', "wb") as fh:
    pickle.dump(x_train, fh)

## Step 3: Apply vectorisation to the test set

In [5]:
import pickle 

# NOTE(review): this cell reads and writes under 'data/', whereas the cells
# that pickle the vectorisers, label encoder and test split write under
# '../../data/'. Confirm which directory is intended before a fresh
# Restart & Run All; the paths are left exactly as they were.

# Load the fitted vectorisers and label encoder.
with open('data/vectoriser_binary.pkl', "rb") as fh:
    vectoriser_binary = pickle.load(fh)
with open('data/vectoriser_freq.pkl', "rb") as fh:
    vectoriser_freq = pickle.load(fh)
with open('data/labelEncoder.pkl', "rb") as fh:
    labelEncoder = pickle.load(fh)

# Load the held-out abstracts and labels.
with open('data/abstract_test.pkl', "rb") as fh:
    abstract_test = pickle.load(fh)
with open('data/labels_test.pkl', "rb") as fh:
    labels_test = pickle.load(fh)

# Only transform() is called here: the test data goes through the already
# fitted vocabulary and label mapping, nothing is re-fitted on it.
x_test_binary = vectoriser_binary.transform(abstract_test)
x_test_freq = vectoriser_freq.transform(abstract_test)
y_test = labelEncoder.transform(labels_test)

# Save the vectorised test set; `with` closes each handle so the pickles are
# flushed to disk.
for _obj, _path in (
    (x_test_binary, 'data/x_test_binary.pkl'),
    (x_test_freq, 'data/x_test_freq.pkl'),
    (y_test, 'data/y_test.pkl'),
):
    with open(_path, "wb") as fh:
        pickle.dump(_obj, fh)

## Step 4: Vectorise the entire dataset

In [9]:
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
import pandas as pd

# Load the full set of pre-processed abstracts and their labels (no
# train/test split in this step).
with open('data/preProcessedListOfAbstracts.pkl', "rb") as fh:
    processedAbstracts = pickle.load(fh)
processedAbstracts = np.asarray(processedAbstracts)  # convert to array.
labels = pd.read_csv('data/train_out.csv')
# `.values` replaces `Series.as_matrix()`, which pandas 1.0 removed.
labels = labels['category'].values

# Fit a binary (presence/absence) bag-of-words vectoriser on every abstract.
vectoriser = CountVectorizer(min_df=1, token_pattern=r'\b\w+\b', binary=True, encoding="utf-8")
x_train = vectoriser.fit_transform(processedAbstracts)

# Encode the labels as integers.
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(labels)

# Save everything under 'data_overall/'. `with` closes each handle so the
# pickles are flushed to disk.
for _obj, _path in (
    (le, 'data_overall/labelEncoder.pkl'),
    (vectoriser, 'data_overall/vectoriser_binary.pkl'),
    (x_train, 'data_overall/x_train_binary.pkl'),
    (y_train, 'data_overall/y_train.pkl'),
):
    with open(_path, "wb") as fh:
        pickle.dump(_obj, fh)