In [8]:
import pandas as pd
import os
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
import time
import pickle

In [9]:
data_path = 'data/ag_news_csv/' #location of dataset

In [10]:
test_data = pd.read_csv(os.path.join(data_path, "test.csv"), names = ["class", "title", "text"]) # read test_data
train_data = pd.read_csv(os.path.join(data_path, "train.csv"), names = ["class", "title", "text"]) # read train_data

In [11]:
def text_preprocessing(text):
    
    text = str(text)
    
    stop = set(stopwords.words('english')) # get stopwords
    porter_stemmer = PorterStemmer() # stemmer

    tokens = [word.lower() for sentence in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sentence)] # tokenize
    
    tokens = [porter_stemmer.stem(word) for word in tokens if word not in stop] # get rid of stop words
    
    tokens = [word for word in tokens if len(word) > 2] # get rid of one size character 
    
    return tokens

In [12]:
def bigram(tokens): # bigram function
    
    bigrm = [] 
    
    for i,t in enumerate(tokens[:-1]):
        bigrm.append(tokens[i] + ' ' + tokens[i+1]) #put words next to each other into one 
        
    return bigrm

In [13]:
a = text_preprocessing(train_data['title'][0])

In [14]:
a

['wall', 'st.', 'bear', 'claw', 'back', 'black', 'reuter']

In [17]:
print("preprocessing starts")

start_time = time.time() #starting time

train_X, train_Y, test_X, test_Y = [], [], [], [] # list for each train_x, train_y, test_x, test_y

# for train data
for i, row in train_data.iterrows(): # put all the tokens resulted from text_preprocessing texts and titles and bigramming texts and titles
    tokens = text_preprocessing([row['title']]) + bigram(text_preprocessing([row['title']])) + text_preprocessing(row['text']) + bigram(text_preprocessing(row['text']))
    train_X.append(tokens)
    
    cls = row['class'] - 1
    train_Y.append(cls)
    
# for test data
for i, row in test_data.iterrows():
    tokens = text_preprocessing([row['title']]) + bigram(text_preprocessing([row['title']])) + text_preprocessing(row['text']) + bigram(text_preprocessing(row['text']))
    test_X.append(tokens)
    
    cls = row['class'] - 1
    test_Y.append(cls)

print("preprocessing finished")
time_elapsed = time.time() - start_time # get the time it took to preprocess
print('Preprocessing complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

preprocessing starts
preprocessing finished
Preprocessing complete in 10m 41s


In [18]:
dataDict = {'train_X':train_X,'train_Y':train_Y,'test_X':test_X,'test_Y':train_Y}

In [19]:
with open("data.pickle", "wb") as fw:
    pickle.dump(dataDict, fw)