In [1]:
import collections
import html
import html.parser as parser
import os
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from num2word import word
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from wordcloud import STOPWORDS
from wordcloud import WordCloud

In [2]:
# Absolute locations of the train/test CSVs inside the sibling ``data`` directory.
train_file_path, test_file_path = (
    os.path.abspath(os.path.join(os.pardir, 'data', file_name))
    for file_name in ('train.csv', 'test.csv')
)

# Import Data Set

Each article is labelled with a class index from 1 to 4, where 1 = World, 2 = Sports, 3 = Business and 4 = Sci/Tech.

In [3]:
# Load the train and test splits, then stack them so both are cleaned together.
train = pd.read_csv(train_file_path)
test = pd.read_csv(test_file_path)
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# the test rows would repeat the index labels of the first train rows.
data = pd.concat([train, test], axis=0, ignore_index=True)

In [4]:
# Decode HTML entities and restore apostrophes that survive as a bare "#39;".
# The frame is rebuilt from train/test first so this cell can be re-run safely.
# ignore_index=True keeps the index unique across the two stacked splits.
data = pd.concat([train, test], axis=0, ignore_index=True)
# html.unescape is the documented API; html.parser only re-exports it incidentally.
data['Title'] = data['Title'].apply(html.unescape)
data['Description'] = data['Description'].apply(html.unescape)
# regex=False: "#39;" is a literal marker, not a pattern, so the result does not
# depend on the pandas version's default for ``regex``.
data['Title'] = data['Title'].str.replace("#39;", "'", regex=False)
data['Description'] = data['Description'].str.replace("#39;", "'", regex=False)

In [5]:
# Sanity check: the combined frame should have len(train) + len(test) rows.
print(*(frame.shape for frame in (train, test, data)))

(120000, 3) (7600, 3) (127600, 3)


In [6]:
# Preview the first rows of the combined frame after HTML clean-up.
data.head()

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


# Remove Punctuation and Stopwords 

In [7]:
# Lowercase the text, then turn every character outside a-z / 0-9 into a space.
data['Title'] = data['Title'].map(
    lambda text: re.sub(r'[^a-z0-9]', ' ', text.lower())
)
data['Description'] = data['Description'].map(
    lambda text: re.sub(r'[^a-z0-9]', ' ', text.lower())
)

In [8]:
def convert_num_to_word(words):
    """Spell out numeric tokens in a token list.

    Every token for which ``str.isnumeric()`` is true is replaced by the
    lowercased, whitespace-split output of ``word(token)``; all other tokens
    are kept unchanged. Order is preserved and a new list is returned.
    """
    converted = []
    for token in words:
        if not token.isnumeric():
            converted.append(token)
            continue
        # One number may expand to several words, so splice them all in.
        converted += [part.lower() for part in word(token).split()]
    return converted

# Tokenize on whitespace, then spell out the numeric tokens of each row.
data['Title'] = data['Title'].str.split().map(convert_num_to_word)
data['Description'] = data['Description'].str.split().map(convert_num_to_word)

In [9]:
def remove_stopword(words):
    """Return the tokens of ``words`` that are not in ``STOPWORDS``, in order."""
    return [token for token in words if token not in STOPWORDS]

# Drop stopwords from the token list of every row.
data['Title'] = data['Title'].map(remove_stopword)
data['Description'] = data['Description'].map(remove_stopword)

In [10]:
def remove_single_character(words):
    """Return the tokens of ``words`` that are longer than one character, in order."""
    return [token for token in words if len(token) > 1]

# Drop one-character (and empty) tokens from every row.
data['Title'] = data['Title'].map(remove_single_character)
data['Description'] = data['Description'].map(remove_single_character)

# Lemmatization

In [11]:
def lemmatization(words):
    """Lemmatize each token of ``words`` with a fresh ``WordNetLemmatizer``.

    Returns a new list with one lemmatized token per input token, in order.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(token) for token in words]

# Lemmatize the token list of every row.
data['Title'] = data['Title'].map(lemmatization)
data['Description'] = data['Description'].map(lemmatization)

In [12]:
# Second clean-up pass over the lemmatized tokens. Each column goes through
# number conversion, stopword removal and single-character removal, in that order.
data['Title'] = (
    data['Title']
    .apply(convert_num_to_word)
    .apply(remove_stopword)
    .apply(remove_single_character)
)
data['Description'] = (
    data['Description']
    .apply(convert_num_to_word)
    .apply(remove_stopword)
    .apply(remove_single_character)
)

# Combine Title and Description

In [13]:
# Both columns hold token lists, so `+` concatenates them row by row:
# each document is the title tokens followed by the description tokens.
data['Documents'] = data['Title'] + data['Description']

In [14]:
# Number of tokens in each combined document.
data['Word Count'] = data['Documents'].map(len)

In [15]:
# Preview the token lists and the word count for the first rows.
data.head()

Unnamed: 0,Class Index,Title,Description,Documents,Word Count
0,3,"[wall, st, bear, claw, back, black, reuters]","[reuters, short, seller, wall, street, dwindli...","[wall, st, bear, claw, back, black, reuters, r...",18
1,3,"[carlyle, look, toward, commercial, aerospace,...","[reuters, private, investment, firm, carlyle, ...","[carlyle, look, toward, commercial, aerospace,...",27
2,3,"[oil, economy, cloud, stock, outlook, reuters]","[reuters, soaring, crude, price, plus, worry, ...","[oil, economy, cloud, stock, outlook, reuters,...",24
3,3,"[iraq, halt, oil, export, main, southern, pipe...","[reuters, authority, halted, oil, export, flow...","[iraq, halt, oil, export, main, southern, pipe...",28
4,3,"[oil, price, soar, time, record, posing, new, ...","[afp, tearaway, world, oil, price, toppling, r...","[oil, price, soar, time, record, posing, new, ...",28


# Convert List to String

In [16]:
# Collapse each token list into one space-separated string.
data['Documents'] = data['Documents'].map(
    lambda tokens: ' '.join(str(token) for token in tokens)
)
# The separate token columns are no longer needed once Documents is a string.
data.drop(columns=['Title', 'Description'], inplace=True)

# Create Features 
### Counts per document: noun phrases, nouns, verbs, adjectives

In [17]:
# Number of noun phrases TextBlob extracts from each document.
data['Noun Phrases'] = data['Documents'].map(
    lambda document: len(TextBlob(document).noun_phrases)
)

In [18]:
# Count how often each part-of-speech tag occurs in every document.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, so
# tagging the full dataset is presumably slow. Consider sampling or caching.
data['Tags'] = data['Documents'].map(
    lambda document: collections.Counter(
        tag for _token, tag in TextBlob(document).tags
    )
)

KeyboardInterrupt: 

In [None]:
# pos tag list https://pythonprogramming.net/part-of-speech-tagging-nltk-tutorial/

# Sum the per-document tag counts over each part-of-speech family;
# a tag missing from a document contributes 0.
data['Noun Count'] = data['Tags'].apply(
    lambda counts: sum(counts.get(tag, 0) for tag in ('NN', 'NNS', 'NNP', 'NNPS'))
)
data['Adjective Count'] = data['Tags'].apply(
    lambda counts: sum(counts.get(tag, 0) for tag in ('JJ', 'JJR', 'JJS'))
)
data['Verb Count'] = data['Tags'].apply(
    lambda counts: sum(
        counts.get(tag, 0) for tag in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    )
)
# The raw tag counters are only an intermediate; drop them before export.
data.drop(columns=['Tags'], inplace=True)

# Convert Integer Categories to News Categories

In [None]:
#1-World, 2-Sports, 3-Business, 4-Sci/Tech
# The position in the list is the class index; index 0 is unused, hence ''.
news_categories = ['','world','sports','business','science_and_technology']

# Convert integer categories into the corresponding text categories.
data['Class Index'] = data['Class Index'].astype(str)
data = data.rename(columns={'Class Index': 'News Category'})

# Map whole values with an exact-match dictionary instead of repeated substring
# replacement: substring replacement would rewrite digits inside any unexpected
# value (e.g. "12" -> "worldsports") and strip every "0". Values outside the
# mapping are left unchanged.
data['News Category'] = data['News Category'].replace(
    {str(index): name for index, name in enumerate(news_categories) if name}
)

# Export Cleaned Data as CSV

In [None]:
# Final preview of the cleaned frame before it is written to disk.
data.head()

In [None]:
# Absolute path of the cleaned CSV, written to the ``data`` directory one level
# above the current working directory.
data_file_path = os.path.abspath(os.path.join(os.pardir,'data','cleaned_AG.csv'))
# Display the resolved path so the reader can see where the file will be written.
data_file_path

In [None]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
data.to_csv(data_file_path,index=False)