In [86]:
%tensorflow_version 2.x

In [87]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

In [88]:
tf.__version__

'2.4.1'

In [89]:
from google.colab import files
uploaded = files.upload()

Saving test.csv to test (2).csv
Saving train.csv to train (2).csv


In [90]:
import io
train = pd.read_csv(io.BytesIO(uploaded['train.csv']))

In [91]:
import io
test = pd.read_csv(io.BytesIO(uploaded['test.csv']))

In [92]:
train.columns = ['classid','title','description']
test.columns = ['classid','title','description']

In [93]:
train.head()

Unnamed: 0,classid,title,description
0,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
1,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
2,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
3,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."
4,3,"Stocks End Up, But Near Year Lows (Reuters)",Reuters - Stocks ended slightly higher on Frid...


In [94]:
train['summary'] = train['title'] + ' ' + train['description']
test['summary'] = test['title'] + ' ' + test['description']

In [95]:
train = train.drop(columns=['title', 'description'])
test = test.drop(columns=['title', 'description'])

In [96]:
labels = {1:'World News', 2:'Sports News', 3:'Business News', 4:'Science-Technology News'}

In [97]:
train['label'] = train['classid'].map(labels)
test['label'] = test['classid'].map(labels)

In [98]:
train = train.drop(columns=['classid'])
test = test.drop(columns=['classid'])
train.head()

Unnamed: 0,summary,label
0,Carlyle Looks Toward Commercial Aerospace (Reu...,Business News
1,Oil and Economy Cloud Stocks' Outlook (Reuters...,Business News
2,Iraq Halts Oil Exports from Main Southern Pipe...,Business News
3,"Oil prices soar to all-time record, posing new...",Business News
4,"Stocks End Up, But Near Year Lows (Reuters) Re...",Business News


In [99]:
import re
import string as s
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [100]:
# split the data into train and test data

X_train, X_test, y_train, y_test = train_test_split(train['summary'], train['label'],
                                                                test_size=0.2, random_state=1)

In [101]:
def remove_punc(text):
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [102]:

train['summary'] = train['summary'].apply(lambda x: remove_punc(x))
test['summary'] = test['summary'].apply(lambda x: remove_punc(x))

In [104]:
def data_cleaner(text):        
    lower_case = text.lower()
    tokens=word_tokenize(lower_case)
    return (" ".join(tokens)).strip()

def remove_stopwords (text):        
    list1=[word for word in text.split() if word not in stopwords.words('english')]
    return " ".join(list1)

train['summary'] = train['summary'].apply(lambda x: data_cleaner(x))
test['summary'] = test['summary'].apply(lambda x: data_cleaner(x))

train['summary'] = train['summary'].apply(lambda x: remove_stopwords(x))
test['summary'] = test['summary'].apply(lambda x: remove_stopwords(x))

In [105]:

X_train, X_test, y_train, y_test = train_test_split(train['summary'], train['label'],
                                                                test_size=0.2, random_state=1)

In [106]:
tokenized_data = []
for i in train['summary']:
    tokenized_data.append(i.split())
print(tokenized_data[:2])

[['carlyle', 'looks', 'toward', 'commercial', 'aerospace', 'reuters', 'reuters', 'private', 'investment', 'firm', 'carlyle', 'groupwhich', 'reputation', 'making', 'welltimed', 'occasionallycontroversial', 'plays', 'defense', 'industry', 'quietly', 'placedits', 'bets', 'another', 'part', 'market'], ['oil', 'economy', 'cloud', 'stocks', 'outlook', 'reuters', 'reuters', 'soaring', 'crude', 'prices', 'plus', 'worriesabout', 'economy', 'outlook', 'earnings', 'expected', 'tohang', 'stock', 'market', 'next', 'week', 'depth', 'thesummer', 'doldrums']]


In [107]:
from gensim.models import Word2Vec
from tensorflow.keras.models import Sequential
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn import preprocessing
from keras.preprocessing import sequence


In [108]:
w2v_model = Word2Vec(tokenized_data, size=50, workers=32, min_count=1, window=3)
print(w2v_model)

Word2Vec(vocab=89738, size=50, alpha=0.025)


In [109]:
token = Tokenizer(89738)
token.fit_on_texts(train['summary'])
token_text = token.texts_to_sequences(train['summary'])
token_text = pad_sequences(token_text)

In [110]:
X = preprocessing.LabelEncoder()
y = X.fit_transform(train['label'])
y = to_categorical(y)
print(y[:5])

[[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]


In [111]:
# spilt the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(np.array(token_text), y, test_size=0.2)

In [112]:
from keras import preprocessing
import keras
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Conv1D, MaxPool1D, Dropout, Dense, GlobalMaxPool1D, Embedding, Activation
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import confusion_matrix, precision_score, recall_score

In [113]:
# building the model

model = Sequential()
model.add(w2v_model.wv.get_keras_embedding(True))
model.add(Dropout(0.2))
model.add(Conv1D(50, 3, activation='relu', padding='same', strides=1))
model.add(MaxPool1D())
model.add(Dropout(0.2))
model.add(Conv1D(100, 3, activation='relu', padding='same', strides=1))
model.add(MaxPool1D())
model.add(Dropout(0.2))
model.add(Conv1D(200, 3, activation='relu', padding='same', strides=1))
model.add(GlobalMaxPool1D())
model.add(Dropout(0.2))
model.add(Dense(200))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(4))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer='adam')
model.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 50)          4486900   
_________________________________________________________________
dropout_19 (Dropout)         (None, None, 50)          0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, None, 50)          7550      
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, None, 50)          0         
_________________________________________________________________
dropout_20 (Dropout)         (None, None, 50)          0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, None, 100)         15100     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, None, 100)       

In [114]:
# train the model
model.fit(X_train, y_train, batch_size=256, epochs=10, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fe34e073cf8>

In [115]:
labels = X.classes_
print(labels)

['Business News' 'Science-Technology News' 'Sports News' 'World News']


In [116]:
# check prediction

predicted = model.predict(X_test)

In [117]:
for i in range(10,50,3):
    print(train['summary'].iloc[i][:50], "...")
    print("Actual category: ", labels[np.argmax(y_test[i])])
    print("predicted category: ", labels[np.argmax(predicted[i])])

need opec pump moreiran gov tehran reuters opec no ...
Actual category:  Science-Technology News
predicted category:  Science-Technology News
dollar falls broadly record trade gap new york reu ...
Actual category:  Sports News
predicted category:  Sports News
market head toward value funds little cause celebr ...
Actual category:  Business News
predicted category:  Business News
google ipo faces playboy slipup bidding gets under ...
Actual category:  Science-Technology News
predicted category:  Science-Technology News
rand falls shock sa rate cut interest rates trimme ...
Actual category:  Science-Technology News
predicted category:  Science-Technology News
google auction begins friday auction shares google ...
Actual category:  Business News
predicted category:  Business News
chad seeks refugee aid imf chad asks imf loan pay  ...
Actual category:  Science-Technology News
predicted category:  Science-Technology News
saudi arabia open oil taps saudi arabia says ready ...
Actual category