In [3]:
import pandas as pd
import numpy as np
import nltk

from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential 
from keras.layers import Dense, Dropout, Embedding, LSTM, CuDNNLSTM, Flatten
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import pos_tag, pos_tag_sents
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn

nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('sentiwordnet')

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/renatapva/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/renatapva/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/renatapva/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/renatapva/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /home/renatapva/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


In [4]:
# import pre-processed files
twitter = pd.read_csv("twitter_pre_processed.csv");
wiki = pd.read_csv("wiki_pre_processed.csv");

# merging datasets
data = pd.concat([twitter, wiki]);

# remove stop words
stop_words = set(stopwords.words('english'))
data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]));

# stemming
stemmer = SnowballStemmer('english')
data['text'] = data['text'].apply(stemmer.stem)

# apply POS tags to DF
textList = data['text'].tolist();
tagged_texts = pos_tag_sents(map(word_tokenize, textList));
data['POS'] = tagged_texts;

In [5]:
tags = []

for pos in data.POS:
    tag_list = [x[1] for x in pos]
    tag_str = " ".join(tag_list)
    tags.append(tag_str)

In [6]:
pos_vectorizer = TfidfVectorizer(max_features=2000)
pos = pos_vectorizer.fit_transform(pd.Series(tags)).toarray()

In [7]:
vectorizer = TfidfVectorizer(max_features=2000)
texts = vectorizer.fit_transform(pd.Series(data.text)).toarray()

In [8]:
X = np.concatenate([texts,pos],axis=1)

In [9]:
# separate train and test datasets
y = data['class'];
X_train_final, X_test_final, y_train, y_test = train_test_split(X, y, test_size=0.2)

# creating dictionary
X_train_final = np.array(X_train_final)
X_test_final = np.array(X_test_final)

X_train_final = np.reshape(X_train_final, (X_train_final.shape[0], 1, X_train_final.shape[1]))
X_test_final = np.reshape(X_test_final, (X_test_final.shape[0], 1, X_test_final.shape[1]))

In [10]:
# define network architecture and compile 
model = Sequential()
model.add(CuDNNLSTM(100, input_shape=(1,2034))) 
model.add(Dense(250, activation='relu')) 
model.add(Dropout(0.2)) 
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 

In [11]:
# train model
history = model.fit(X_train_final, y_train, epochs=5, verbose=2, validation_split=0.1)

Train on 132817 samples, validate on 14758 samples
Epoch 1/5
 - 25s - loss: 0.1893 - acc: 0.9344 - val_loss: 0.1693 - val_acc: 0.9404
Epoch 2/5
 - 19s - loss: 0.1650 - acc: 0.9421 - val_loss: 0.1670 - val_acc: 0.9420
Epoch 3/5
 - 19s - loss: 0.1558 - acc: 0.9450 - val_loss: 0.1653 - val_acc: 0.9431
Epoch 4/5
 - 19s - loss: 0.1484 - acc: 0.9470 - val_loss: 0.1643 - val_acc: 0.9434
Epoch 5/5
 - 19s - loss: 0.1418 - acc: 0.9489 - val_loss: 0.1710 - val_acc: 0.9413


In [None]:
# check score
score = model.evaluate(X_test_final, y_test)
print(score)

In [None]:
import matplotlib.pyplot as plt
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model train vs validation loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper right')
plt.show()

In [None]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_final)
print(classification_report(y_test, y_pred.round()))