## Importing Libraries

In [4]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, numpy, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
import pandas as pd
import string
import nltk as nl
from nltk.corpus import stopwords 
from nltk.stem.snowball import SnowballStemmer
import re

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier

## Loading the data

In [6]:

# Load the dataset from the working directory; later cells read its `text` and `class` columns.
final_data = pd.read_csv('final_data.csv')

## Cleaning the data 

In [None]:

stemmer = SnowballStemmer("english")
stop_words = set(stopwords.words('english'))

# One deletion table for punctuation and digits, built once rather than per row.
_STRIP_PUNCT_AND_DIGITS = str.maketrans('', '', string.punctuation + string.digits)


def _clean_text(raw):
    """Normalise a single document.

    Steps, in order: lowercase, delete ASCII punctuation and digits, collapse
    any remaining non-alphanumeric runs into a single space, drop English stop
    words, and stem what is left with the Snowball stemmer.

    Parameters
    ----------
    raw : object
        Cell value from the ``text`` column; it is coerced with ``str``.

    Returns
    -------
    str
        Space-joined stemmed tokens (may be empty).
    """
    lowered = str(raw).lower()
    stripped = lowered.translate(_STRIP_PUNCT_AND_DIGITS)
    spaced = re.sub("[^a-zA-Z0-9]+", " ", stripped)
    return ' '.join(stemmer.stem(word) for word in spaced.split() if word not in stop_words)


# na_action='ignore' keeps missing texts as NaN instead of turning them into
# the literal token "nan", so the notebook's later dropna() can remove them.
final_data['text'] = final_data['text'].map(_clean_text, na_action='ignore')
final_data.text


In [None]:
# Encode the string class as a binary target (Positive -> 1, Negative -> 0),
# then discard every row that still contains a missing value.
label_codes = {'Positive': 1, 'Negative': 0}
final_data = final_data.assign(labels=final_data['class'].map(label_codes)).dropna()

In [None]:
from sklearn.model_selection import train_test_split

# Target vector (binary labels produced by the mapping cell).
y = final_data['labels']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    final_data['text'],
    y,
    test_size=0.2,
    random_state=0,
)



## Feature engineering

### Count Vector 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the whole corpus, capped at 1500 features; terms must
# appear in at least 5 documents and in no more than 70% of them.
vectorizer = CountVectorizer(
    stop_words=stopwords.words('english'),
    max_df=0.7,
    min_df=5,
    max_features=1500,
)
count_matrix = vectorizer.fit_transform(final_data['text'])
# Dense copy of the sparse count matrix.
X = count_matrix.toarray()


### TF IDF Vectors

In [None]:
# word level tf-idf
tfidf_vectors = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
# Fit on the training split only: learning the vocabulary and IDF weights from
# the full corpus would leak information from the test documents into the features.
x_train_tfidf_words = tfidf_vectors.fit_transform(X_train)
x_test_tfidf_words = tfidf_vectors.transform(X_test)

In [None]:
# ngram level tf-idf (bigrams and trigrams)
tfidf_vectors_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
# Fit on the training split only: learning the vocabulary and IDF weights from
# the full corpus would leak information from the test documents into the features.
x_train_tfidf_ngram = tfidf_vectors_ngram.fit_transform(X_train)
x_test_tfidf_ngram = tfidf_vectors_ngram.transform(X_test)

## Building the Random Forest Model

In [None]:
def random_forest_clf(x_train,y_train):
    """Cross-validate a random forest, report per-fold scores, and return a fitted model.

    Parameters
    ----------
    x_train : matrix
        Training features; must support row selection with an integer index
        array (``x_train[idx]``).
    y_train : pandas.Series
        Training labels; rows are selected positionally with ``.iloc``.

    Returns
    -------
    RandomForestClassifier
        The classifier refitted on the complete ``x_train`` / ``y_train`` after
        the 5-fold evaluation has been printed.
    """
    clf = RandomForestClassifier(n_estimators=1000, random_state=42, min_samples_split=2, criterion="gini")
    kf = KFold(n_splits=5)
    print("Evaluation on cross validated data set\n")
    for fold, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):
        cv_x_train, cv_x_test = x_train[train_index], x_train[test_index]
        cv_y_train, cv_y_test = y_train.iloc[train_index], y_train.iloc[test_index]
        clf.fit(cv_x_train, cv_y_train)
        predicted_cv = clf.predict(cv_x_test)
        print("For K=",fold)
        print("Accuracy:",accuracy_score(cv_y_test,predicted_cv))
        print(classification_report(cv_y_test,predicted_cv))
    # Without this refit the returned model would only have seen the training
    # portion of the final fold rather than the whole training set.
    clf.fit(x_train, y_train)
    return clf


## Evaluation Results of Model with TFIDF word scores

In [None]:
# Run the cross-validated random forest on the word-level TF-IDF features
# (prints per-fold accuracy and classification reports) and keep the returned model.
clf_tfidf_word = random_forest_clf(x_train_tfidf_words, y_train)

# Predict the held-out test split and display the predicted labels.
y_pred_tf_words = clf_tfidf_word.predict(x_test_tfidf_words)
y_pred_tf_words



In [None]:

# Test-set metrics for the random forest trained on word-level TF-IDF features.
for caption, metric in (
    ("Accuracy score:", accuracy_score),
    ("\nf1-score:", f1_score),
    ("\nPrecision:", precision_score),
    ("\nRecall:", recall_score),
):
    print(caption, metric(y_test, y_pred_tf_words))



## Evaluation Results of Model with TFIDF ngrams scores

In [None]:
# Run the cross-validated random forest on the n-gram TF-IDF features
# (prints per-fold accuracy and classification reports) and keep the returned model.
clf_tfidf_n_gram = random_forest_clf(x_train_tfidf_ngram, y_train)

# Predict the held-out test split and display the predicted labels.
y_pred_tf_ngram = clf_tfidf_n_gram.predict(x_test_tfidf_ngram)
y_pred_tf_ngram


In [None]:
# Test-set metrics for the random forest trained on n-gram TF-IDF features.
for caption, metric in (
    ("Accuracy score:", accuracy_score),
    ("\nf1-score:", f1_score),
    ("\nPrecision:", precision_score),
    ("\nRecall:", recall_score),
):
    print(caption, metric(y_test, y_pred_tf_ngram))



## Neural Network 

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, Conv1D, MaxPooling1D, Dropout
from keras.utils.np_utils import to_categorical

## Creating the word embeddings

### Transforming the texts into sequences and padding them

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
#import matplotlib.pyplot as plt
np.random.seed(32)


from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.manifold import TSNE


# Upper bound on the vocabulary size; also used as the Embedding input dimension later.
MAX_NB_WORDS = 400000

# get the raw text data
texts_train = X_train.astype(str)
texts_test = X_test.astype(str)
# vectorize the text samples into a 2D integer tensor
# `num_words` is the Keras 2 name of this argument (`nb_words` is the deprecated
# Keras 1 spelling); the constant replaces the duplicated literal.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, char_level=False)
# The vocabulary is learned from the training texts only.
tokenizer.fit_on_texts(texts_train)
sequences_train = tokenizer.texts_to_sequences(texts_train)
sequences_test = tokenizer.texts_to_sequences(texts_test)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))


In [None]:
# Reverse lookup table: integer token id -> word.
index_to_word = {token_id: token for token, token_id in tokenizer.word_index.items()}
# Spot-check lookup; its value is not shown because it is not the cell's last expression.
index_to_word[1383]
# Decode the second test sequence back into text.
" ".join(index_to_word[token_id] for token_id in sequences_test[1])


In [None]:
# Token count of every training sequence, used to choose a padding length.
seq_lens = list(map(len, sequences_train))
print("average length: %0.1f" % np.mean(seq_lens))
print("max length: %d" % max(seq_lens))

In [None]:
MAX_SEQUENCE_LENGTH = 500

# pad sequences with 0s so both splits share the same fixed width
x_train, x_test = (
    pad_sequences(split_sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for split_sequences in (sequences_train, sequences_test)
)
print('Shape of data tensor:', x_train.shape)
print('Shape of data test tensor:', x_test.shape)

In [None]:
# Show the dimensions of the padded training matrix.
x_train.shape

In [None]:
# One-hot encode the integer labels of both splits for the Keras models;
# y_train / y_test themselves are left untouched for the sklearn metrics.
train_y, test_y = (
    to_categorical(np.asarray(split_labels))
    for split_labels in (y_train, y_test)
)

### Loading pre trained embeddings

In [None]:

import numpy

# word -> float32 vector, read from the pre-trained embedding text file.
embeddings_index = {}
# The context manager closes the file once loading finishes; the explicit
# encoding avoids depending on the platform default when decoding the words.
with open('wiki-news-300d-1M.vec', encoding='utf-8') as vec_file:
    for line in vec_file:
        values = line.split()
        if len(values) < 3:
            # Presumably the "<vocab size> <dimension>" header (or a blank line):
            # not a word vector, so it must not be stored as one.
            continue
        embeddings_index[values[0]] = numpy.asarray(values[1:], dtype='float32')

In [None]:
# One 300-wide row per tokenizer id (plus row 0); rows stay all-zero for words
# that have no pre-trained vector.
vocab_rows = len(word_index) + 1
embedding_matrix = numpy.zeros((vocab_rows, 300))
for token, row in word_index.items():
    pretrained = embeddings_index.get(token)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained


In [None]:
# Display the assembled embedding matrix for a visual sanity check.
embedding_matrix

## CBOW shallow model


Parameters
Activation function: Softmax
Optimizer: Adam
Metric: Accuracy


In [None]:
from keras.layers import Dense, Input, Flatten
from keras.layers import GlobalAveragePooling1D,GlobalAveragePooling2D, Embedding
from keras.models import Model
from keras.layers import LSTM, Conv1D, MaxPooling1D, Dropout
# Width of the embedding learned from scratch in this model.
EMBEDDING_DIM = 50
# Not referenced in this cell: the output layer below hard-codes 2 units.
N_CLASSES = 2

# input: a sequence of MAX_SEQUENCE_LENGTH integers
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')


# Trainable embedding table with MAX_NB_WORDS rows; the pre-trained
# embedding_matrix is NOT used here (see the commented-out alternative below).
embedding_layer = Embedding(MAX_NB_WORDS, EMBEDDING_DIM,
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

#embedding_layer = Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(sequence_input)
embedded_sequences = embedding_layer(sequence_input)
#conv_layer = Convolution1D(512, 3, activation="relu")(embedding_layer)
# Pool the embedded sequence down to a single vector per document.
average = GlobalAveragePooling1D()(embedded_sequences)

# NOTE(review): softmax on a 512-unit hidden layer is unusual; confirm it is intended.
dense_layer = Dense(512,activation='softmax')(average)
drop_layer = Dropout(0.2)(dense_layer)
#dense_layer = Dense(256,activation='softmax')(drop_layer)

# Two-way softmax output, matching the one-hot targets in train_y.
predictions = Dense(2, activation='softmax')(drop_layer)

c_bow_model_1 = Model(sequence_input, predictions)
# NOTE(review): binary_crossentropy is paired with a 2-unit softmax here, while the
# CNN cell uses categorical_crossentropy — confirm which loss is intended.
c_bow_model_1.compile(loss='binary_crossentropy',
              optimizer='adam', metrics=['acc'])
c_bow_model_1.summary()

In [None]:
from keras.callbacks import ModelCheckpoint  

# Checkpoint callback passed to the next fit call.
# NOTE(review): the same file path is reused by the other ModelCheckpoint cells in
# this notebook, so each training run overwrites the previous model's checkpoint.
checkpointer = ModelCheckpoint(filepath='weights.best.xception.hdf5', 
                               verbose=1, save_best_only=True)


In [None]:
# Train the Adam-optimised CBOW model: 6 epochs, batches of 20, with
# validation_split=0.2 and the checkpoint callback attached.
c_bow_model_1.fit(x_train, train_y,
          validation_split=0.2,epochs=6, batch_size=20, callbacks=[checkpointer], verbose=1)

In [None]:
# Class probabilities for the test split; column 1 is scored against the
# integer labels in y_test to obtain ROC AUC.
output_test_cb_1 = c_bow_model_1.predict(x_test)
print("test auc:", roc_auc_score(y_test,output_test_cb_1[:,1]))

In [None]:
# Inspect the first output column of the CBOW predictions.
output_test_cb_1[:,0]

In [None]:

# Same AUC call as the earlier cell but scored with output column 0 instead of column 1.
# NOTE(review): y_test encodes Positive as 1, so this presumably reports the
# complement of the AUC above — confirm this cell is still wanted.
print("test auc:", roc_auc_score(y_test,output_test_cb_1[:,0]))

In [None]:
# Hard class prediction: index of the larger of the two output columns per row.
y_classes_cb_1 = output_test_cb_1.argmax(axis=-1) 
y_classes_cb_1

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
# F1, precision and recall of the first CBOW model's hard predictions on the test
# split; the F1 line must call f1_score so it does not repeat the precision value.
print("f1-score:", f1_score(y_test,y_classes_cb_1))
print("precision score:", precision_score(y_test,y_classes_cb_1))
print("recall score:", recall_score(y_test,y_classes_cb_1))

Parameters
Activation function: Softmax
Optimizer: SGD
Metric: Accuracy

In [None]:
# Width of the embedding learned from scratch in this model.
EMBEDDING_DIM = 50
# Not referenced in this cell: the output layer below hard-codes 2 units.
N_CLASSES = 2

# input: a sequence of MAX_SEQUENCE_LENGTH integers
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')


# Trainable embedding table with MAX_NB_WORDS rows; the pre-trained
# embedding_matrix is NOT used here (see the commented-out alternative below).
embedding_layer = Embedding(MAX_NB_WORDS, EMBEDDING_DIM,
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

#embedding_layer = Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(sequence_input)
embedded_sequences = embedding_layer(sequence_input)
#conv_layer = Convolution1D(512, 3, activation="relu")(embedding_layer)
# Pool the embedded sequence down to a single vector per document.
average = GlobalAveragePooling1D()(embedded_sequences)

# NOTE(review): softmax on a 512-unit hidden layer is unusual; confirm it is intended.
dense_layer = Dense(512,activation='softmax')(average)
drop_layer = Dropout(0.2)(dense_layer)
#dense_layer = Dense(256,activation='softmax')(drop_layer)

# Two-way softmax output, matching the one-hot targets in train_y.
predictions = Dense(2, activation='softmax')(drop_layer)

c_bow_model_2 = Model(sequence_input, predictions)
# Same layers as c_bow_model_1, but compiled with optimizer='sgd' instead of 'adam'.
c_bow_model_2.compile(loss='binary_crossentropy',
              optimizer='sgd', metrics=['acc'])
c_bow_model_2.summary()

In [None]:
from keras.callbacks import ModelCheckpoint  

# NOTE(review): `epochs` is not read by the fit call in the next cell, which passes epochs=6.
epochs = 10

# Checkpoint callback passed to the next fit call.
# NOTE(review): the same file path is reused by the other ModelCheckpoint cells in
# this notebook, so each training run overwrites the previous model's checkpoint.
checkpointer = ModelCheckpoint(filepath='weights.best.xception.hdf5', 
                               verbose=1, save_best_only=True)


In [None]:
# Train the SGD-optimised CBOW model: 6 epochs, batches of 200, with
# validation_split=0.2 and the checkpoint callback attached.
c_bow_model_2.fit(x_train, train_y,
          validation_split=0.2,epochs=6, batch_size=200, callbacks=[checkpointer], verbose=1)

In [None]:
# Class probabilities for the test split; column 1 is scored against the
# integer labels in y_test to obtain ROC AUC.
output_test_cb_2 = c_bow_model_2.predict(x_test)
print("test auc:", roc_auc_score(y_test,output_test_cb_2[:,1]))

In [None]:
# Hard class prediction: index of the larger of the two output columns per row.
y_classes_cb_2 = output_test_cb_2.argmax(axis=-1) 
y_classes_cb_2

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
# F1, precision and recall of the second CBOW model's hard predictions on the test
# split; the F1 line must call f1_score so it does not repeat the precision value.
print("f1-score:", f1_score(y_test,y_classes_cb_2))
print("precision score:", precision_score(y_test,y_classes_cb_2))
print("recall score:", recall_score(y_test,y_classes_cb_2))

## CNN model 

In [None]:
from keras.layers import Dense, Input, Flatten
from keras.layers import GlobalAveragePooling1D,GlobalAveragePooling2D, Embedding
from keras.models import Model
from keras.layers import LSTM, Conv1D, MaxPooling1D, Dropout
# Not used by this model: the embedding width below is the 300 columns of embedding_matrix.
EMBEDDING_DIM = 50
N_CLASSES = 2

# input: a sequence of MAX_SEQUENCE_LENGTH integers
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

# Frozen embedding initialised from the pre-trained matrix (one row per tokenizer id).
embedding_layer = Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(sequence_input)
# 512 filters over windows of 3 tokens, then averaged across the sequence.
conv_layer = Conv1D(512, 3, activation="relu")(embedding_layer)
average = GlobalAveragePooling1D()(conv_layer)

dense_layer = Dense(512,activation='relu')(average)
drop_layer = Dropout(0.2)(dense_layer)
# NOTE(review): softmax on a 256-unit hidden layer is unusual; confirm it is intended.
dense_layer = Dense(256,activation='softmax')(drop_layer)
drop_layer = Dropout(0.2)(dense_layer)

# Feed the output layer from the second dropout so that layer is actually part
# of the graph; wiring it to dense_layer would leave the dropout disconnected.
predictions = Dense(N_CLASSES, activation='softmax')(drop_layer)

cnn_model = Model(sequence_input, predictions)
cnn_model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['acc'])
cnn_model.summary()


In [None]:
from keras.callbacks import ModelCheckpoint  
# Checkpoint callback passed to the next fit call.
# NOTE(review): the same file path is reused by the other ModelCheckpoint cells in
# this notebook, so each training run overwrites the previous model's checkpoint.
checkpointer = ModelCheckpoint(filepath='weights.best.xception.hdf5', 
                               verbose=1, save_best_only=True)


In [None]:
# Train the CNN: 6 epochs, batches of 200, with validation_split=0.2 and the
# checkpoint callback attached.
cnn_model.fit(x_train, train_y,
          validation_split=0.2,epochs=6, batch_size=200, callbacks=[checkpointer], verbose=1)

### Evaluation

In [None]:
# Class probabilities for the test split; column 1 is scored against the
# integer labels in y_test to obtain ROC AUC.
output_test_cnn = cnn_model.predict(x_test)
print("test auc:", roc_auc_score(y_test,output_test_cnn[:,1]))


In [None]:

# Hard class prediction: index of the larger of the two output columns per row.
y_classes_cnn = output_test_cnn.argmax(axis=-1) 
y_classes_cnn

In [None]:
from sklearn.metrics import f1_score

# F1 of the CNN's hard predictions against the true test labels.
print("f1 score:", f1_score(y_test,y_classes_cnn))

In [None]:
from sklearn.metrics import precision_score, recall_score
# Precision and recall of the CNN's hard predictions against the true test labels.
print("precision score:", precision_score(y_test,y_classes_cnn))
print("recall score:", recall_score(y_test,y_classes_cnn))

## Converting the features back to text to determine the true and predicted labels

In [None]:
import pandas as pd

# Decode every tokenised test document back into words and pair it with the
# CNN's predicted class. Names defined earlier in this notebook are used
# (y_classes_cnn, X_test) so the cell runs on a fresh kernel.
test_after = [
    [" ".join(index_to_word[j] for j in token_ids if j < 330000 and j != 0), predicted]
    for token_ids, predicted in zip(sequences_test, y_classes_cnn)
]
df = pd.DataFrame(test_after, columns=['text_after', 'pred_label'])
# Cleaned text that was fed to the tokenizer, the true label, and the row's
# index label in final_data (used by later cells to look up the original record).
df['text_prev'] = X_test.values
df['true_label'] = y_test.values
df['index'] = X_test.index
df

In [None]:
# Reload the untouched CSV so the original (uncleaned) columns can be joined back onto df.
data_new = pd.read_csv("final_data.csv")


In [None]:
# Fetch the uncleaned text for each test row, using the values stored in
# df['index'] as positions into the freshly loaded CSV.
df['original_data'] = data_new['text'].iloc[df['index'].tolist()].tolist()


In [None]:
# Fetch the source URL for each test row, using the values stored in
# df['index'] as positions into the freshly loaded CSV.
df['url'] = data_new['url'].iloc[df['index'].tolist()].tolist()

In [None]:
# Display the results frame with the original text and URL columns attached.
df

In [None]:
# Inspect the second decoded test document together with its predicted label.
test_after[1]

In [None]:
# Translate the predicted 0/1 label back to its class name (inverse of the training label mapping).
df['class'] = df['pred_label'].map({1:'Positive', 0:'Negative'})

In [None]:
# Keep only the columns needed downstream: original text, predicted class name, URL and row index.
df_search = df[['original_data','class','url','index']]


In [None]:
# Display the final lookup table.
df_search