In [None]:
import tensorflow as tf
# Switch TensorFlow to eager execution through the TF1 compatibility API.
# NOTE(review): TF 2.x presumably already runs eagerly by default — confirm
# whether this call is still needed for the TensorFlow version in use.
tf.compat.v1.enable_eager_execution()

In [None]:
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# NLTK resources used further down: the stop-word list, plus the Punkt
# sentence-tokenizer data that word_tokenize() needs. Without the Punkt data a
# fresh environment raises LookupError on the first word_tokenize() call.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases look for the tokenizer data under the name 'punkt_tab';
# on older releases this download just reports that the package is unknown.
nltk.download('punkt_tab')

In [None]:
# Read the two training CSV files from the working directory.
train_bodies, train_stances = (
    pd.read_csv(csv_name)
    for csv_name in ('train_bodies.csv', 'train_stances.csv')
)

In [None]:
# Left-join the stance rows onto the bodies using the shared 'Body ID' key.
train_data = train_bodies.merge(train_stances, how='left', on='Body ID')

In [None]:
# Read the two test CSV files from the working directory.
test_stance, test_body = (
    pd.read_csv(csv_name)
    for csv_name in ('competition_test_stances.csv', 'test_bodies.csv')
)

In [None]:
# Left-join the test stance rows onto the test bodies using 'Body ID'.
test_data = test_body.merge(test_stance, how='left', on='Body ID')

In [None]:
# Integer class id assigned to each stance label.
target = dict(unrelated=0, agree=1, discuss=2, disagree=3)

# Replace the label strings with their class ids. Values that are not keys of
# `target` become NaN, so re-running this cell on the already-encoded column
# wipes it out.
train_data['Stance'] = train_data['Stance'].map(target)

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
import string
# English stop words held in a set for fast membership checks in preprocessing().
stop_words = {*stopwords.words('english')}
# Shared Porter stemmer instance used by preprocessing().
stemmer = PorterStemmer()
def preprocessing(text):
    """Normalise a piece of text for the downstream tokenizer.

    Steps, in order: lowercase, strip leading/trailing quote and punctuation
    characters, drop digits, drop ASCII punctuation, collapse whitespace,
    remove English stop words, and Porter-stem the remaining tokens.

    Parameters
    ----------
    text : str
        Raw headline or article body.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    # Lowercase first: the NLTK stop-word list is all lowercase, so filtering
    # before lowercasing would let capitalised stop words ("The", "A") through.
    text = text.lower()

    # Each argument is a set of characters to strip (the '|' is just another
    # character in that set, not an alternation).
    text = text.rstrip(',|.|;|:|\'|\"')
    text = text.lstrip('\'|\"')
    # remove numbers
    text = re.sub(r'\d+', '', text)
    # punctuation removal
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    text = " ".join(text.split())
    # stop words removal
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    text = " ".join(filtered_text)
    # stemming
    word_tokens = word_tokenize(text)
    stems = [stemmer.stem(word) for word in word_tokens]

    return " ".join(stems)

In [None]:
# Add cleaned copies of the headline and body columns to both frames.
# Note the test frame's headline column is spelled 'new_Headline' (capital H).
train_data['new_headline'] = train_data['Headline'].apply(preprocessing)
train_data['new_articlebody'] = train_data['articleBody'].apply(preprocessing)
test_data['new_Headline'] = test_data['Headline'].apply(preprocessing)
test_data['new_articlebody'] = test_data['articleBody'].apply(preprocessing)

In [None]:
# Encode the test stance labels with the same `target` mapping as the training
# labels. Values that are not keys of `target` become NaN, so this cell must
# not be re-run on the already-encoded column.
test_data['Stance']=test_data['Stance'].map(target)

In [None]:
# Training set, cleaned text columns and labels
xtrain_prepdata = train_data.loc[:, ['new_headline', 'new_articlebody']]
ytrain_prepdata = train_data['Stance']

# Training set, raw text columns and labels
xtrain_unprepdata = train_data.loc[:, ['Headline', 'articleBody']]
ytrain_unprepdata = train_data['Stance']

# Test set, cleaned text columns and labels
xtest_prepdata = test_data.loc[:, ['new_Headline', 'new_articlebody']]
ytest_prepdata = test_data['Stance']

# Test set, raw text columns and labels
xtest_unprepdata = test_data.loc[:, ['Headline', 'articleBody']]
ytest_unprepdata = test_data['Stance']

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.tokenize import word_tokenize
# Concatenate headline and body into one text per row. The explicit ' '
# separator matters: without it the last headline token and the first body
# token are fused into a single bogus word.
X_train_prep_combined = (
    train_data['new_headline'] + ' ' + train_data['new_articlebody']
).to_frame('combined')

# Fit the Keras tokenizer on the training text only; num_words=5000 limits the
# ids emitted by texts_to_sequences() to the most frequent words.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train_prep_combined['combined'])

# word -> integer id mapping for the full vocabulary seen during fitting.
words_to_index = tokenizer.word_index

In [None]:
# Token lists for every training headline, filled by tokens() below.
tokens1 = []


def tokens(text):
    """Tokenize `text` with word_tokenize and record the result in `tokens1`.

    Parameters
    ----------
    text : str
        Text to split into word tokens.

    Returns
    -------
    list of str
        The tokens that were appended to `tokens1`.
    """
    # Use a distinct local name so the function's own name is not shadowed.
    words = word_tokenize(text)
    tokens1.append(words)
    return words


# The trailing ';' stops the notebook from dumping the resulting Series.
train_data['new_headline'].apply(tokens);

## Using Word2Vec 

In [None]:
from gensim.models import Word2Vec

# Token lists collected from the cleaned training headlines
sentences = tokens1

# Fit Word2Vec on those token lists; min_count=1 keeps every token, including
# those that occur only once.
model = Word2Vec(sentences=sentences, min_count=1)

# Raw vector table of the trained model.
# NOTE(review): the row order presumably follows gensim's own vocabulary
# indexing, not the Keras tokenizer's word_index — confirm before feeding
# tokenizer ids into an Embedding layer initialised from this matrix.
embedding_matrix = model.wv.vectors

## Using GloVe Embeddings

In [None]:
def read_glove_vector(glove_vec):
  """Load whitespace-separated word vectors from a text file.

  Each non-blank line is expected to hold a word followed by its vector
  components.

  Parameters
  ----------
  glove_vec : str
      Path to the vector file (read as UTF-8).

  Returns
  -------
  dict
      Maps each word to a 1-D float64 numpy array of its components.
  """
  word_to_vec_map = {}
  with open(glove_vec, 'r', encoding='UTF-8') as f:
    for line in f:
      w_line = line.split()
      # Skip blank lines (e.g. a trailing newline) instead of failing on
      # w_line[0].
      if not w_line:
        continue
      curr_word = w_line[0]
      word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)
  return word_to_vec_map

In [None]:
# Load the word -> vector dictionary from the GloVe text file in the working
# directory (the file name indicates the 6B-token, 100-dimensional release).
word_to_vec_map = read_glove_vector('glove.6B.100d.txt')

In [None]:
# The Keras Tokenizer numbers words starting at 1 (id 0 is never assigned to a
# word), so the largest id equals len(words_to_index). The matrix therefore
# needs one extra row, otherwise the last word indexes out of bounds.
vocab_len = len(words_to_index) + 1
# Vector dimensionality, read off an arbitrary word present in the GloVe file.
embed_vector_len = word_to_vec_map['moon'].shape[0]

# Row i holds the GloVe vector of the word with tokenizer id i; rows for words
# missing from GloVe (and row 0) stay all-zero.
emb_matrix = np.zeros((vocab_len, embed_vector_len))

for word, index in words_to_index.items():
  embedding_vector = word_to_vec_map.get(word)
  if embedding_vector is not None:
    emb_matrix[index, :] = embedding_vector


In [None]:
# Concatenate the cleaned test headline and body into one text per row. The
# explicit ' ' separator keeps the last headline token and the first body
# token from being fused into a single word.
X_test_prep_combined = (
    test_data['new_Headline'] + ' ' + test_data['new_articlebody']
).to_frame('combined')

In [None]:
# pad_sequences was used here without ever being imported, which raises
# NameError on a fresh kernel.
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Convert each combined text to a sequence of tokenizer ids, then pad/truncate
# every sequence to exactly 5000 ids (zeros appended at the end).
train_head = tokenizer.texts_to_sequences(X_train_prep_combined['combined'])
train_head = pad_sequences(train_head, maxlen=5000, padding='post')
test_head = tokenizer.texts_to_sequences(X_test_prep_combined['combined'])
test_head = pad_sequences(test_head, maxlen=5000, padding='post')

## Using PCA

In [None]:
from sklearn.decomposition import PCA
# Fit a 2240-component PCA on the padded training id sequences.
# `temp` receives the projected training data; the fitted `pca` object is what
# the following cells read from.
pca = PCA(n_components=2240)
temp = pca.fit_transform(train_head)

In [None]:
# Component loadings of the fitted PCA (one row per component).
arr = pca.components_
# Score each original column by its largest absolute loading across components.
importance = np.abs(arr).max(axis=0)

In [None]:
# Keep the 30 sequence positions with the highest importance score
# (argsort ascending, reversed, first 30).
train_x = train_head[:, np.argsort(importance)[::-1][:30]]

In [None]:
# Select the same 30 highest-importance sequence positions from the test data.
test_x = test_head[:, np.argsort(importance)[::-1][:30]]

## CNN-LSTM Model

In [None]:
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, Embedding, Input, Lambda, LSTM, Bidirectional
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import MaxPooling1D
from keras.layers import concatenate

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import *
import tensorflow as tf


# Build the CNN-LSTM classifier on the PCA-selected token-id columns.
#
# The sequence length is read from train_x, the array that is actually passed
# to fit(), so the input layer always matches the data. The previous hard-coded
# 2240 did not match the 30 selected columns, and `max(count)` referenced an
# undefined name.
seq_len = train_x.shape[1]

head_input = Input(shape=(seq_len,), dtype='int32', name='head_input')

# Initialise the frozen embedding from emb_matrix, whose rows are indexed by
# the Keras tokenizer ids that make up train_x / test_x. The Word2Vec
# `embedding_matrix` is trained on headlines only and is not indexed by
# tokenizer ids, so looking tokenizer ids up in it is incorrect.
embedding_layer = Embedding(
    emb_matrix.shape[0],
    emb_matrix.shape[1],
    weights=[emb_matrix],
    input_length=seq_len,
    trainable=False,
)
head_embed = embedding_layer(head_input)
head_embed = Dropout(0.2)(head_embed)

# Convolution -> dropout -> max-pooling over the embedded sequence.
pool_1 = MaxPooling1D(pool_size=3, strides=2, name='pool3')
conv_1 = Conv1D(64, 5, activation='relu', name='conv1')
head_CNN = conv_1(head_embed)
head_CNN = Dropout(0.2)(head_CNN)
head_CNN = pool_1(head_CNN)

# Summarise the pooled feature sequence with an LSTM.
lstm = LSTM(100)(head_CNN)

# One softmax output per stance class (4 classes in `target`).
dense = Dense(4, activation='softmax')(lstm)

model = Model(inputs=head_input, outputs=[dense])
# Sparse loss because the labels are integer class ids, not one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])

model.summary()

In [None]:
# Train for 4 epochs, validating on the test split after each one.
# Note: despite its name, `pca_model` is the History object returned by fit(),
# not the model itself.
pca_model = model.fit(
    x=train_x,
    y=ytrain_prepdata,
    validation_data=(test_x, ytest_prepdata),
    epochs=4,
    batch_size=32,
    verbose=True,
)

In [None]:
# Predict with the trained Keras model. `pca_model` is the History returned by
# fit() and has no predict(); the model also takes a single input, which must
# be the same PCA-selected columns it was trained on (test_x), not
# [test_head, test_body].
y_pred = model.predict(test_x)

In [None]:
# Pick, for each row of y_pred, the index of the largest value along the last
# axis — i.e. the predicted class id.
labels = np.argmax(y_pred, axis=-1)    
print(labels)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the true test labels against the predicted class ids.
cm = confusion_matrix(y_true=ytest_prepdata, y_pred=labels)
cm

In [None]:
import seaborn as sns

# fmt='d' prints the integer counts in full; the default annotation format
# abbreviates large counts to scientific notation.
ax = sns.heatmap(cm, annot=True, fmt='d')
# confusion_matrix puts true labels on rows and predictions on columns.
ax.set_xlabel('predicted stance')
ax.set_ylabel('true stance');

In [None]:
# Accuracy per epoch, validation curve first, read from the fit() history.
for history_key, curve_label in (('val_acc', 'val_accuracy'), ('acc', 'accuracy')):
    plt.plot(pca_model.history[history_key], label=curve_label)
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.legend()
plt.show()

In [None]:
# This cell was a byte-for-byte duplicate of the accuracy plot in the previous
# cell. Plot the loss curves recorded in the same fit() history instead, so the
# two figures complement each other.
plt.plot(pca_model.history['val_loss'], label='val_loss')
plt.plot(pca_model.history['loss'], label='loss')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend()
plt.show()