<a href="https://colab.research.google.com/github/rid181198/NLP-project-for-tweets/blob/main/Neural_network_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing the necessary libraries

In [125]:
#basic libraries
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import math
import pandas as pd
import re
import os


In [126]:
#text processing libraries
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer


#stopwords
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from string import punctuation
import gensim
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Loading the training dataset

Here the dataset of all tweets is loaded and blank fields are filled with the empty string. Also, two other columns *tokenized_text* and *tokenized_key* are created to store the tokenized words for both text sentence of the tweet and the keywords, respectively.

In [127]:
data = pd.read_csv('./train.csv')
data.fillna('', inplace=True)

data['tokenized_text'] = " "
data['tokenized_key'] = " "

## Creating the STOPWORDS

*stop_words*, *punctuation*, *gensimwords*, *sklearnwords*, *num_pattern* are regarded as general words like she, he, I will be ignored. Also, *punctuation* referred for the punctuations like ",?!#@, etc. *gensimwords* and *sklearnwords* are the extended version of words like nonetheless, although, otherwise, etc. *num_pattern* finds the numbers and replaces with the empty string.

In [128]:
stop_words = set(stopwords.words('english'))
punctuation = list(punctuation)
gensimwords = STOPWORDS
sklearnwords = ENGLISH_STOP_WORDS
num_pattern = r'[0-9]'

# Function to tokenize/split the words and ignore the STOPWORDS

Here *PorterStemmer* is an object to get the root or basis of words. For instance, playing will be considered as play, cats as cat. Using the functional library *word_tokenize*, the sentence is converted into the tokens or split words and afterwards, the STOPWORDS and stemming are done.

In [129]:

porter = PorterStemmer()
def tokenized_stop(string):
    string = re.sub(num_pattern, '', string)
    string = re.sub(r'http\S+', '', string)
    
    #tokenizing the words
    string = word_tokenize(string)
    
    #ignoring the unnecessary words
    string_list = []
    for words in string:
        words = words.casefold()
        if (words in stop_words) or (words in punctuation) or (words in gensimwords) or (words in sklearnwords):
            pass
        else:
            words = porter.stem(words)
            string_list.append(words)
             
    return string_list



## Storing the tokenized sentence and keywords to newly created columns

In [130]:
for i in range(len(data['text'])):
    data.at[i,'tokenized_text'] = tokenized_stop(data['text'][i])
    data.at[i,'tokenized_key'] = tokenized_stop(data['keyword'][i])

## Combining all the sentences in a list for the categorical features assignment

In [131]:
all_sents=[]
for i in range(len(data['tokenized_text'])):
    string=''
    for j in data['tokenized_text'][i]:
        string = string  + j + ' '
    all_sents.append(string)


## Implementing the preprocessing tokenizer and sequencing to the sentences using *keras*

In [132]:

from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_sents)
sequenced_sents = tokenizer.texts_to_sequences(all_sents)

In [133]:
length_sent=[]
for i in sequenced_sents:
  length_sent.append(len(i))

maxlen=max(length_sent)
print(maxlen)

total_vocab =[]
for i in sequenced_sents:
  for j in i:
    total_vocab.append(j)

vocab_size = len(set(total_vocab))
print(vocab_size)

22
13958


## Padding the sentence to make every sentence with equal length of words

In [134]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_sequence = pad_sequences(sequenced_sents, maxlen=23)
y_train = data['target']

## Neural network model using the embedding and dense layers

In [113]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.layers import Embedding

model = Sequential()
model.add(Embedding(vocab_size+1, 256, input_length=23))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))


In [135]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Flatten
from tensorflow.keras.layers import Embedding

model2 = Sequential()
model2.add(Embedding(vocab_size+1, 256, input_length=23))
model2.add(Conv1D(256, 3, activation='relu'))
model2.add(MaxPooling1D(5))
model2.add(Dense(128, activation='relu'))
model2.add(Dense(32, activation='relu'))
model2.add(GlobalMaxPooling1D())
model2.add(Dense(1, activation='sigmoid'))


In [136]:
model2.compile(optimizer = 'adam', loss='binary_crossentropy', metrics=['accuracy'])
#model2.compile(optimizer = 'adam', loss='binary_crossentropy', metrics=['accuracy'])

In [137]:
model_vals = model2.fit(padded_sequence,y_train,epochs=10, batch_size=16, validation_split=0.3)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [138]:
test_data = pd.read_csv('./test.csv')
test_data.fillna('', inplace=True)

test_data['tokenized_text'] = " "
test_data['tokenized_key'] = " "


In [139]:
for i in range(len(test_data['text'])):
    test_data.at[i,'tokenized_text'] = tokenized_stop(test_data['text'][i])
    test_data.at[i,'tokenized_key'] = tokenized_stop(test_data['keyword'][i])

In [140]:
all_sents=[]
for i in range(len(test_data['tokenized_text'])):
    string=''
    for j in test_data['tokenized_text'][i]:
        string = string  + j + ' '
    all_sents.append(string)

In [141]:

from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_sents)
sequenced_sents = tokenizer.texts_to_sequences(all_sents)

In [142]:
length_sent=[]
for i in sequenced_sents:
  length_sent.append(len(i))

maxlen=max(length_sent)
print(maxlen)

total_vocab =[]
for i in sequenced_sents:
  for j in i:
    total_vocab.append(j)

vocab_size = len(set(total_vocab))
print(vocab_size)

23
8377


In [143]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
test_padded_sequence = pad_sequences(sequenced_sents, maxlen=maxlen)


In [144]:
results = model.predict(test_padded_sequence)



In [145]:
predictions=[]
for i in results:
  if i<0.5:
    predictions.append(0)
  if i>=0.5:
    predictions.append(1)

In [146]:
ids = test_data['id']

results2 = {'id':ids,'target':predictions}
results2 = pd.DataFrame(results2)
results2.to_csv('./results2.csv')
