<a href="https://colab.research.google.com/github/shruthireddyrekula/Image_classification_tf/blob/main/Text_Classification_TF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Run the following code to ensure Colab uses only TensorFlow 2.x:

In [None]:
# `%tensorflow_version` is an IPython line magic (not plain Python) used to
# request TensorFlow 2.x. Any exception it raises is deliberately swallowed so
# the rest of the notebook can still run.
# NOTE(review): presumably the magic is only registered on Colab — confirm.
try: 
  %tensorflow_version 2.x 
except Exception: 
  pass

Import the necessary libraries, including TensorFlow and Keras.

In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd 
import re 
from matplotlib import pyplot as plt
import nltk
nltk.download('punkt')

from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score

In [None]:
# Mount Google Drive at /content/drive; the CSV files read later in this
# notebook live under "/content/drive/My Drive/".
from google.colab import drive
drive.mount('/content/drive')

Importing the train tweets

In [5]:
# Load the training CSV from the mounted Drive into a DataFrame.
tweets = pd.read_csv('/content/drive/My Drive/train_tweets.csv')


Separate the tweet text and the labels using the following code snippet:

In [8]:
# Columns are picked by position, then converted to NumPy arrays:
# third column -> X (model input), second column -> y (target).
# NOTE(review): presumably column 2 is the tweet text and column 1 the label —
# confirm against the CSV header.
X = tweets.iloc[:, 2].values
y = tweets.iloc[:, 1].values


Text Cleaning and Preprocessing

In [9]:

# (pattern, replacement) pairs applied in order to the lower-cased tweet.
# The slang abbreviations are anchored with \b so that only whole words are
# expanded; without the anchors "hurt" became "hyourt" and "your" became
# "yoyour".
_EXPANSIONS = (
    (r"can't", "can not"),
    (r"\bhv\b", "have"),
    (r"\bur\b", "your"),
    (r"ain't", "is not"),
    (r"don't", "do not"),
    (r"couldn't", "could not"),
    (r"shouldn't", "should not"),
    (r"won't", "will not"),
    (r"there's", "there is"),
    (r"it's", "it is"),
    (r"that's", "that is"),
    (r"where's", "where is"),
    (r"who's", "who is"),
)


def clean_corpus(text):
    """Normalise raw tweets into lower-case, whitespace-separated words.

    For every tweet the function removes ``https://t.co/...`` short links,
    lower-cases the text, expands a fixed list of contractions/abbreviations,
    replaces non-word characters, digits and a set of mis-encoded characters
    with spaces, drops isolated single letters and finally collapses and
    trims whitespace.

    Parameters
    ----------
    text : iterable
        Any iterable of tweets (list, NumPy array, pandas Series, ...).
        Items are converted with ``str()``, so non-string values are
        accepted. The items are iterated directly rather than indexed, so a
        pandas Series with a non-default index works as well.

    Returns
    -------
    list of str
        One cleaned string per input item, in input order.
    """
    corpus = []
    for raw in text:
        tweet = str(raw)

        # Strip t.co short links at the start, in the middle and at the end.
        # This runs before lower-casing, so only lower-case "https" matches.
        tweet = re.sub(r"^https://t\.co/[a-zA-Z0-9]*\s", " ", tweet)
        tweet = re.sub(r"\s+https://t\.co/[a-zA-Z0-9]*\s", " ", tweet)
        tweet = re.sub(r"\s+https://t\.co/[a-zA-Z0-9]*$", " ", tweet)

        tweet = tweet.lower()
        for pattern, replacement in _EXPANSIONS:
            tweet = re.sub(pattern, replacement, tweet)

        # Punctuation, digits and leftover mis-encoded characters become spaces.
        tweet = re.sub(r"\W", " ", tweet)
        tweet = re.sub(r"\d", " ", tweet)
        tweet = re.sub(r"[ðâï¼½³ªãºæååçæåä¹µó¾_ëìêè]", " ", tweet)

        # Drop isolated single letters. Matches cannot overlap, so the second
        # pass catches a letter whose neighbouring space was consumed by the
        # first pass (e.g. the "b" in " a b ").
        tweet = re.sub(r"\s[a-z]\s", " ", tweet)
        tweet = re.sub(r"\s+[a-z]\s+", " ", tweet)
        tweet = re.sub(r"^[a-z]\s", " ", tweet)
        tweet = re.sub(r"^[a-z]\s+", " ", tweet)

        # Collapse runs of whitespace and trim both ends.
        tweet = re.sub(r"\s+", " ", tweet)
        tweet = re.sub(r"^\s", "", tweet)
        tweet = re.sub(r"\s$", "", tweet)
        corpus.append(tweet)

    #return the corpus
    return corpus

        

Tokenizing the text to feed to the model

In [None]:
# Vocabulary census: count how often each distinct word occurs in the cleaned corpus
corpus = clean_corpus(X)
word_dict = {}
for document in corpus:
    for token in nltk.word_tokenize(document):
        word_dict[token] = word_dict.get(token, 0) + 1

print(len(word_dict))


# Fit the Keras tokenizer on the cleaned tweets and map each tweet to a
# sequence of integer word ids
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(corpus)
corpus_tokens = tokenizer.texts_to_sequences(corpus)

Padding text sequences

In [None]:
#finding the average words present per comment
# Sanity check: first cleaned tweet and the first two token-id sequences
print(corpus[0])
print(corpus_tokens[0:2])

# Number of tokens in every tweet. Only a short preview is printed, because
# the full list holds one entry per tweet and would flood the cell output.
num_of_words_in_doc = [len(doc) for doc in corpus_tokens]
print("First 20 sequence lengths: ", num_of_words_in_doc[:20])
print("Average number of words: ", np.average(num_of_words_in_doc))



# Padding the sequences
# Every sequence is brought to exactly 25 ids: shorter ones get zeros appended
# (padding='post'), longer ones are truncated. The same length must be used
# for the Embedding layer and for the test data.
corpus_pad = keras.preprocessing.sequence.pad_sequences(corpus_tokens,maxlen=25,padding='post')



In [None]:
# Creating Validation Set
# Hold out 20% of the padded tweets; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    corpus_pad,
    y,
    test_size=0.2,
    random_state=101,
)


In [None]:
# Building & Compiling the model

# +1 because the tokenizer's word ids start at 1; id 0 is the padding value.
vocab_size = len(tokenizer.word_index) + 1
# Must equal the maxlen used when the sequences were padded.
max_length = 25

# Embedding -> LSTM -> single sigmoid unit (binary classifier).
model = keras.Sequential()
model.add(keras.layers.Embedding(input_dim=vocab_size,output_dim=50,input_length=max_length))
# NOTE(review): a non-zero recurrent_dropout prevents the fast cuDNN LSTM
# kernel from being used, so training on GPU will be slower — confirm this
# trade-off is intended.
model.add(keras.layers.LSTM(units=50,dropout=0.2,recurrent_dropout=0.2))
model.add(keras.layers.Dense(units=1, activation='sigmoid'))

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would emit an extra "None" line).
model.summary()

In [None]:

# Train the model
# The held-out split is passed as validation_data so that loss and accuracy on
# unseen tweets are reported after every epoch; validation data is only
# evaluated, never trained on. The returned History object is kept so the
# per-epoch metrics can be inspected later.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=10,
    epochs=2,
    verbose=2,
)

In [None]:
from keras.utils import pad_sequences
#Loading the test data
test_tweets = pd.read_csv("/content/drive/My Drive/test_tweets_with_hate1.csv")
print(test_tweets.shape)

#cleaning the text
test_data = test_tweets['tweet']
# Pass the underlying NumPy array, exactly as the training text was passed:
# clean_corpus accesses items by integer position, which a pandas Series
# resolves by index label instead.
clean_test_data  = clean_corpus(test_data.values)

#text to sequence and padding
# The tokenizer fitted on the training corpus is reused, and the sequences are
# padded to the same length the model was built with (max_length).
clean_test_data_token = tokenizer.texts_to_sequences(clean_test_data)
clean_test_data_pad = pad_sequences(clean_test_data_token,maxlen=max_length,padding='post')

In [None]:
# preparing the submission file
# The model ends in a single sigmoid unit, so predict() returns one
# probability per tweet with shape (n, 1). Threshold at 0.5 to get 0/1 labels
# and flatten to 1-D so the result is a plain column of values.
probabilities = model.predict(clean_test_data_pad)
final_prediction = (probabilities > 0.5).astype("int32").ravel()

test_tweets['label'] = final_prediction
# Only the id and the predicted label go into the submission file.
test_predictions = test_tweets[['id','label']]
test_predictions.to_csv('LSTM3.csv',index=False)