In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from sklearn.model_selection import train_test_split
from mlxtend.plotting import plot_confusion_matrix
import matplotlib.cm as cm
from matplotlib import rcParams
from collections import Counter
from nltk.tokenize import RegexpTokenizer
import re
import string
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")




In [2]:


# Read the CSV file, providing the correct file path
file_path = "training.1600000.processed.noemoticon.csv"
df = pd.read_csv(file_path, encoding="ISO-8859-15", names=['Target', 'ID', 'Date', 'Flag', 'User', 'Text'])

# Display the first few rows of the dataframe
df.head()

Unnamed: 0,Target,ID,Date,Flag,User,Text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [3]:
data = df[["Text","Target"]]

In [4]:
data["Target"][data["Target"]==4]=1

In [5]:
data_pos = data[data["Target"]==1].iloc[:int(20000)]
data_neg = data[data["Target"]==0].iloc[:int(20000)]

In [6]:
data = pd.concat([data_pos, data_neg])

In [7]:
data["Text"]=data["Text"].str.lower()

In [8]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chaud\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
stop = stopwords.words('english')

In [10]:
from nltk.corpus import stopwords

stop_words = set(stop)

def clean_words(text):
    return " ".join([word for word in str(text).split() if word not in stop_words])

data["Text"] = data["Text"].apply(lambda text: clean_words(text))
data["Text"].head()

800000                love @health4uandpets u guys r best!!
800001    im meeting one besties tonight! cant wait!! - ...
800002    @darealsunisakim thanks twitter add, sunisa! g...
800003    sick really cheap hurts much eat real food plu...
800004                      @lovesbrooklyn2 effect everyone
Name: Text, dtype: object

In [11]:
punctuations = string.punctuation

def clean_punc(text):
    return text.translate(str.maketrans('','', punctuations))

data["Text"] = data["Text"].apply(lambda text: clean_punc(text))
data["Text"].head()

800000                   love health4uandpets u guys r best
800001    im meeting one besties tonight cant wait  girl...
800002    darealsunisakim thanks twitter add sunisa got ...
800003    sick really cheap hurts much eat real food plu...
800004                       lovesbrooklyn2 effect everyone
Name: Text, dtype: object

In [12]:
def clean_repeats(text):
    return re.sub(r'(.)\1+', r'\1', text)

data["Text"] = data["Text"].apply(lambda text: clean_repeats(text))
data["Text"].head()

800000                   love health4uandpets u guys r best
800001    im meting one besties tonight cant wait girl talk
800002    darealsunisakim thanks twiter ad sunisa got me...
800003    sick realy cheap hurts much eat real fod plus ...
800004                         lovesbroklyn2 efect everyone
Name: Text, dtype: object

In [13]:
def clean_email(text):
    return re.sub('@[^\s]+',' ', text)

data["Text"] = data["Text"].apply(lambda text: clean_email(text))
data["Text"].head()

800000                   love health4uandpets u guys r best
800001    im meting one besties tonight cant wait girl talk
800002    darealsunisakim thanks twiter ad sunisa got me...
800003    sick realy cheap hurts much eat real fod plus ...
800004                         lovesbroklyn2 efect everyone
Name: Text, dtype: object

In [14]:
def clean_URLs(text):
    return re.sub('((www\.[^\s]+) | (https?://[^\s]+))','',text)

data["Text"] = data["Text"].apply(lambda text: clean_URLs(text))
data["Text"].head()

800000                   love health4uandpets u guys r best
800001    im meting one besties tonight cant wait girl talk
800002    darealsunisakim thanks twiter ad sunisa got me...
800003    sick realy cheap hurts much eat real fod plus ...
800004                         lovesbroklyn2 efect everyone
Name: Text, dtype: object

In [15]:
def clean_nums(text):
    return re.sub('[0-9]+','',text)

data["Text"] = data["Text"].apply(lambda text: clean_nums(text))
data["Text"].head()

800000                    love healthuandpets u guys r best
800001    im meting one besties tonight cant wait girl talk
800002    darealsunisakim thanks twiter ad sunisa got me...
800003    sick realy cheap hurts much eat real fod plus ...
800004                          lovesbroklyn efect everyone
Name: Text, dtype: object

In [16]:
token = RegexpTokenizer(r'\w+')
data["Text"] = data["Text"].apply(token.tokenize)
data["Text"].head()

800000             [love, healthuandpets, u, guys, r, best]
800001    [im, meting, one, besties, tonight, cant, wait...
800002    [darealsunisakim, thanks, twiter, ad, sunisa, ...
800003    [sick, realy, cheap, hurts, much, eat, real, f...
800004                      [lovesbroklyn, efect, everyone]
Name: Text, dtype: object

In [17]:
st = nltk.PorterStemmer()

def stemming(text):
    return [st.stem(word) for word in text]

data["Text"] = data["Text"].apply(lambda text: stemming(text))
data["Text"].head()

800000               [love, healthuandpet, u, guy, r, best]
800001    [im, mete, one, besti, tonight, cant, wait, gi...
800002    [darealsunisakim, thank, twiter, ad, sunisa, g...
800003    [sick, reali, cheap, hurt, much, eat, real, fo...
800004                       [lovesbroklyn, efect, everyon]
Name: Text, dtype: object

In [18]:
import nltk
nltk.download('wordnet')

lm = nltk.WordNetLemmatizer()

def lemmatizing(text):
    return [lm.lemmatize(word) for word in text]

data["Text"] = data["Text"].apply(lambda text: lemmatizing(text))
data["Text"].head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\chaud\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


800000               [love, healthuandpet, u, guy, r, best]
800001    [im, mete, one, besti, tonight, cant, wait, gi...
800002    [darealsunisakim, thank, twiter, ad, sunisa, g...
800003    [sick, reali, cheap, hurt, much, eat, real, fo...
800004                       [lovesbroklyn, efect, everyon]
Name: Text, dtype: object

In [19]:
X = data["Text"]
y = data["Target"]

In [20]:
max_len = 500
tok = Tokenizer(num_words=2000)
tok.fit_on_texts(X)
sequences = tok.texts_to_sequences(X)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

In [21]:
X_train, X_test, Y_train, Y_test = train_test_split(sequences_matrix, y, test_size=0.20, random_state=2)

In [22]:
def tensorflow_based_model(): #Defined tensorflow_based_model function for training tenforflow based model
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(2000,50,input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer) 
    model = Model(inputs=inputs,outputs=layer)
    return model

In [23]:
model = tensorflow_based_model() # here we are calling the function of created model




In [24]:
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

In [25]:
history=model.fit(X_train,Y_train,batch_size=80,epochs=6, validation_split=0.1)

Epoch 1/6


Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [26]:
accr1 = model.evaluate(X_test,Y_test)



In [27]:
print(accr1[1])

0.7448750138282776
