In [1]:
# Import relevant libraries
import pandas as pd
import numpy as np
import nltk
import string
import re
# Download the NLTK resources used below (stop words, tokenizer models, WordNet)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
# Tensorflow (Model)
import tensorflow as tf

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Importing Dataset

**The dataset consists of two subsets:**
* Training Set
* Test or Validation Set

In [2]:
# Column labels applied on load (the file is read with header=None)
colnames = ['TweetId', 'Entity', 'Output', 'Tweet']

# Load the CSV, labelling its columns with `colnames`
data = pd.read_csv(
    '/content/twitter_validation.csv',
    header=None,
    names=colnames,
)

In [3]:
# Shared preprocessing resources, built once and reused for every tweet.
# Matches any single punctuation character from string.punctuation (!;,"% etc.)
re_puncs = re.compile('[%s]' % re.escape(string.punctuation))
# English stop words (a, an, the, when, there, this, ...)
stop_word = set(stopwords.words('english'))
# WordNet lemmatizer
lem = WordNetLemmatizer()
# Porter stemmer
p_stem = PorterStemmer()


def preprocess_tweet(text):
    """Normalise one raw tweet into a space-separated string of stemmed tokens.

    Steps, in order: tokenize, lower-case, strip punctuation, keep purely
    alphabetic tokens, drop English stop words, lemmatize, then stem.

    Parameters
    ----------
    text : object
        Raw tweet. It is passed through ``str()`` first, so non-string values
        are processed as their string form.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' if none survive).
    """
    cleaned = []
    for token in word_tokenize(str(text)):
        token = re_puncs.sub('', token.lower())
        # isalpha() is also False for '', so tokens that were pure punctuation
        # are discarded here together with anything containing digits.
        if not token.isalpha() or token in stop_word:
            continue
        cleaned.append(p_stem.stem(lem.lemmatize(token)))
    return ' '.join(cleaned)


# Main list consists of cleaned data: one string per row of data['Tweet'].
# Iterate over the column's values directly so the result does not depend on
# the frame having a default 0..n-1 index.
main = [preprocess_tweet(tweet) for tweet in tqdm(data['Tweet'])]

100%|██████████| 1000/1000 [00:02<00:00, 351.86it/s]


In [4]:
# Preview the first five rows of the dataframe (still holding the raw 'Tweet' column)
data.head()

Unnamed: 0,TweetId,Entity,Output,Tweet
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [5]:
# Attach the cleaned text as a new column and drop the raw, unstructured 'Tweet' column.
# assign() returns a new frame instead of mutating in place, and errors='ignore'
# lets this cell be re-run safely once 'Tweet' has already been removed.
data = data.assign(Preprocess_Tweet=main).drop(columns='Tweet', errors='ignore')

In [6]:
# Preview the first five rows again, now with the Preprocess_Tweet column in place of Tweet
data.head()

Unnamed: 0,TweetId,Entity,Output,Preprocess_Tweet
0,3364,Facebook,Irrelevant,mention facebook struggl motiv go run day tran...
1,352,Amazon,Neutral,bbc news amazon bo jeff bezo reject claim comp...
2,8312,Microsoft,Negative,microsoft pay word function poorli samsungu ch...
3,4371,CS-GO,Negative,csgo matchmak full closet hack truli aw game
4,4433,Google,Neutral,presid slap american face realli commit unlaw ...


In [7]:
# Bag-of-words features: learn the vocabulary of the cleaned tweets and count
# word occurrences per tweet, then expand the sparse matrix into a dense array
cnt = CountVectorizer(analyzer="word")
word_counts = cnt.fit_transform(main)
X = word_counts.toarray()

In [8]:
# List the distinct labels present in the Output column
data['Output'].unique()

array(['Irrelevant', 'Neutral', 'Negative', 'Positive'], dtype=object)

In [9]:
# Count missing (null) values in the Output column
data['Output'].isnull().sum()

0

In [10]:
# The Output column holds four distinct labels; one-hot encode them so that
# each row of y is a 0/1 indicator vector with one position per label
enc = OneHotEncoder(handle_unknown='ignore')
# fit_transform is given a 2-D single-column array, hence the reshape
ip = np.array(data['Output']).reshape(-1, 1)
y = enc.fit_transform(ip).toarray()

In [11]:
# Show X, the word-count feature matrix (independent variables)
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [12]:
# Show y, the one-hot encoded labels (dependent variable)
y

array([[1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       ...,
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.]])

In [13]:
# Split into training (80%) and test (20%) sets.
# A fixed random_state makes the shuffle, and therefore every downstream
# result, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [14]:
# Show the training features (independent variables)
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [15]:
# Show the training labels (one-hot encoded dependent variable)
y_train

array([[0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       ...,
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.]])

In [16]:
# (n_training_samples, vocabulary_size) of the count features
X_train.shape

(800, 3786)

In [17]:
from sklearn.feature_extraction.text import TfidfTransformer

In [18]:
# TF-IDF transformer with default settings; applied to the raw count matrix below
tf_insta = TfidfTransformer()

In [19]:
# Fit the TF-IDF weights on the training counts only, re-weight them,
# and convert the result to a dense array
X_train_filter = tf_insta.fit_transform(X_train).toarray()

In [20]:
# TF-IDF re-weighting keeps the shape of the count matrix
X_train_filter.shape

(800, 3786)

In [21]:
# NOTE(review): same shape check as the previous cell — likely redundant
X_train_filter.shape

(800, 3786)

In [22]:
# (n_training_samples, n_classes) of the one-hot labels
y_train.shape

(800, 4)

In [23]:
# Feed-forward classifier over the TF-IDF features.
# Input and output widths are read from the training arrays instead of being
# hard-coded, so the model still builds when the vocabulary size or the number
# of classes changes.
n_features = X_train_filter.shape[1]
n_classes = y_train.shape[1]

model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(n_features,)),
    tf.keras.layers.Dense(1024, activation='relu'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    # One softmax unit per class
    tf.keras.layers.Dense(n_classes, activation='softmax')
])
model.compile(
    # Labels are one-hot vectors, hence categorical (not sparse) cross-entropy
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [24]:
# Train for 10 epochs on the TF-IDF training features; h holds the value returned by fit()
h = model.fit(x=X_train_filter, y=y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
