# Importing all necessary libraries and our data

In [1]:
# Colab-only: mount Google Drive so files under /content/gdrive become readable.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import pandas as pd
import re
import string
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

In [3]:
# Load the dataset CSV from the mounted Drive folder into a DataFrame.
df = pd.read_csv('/content/gdrive/MyDrive/TheSocialDilemma/TheSocialDilemma.csv')

In [4]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet,Sentiment
0,Mari Smith,"San Diego, California",Premier Facebook Marketing Expert | Social Med...,2007-09-11 22:22:51,579942,288625,11610,False,2020-09-16 20:55:33,@musicmadmarc @SocialDilemma_ @netflix @Facebo...,,Twitter Web App,False,Neutral
1,Mari Smith,"San Diego, California",Premier Facebook Marketing Expert | Social Med...,2007-09-11 22:22:51,579942,288625,11610,False,2020-09-16 20:53:17,@musicmadmarc @SocialDilemma_ @netflix @Facebo...,,Twitter Web App,False,Neutral
2,Varun Tyagi,"Goa, India",Indian | Tech Solution Artist & Hospitality Ex...,2009-09-06 10:36:01,257,204,475,False,2020-09-16 20:51:57,Go watch “The Social Dilemma” on Netflix!\n\nI...,,Twitter for iPhone,False,Positive
3,Casey Conway,"Sydney, New South Wales",Head of Diversity & Inclusion @RugbyAU | It's ...,2012-12-28 21:45:06,11782,1033,12219,True,2020-09-16 20:51:46,I watched #TheSocialDilemma last night. I’m sc...,['TheSocialDilemma'],Twitter for iPhone,False,Negative
4,Charlotte Paul,Darlington,Instagram Charlottejyates,2012-05-28 20:43:08,278,387,5850,False,2020-09-16 20:51:11,The problem of me being on my phone most the t...,['TheSocialDilemma'],Twitter for iPhone,False,Positive


## Splitting our data into training (80%) and test (20%) sets

In [5]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and every row count derived from it further down, reproducible across runs.
# (train_test_split is already imported in the imports cell at the top.)
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['Sentiment'], test_size=0.20, random_state=42
)

In [6]:
# Show the size of each split: train texts, train labels, test texts, test labels.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

(16054,) (16054,) (4014,) (4014,)


### Removing neutral comments

In [7]:
# Keep only rows labelled Positive or Negative. Each mask is built once from
# the split's own labels, so it shares the split's index exactly and does not
# depend on pandas re-aligning a mask computed on the full `df`.
train_keep = (y_train == 'Positive') | (y_train == 'Negative')
test_keep = (y_test == 'Positive') | (y_test == 'Negative')

X_train, y_train = X_train[train_keep], y_train[train_keep]
X_test, y_test = X_test[test_keep], y_test[test_keep]


### Cleaning text

In [8]:
import nltk

# Fetch the NLTK resources used below: the stop-word list, the punkt
# tokenizer models and the WordNet corpus.
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
pip install transformers

In [10]:
# English stop words from NLTK; clean_text drops any token found in this list.
stop_words = stopwords.words("english")

In [11]:
def clean_text(text):
    """Normalise one raw tweet into a space-separated string of content words.

    Steps, in order: lowercase the text; blank out @mentions, http/https
    links, #hashtags, digits and ASCII punctuation; split on whitespace; drop
    every token found in the module-level ``stop_words``.

    Args:
        text: The raw tweet text (a ``str``).

    Returns:
        The remaining words joined by single spaces, with no leading or
        trailing whitespace. The empty string if nothing is left.
    """
    text = text.lower()
    # Raw strings keep "\S" and "\d" as regex escapes instead of (invalid)
    # Python string escapes.
    text = re.sub(r"@\S+", " ", text)
    text = re.sub(r"https*\S+", " ", text)
    text = re.sub(r"#\S+", " ", text)
    text = re.sub(r"\d", " ", text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    # split() with no argument breaks on any run of whitespace (newlines
    # included) and never yields empty tokens, so the result carries no stray
    # blanks at either end.
    return ' '.join(word for word in text.split() if word not in stop_words)

In [12]:
# Run clean_text over every tweet of both splits.
X_train = X_train.apply(clean_text)
X_test = X_test.apply(clean_text)

## Converting "Positive" and "Negative" into 1.0 and 0.0 for further calculations

In [13]:
def MakingLabel(text):
    """Map a sentiment name to its numeric label.

    Returns 1.0 for "Positive", 0.0 for "Negative" and None for anything else.
    """
    for sentiment, label in (("Positive", 1.0), ("Negative", 0.0)):
        if text == sentiment:
            return label
    return None

In [14]:
# Turn the sentiment names of both splits into numeric labels.
y_train = y_train.apply(MakingLabel)
y_test = y_test.apply(MakingLabel)

In [15]:
# Convert the label columns to plain float32 NumPy arrays.
y_train = np.asarray(y_train, dtype="float32")
y_test = np.asarray(y_test, dtype="float32")

In [16]:
# Concatenate all cleaned tweets of each split into ONE long string, every
# tweet followed by a single space. NOTE: the result no longer records where
# one tweet ends and the next begins.
x_train = ''.join(tweet + ' ' for tweet in X_train)
x_test = ''.join(tweet + ' ' for tweet in X_test)

In [17]:
from transformers import AutoTokenizer
# Load the tokenizer that belongs to the 'bert-base-cased' checkpoint.
# NOTE(review): clean_text lowercases every tweet while this checkpoint is the
# cased variant - confirm that is intended. The tokenizer and the model must
# always be loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [18]:
def tokenize(sequence):
    """Encode one text with the module-level ``tokenizer``.

    The text is truncated or padded to exactly 512 tokens (special tokens
    included) and TensorFlow tensors are requested from the tokenizer.

    Args:
        sequence: The text to encode.

    Returns:
        A tuple ``(input_ids, attention_mask)`` taken from the encoder output.
    """
    encode_options = dict(
        max_length=512,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_token_type_ids=False,
        return_tensors='tf',
    )
    encoded = tokenizer.encode_plus(sequence, **encode_options)
    return encoded['input_ids'], encoded['attention_mask']

In [19]:
def tokenization(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize`` and return them."""
    return word_tokenize(text)
def lemmatizer(text):
    """Lemmatize every token in ``text``.

    Args:
        text: An iterable of token strings. A plain string would be iterated
            character by character, so pass the tokenized form.

    Returns:
        A list with the lemmatized form of each token, in the same order.
    """
    # Build the lemmatizer once per call rather than once per token.
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(word) for word in text]
# Lemmatize each cleaned tweet on its own and keep one string per tweet, so
# entry i of x_train / x_test still belongs to label i of y_train / y_test.
# Running this over a single concatenated string instead would produce a flat
# list of words with no relation to the label order.
x_train = [' '.join(lemmatizer(tokenization(tweet))) for tweet in X_train]
x_test = [' '.join(lemmatizer(tokenization(tweet))) for tweet in X_test]

In [37]:
# One row per labelled training example. Derived from the data rather than
# hard-coded, because the count depends on how the train/test split fell.
num_of_elements = len(y_train)

# Buffers for token ids and attention masks: one 512-wide row per example,
# filled in by the encoding loop below.
Xids = np.zeros((num_of_elements, 512))
Xmask = np.zeros((num_of_elements, 512))



In [38]:
# Draw num_of_elements random row positions (with replacement) from
# [0, num_of_elements); the bound follows the dataset size instead of
# repeating a literal.
idx = np.random.randint(0, num_of_elements, num_of_elements)

In [39]:
# Select texts and labels with the same sampled positions.
sample_positions = idx.astype(int)
small_dataset = np.array(x_train)[sample_positions]
small_dataset_labels = np.array(y_train)[sample_positions]

In [40]:
# Encode every sampled text and store its token ids and attention mask in the
# row with the same position.
for row, text in enumerate(small_dataset):
    ids, attention = tokenize(text)
    Xids[row, :] = ids
    Xmask[row, :] = attention

In [41]:
from transformers import TFAutoModel

In [28]:
# Load the pretrained 'bert-base-cased' weights as a TensorFlow model
# (the same checkpoint name the tokenizer was loaded from).
bert = TFAutoModel.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/502M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [42]:
# Print the layer / parameter summary of the loaded BERT model.
# NOTE(review): the saved output reports 0 trainable params although no cell
# above this one freezes the model - this looks like the result of
# out-of-order execution; confirm on a fresh Restart & Run All.
bert.summary()

Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      TFBaseModelOutputWithPoo  108310272 
                             lingAndCrossAttentions(l            
                             ast_hidden_state=(None,             
                             512, 768),                          
                              pooler_output=(None, 76            
                             8),                                 
                              past_key_values=None, h            
                             idden_states=None, atten            
                             tions=None, cross_attent            
                             ions=None)                          
                                                                 
Total params: 108,310,272
Trainable params: 0
Non-trainable params: 108,310,272
_______________________________________

In [43]:
# Two fixed-length integer inputs: the token ids and the matching attention mask.
input_ids = tf.keras.layers.Input(shape=(512,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(512,), name='attention_mask', dtype='int32')

# Freeze the pretrained encoder through a direct reference to the layer. A
# positional lookup such as model.layers[2] silently freezes the wrong layer
# as soon as the layer order changes.
bert.bert.trainable = False

# bert.bert is the transformer main layer inside the loaded model; element [1]
# of its output is the pooled (None, 768) vector, element [0] the per-token
# hidden states.
embeddings = bert.bert(input_ids, attention_mask=mask)[1]

# Classifier head
x = tf.keras.layers.Dense(1024, activation='relu')(embeddings)
y = tf.keras.layers.Dense(1, activation='sigmoid', name='outputs')(x)

model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

In [44]:
# Inspection only: display the structure returned by the BERT main layer
# (last_hidden_state and pooler_output). The result is not assigned.
bert.bert(input_ids, attention_mask=mask) # outputs of bert

TFBaseModelOutputWithPoolingAndCrossAttentions([('last_hidden_state',
                                                 <KerasTensor: shape=(None, 512, 768) dtype=float32 (created by layer 'bert')>),
                                                ('pooler_output',
                                                 <KerasTensor: shape=(None, 768) dtype=float32 (created by layer 'bert')>)])

In [45]:
# Print the layers, output shapes and parameter counts of the assembled classifier.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 512,                                         

In [46]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is
# reported alongside the loss.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["accuracy"])

In [48]:
# The rows of Xids / Xmask were encoded from `small_dataset`, which was drawn
# with the random positions in `idx`. The targets must therefore be the labels
# drawn with the same positions (`small_dataset_labels`); the unsampled
# y_train is in a different order.
history = model.fit(
    [Xids, Xmask], small_dataset_labels,
    # validation_split is the fraction HELD OUT for validation: keep 20% for
    # validation and train on the remaining 80%.
    validation_split=0.2,
    batch_size=16,
    verbose=1,
    epochs=1)

