In [10]:
import numpy as np
import pandas as pd
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import tensorflow as tf
import torch 
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torch.utils.data.sampler import SequentialSampler
import transformers as ppb 
from transformers import BertTokenizer

import warnings
warnings.filterwarnings('ignore')


# Load the complete dataset into a pandas DataFrame.
print("Loading ")
#df = pd.read_csv("/home/renato/Datasets/CrisisLexT6-v1.0/CrisisLexT6/crisisLexT6.csv", encoding='utf-8')
#df = pd.read_csv("/home/renato/Datasets/CrisisLexT6-v1.0/CrisisLexT6/2012_Sandy_Hurricane/2012_Sandy_Hurricane-ontopic_offtopic.csv", encoding='utf-8')

df = pd.read_csv("/home/joao/crisisLexT6.csv", encoding='utf-8')

print()
print('Number of sentences in the original dataset: {:,}\n'.format(df.shape[0]))

# Rename the ' tweet' and ' label' columns (note the leading space in the
# source names) to whitespace-free identifiers.
df = df.rename(columns={' tweet': 'sentence', ' label': 'label'})

# Encode the string labels as integers: on-topic -> 1, off-topic -> 0.
# `replace` returns a new Series, so the result has to be assigned back.
df['label'] = df['label'].replace('on-topic', 1)
df['label'] = df['label'].replace('off-topic', 0)

labels = df['label'].values
sentences = df['sentence']

# Keep only the tweet text and its label. `.copy()` makes `df` an independent
# frame, so columns added to it later are not written to a slice of the
# original frame.
df = df[['sentence', 'label']].copy()
print(df.keys())
print(df['label'].value_counts())


def lexical_diversity(text):
    """Return the ratio of distinct whitespace-separated tokens to total tokens.

    Parameters
    ----------
    text : str
        Text to analyse; tokens are obtained with ``str.split()``.

    Returns
    -------
    float
        ``len(set(tokens)) / len(tokens)``, a value in (0, 1]. Returns ``0.0``
        when ``text`` has no tokens (empty or whitespace-only) or is not a
        string (e.g. a NaN cell), instead of raising.
    """
    if not isinstance(text, str):
        return 0.0
    tokens = text.split()
    # An empty token list would otherwise divide by zero.
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens)

# Hand-crafted surface features, computed on the tweet text before it is
# lower-cased.

# URL regex, written once as a raw string so that `\(` and `\)` reach the
# regex engine unchanged instead of being parsed as (invalid) string escapes.
URL_PATTERN = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

# Number of characters in the sentence.
df['nchars'] = df['sentence'].str.len()

# Number of whitespace-separated words in the sentence.
df['nwords'] = df['sentence'].str.split().str.len()

# Hashtags: presence flag (0/1) and number of '#' characters.
# Each flag is derived from its count so the text is scanned only once.
hash_counts = df["sentence"].str.count('#')
df['bhash'] = (hash_counts > 0).astype(int)
df['nhash'] = hash_counts

# URLs: presence flag (0/1) and number of matches.
link_counts = df["sentence"].str.count(URL_PATTERN, flags=re.IGNORECASE)
df['blink'] = (link_counts > 0).astype(int)
df['nlink'] = link_counts

# Mentions: presence flag (0/1) and number of '@' characters.
at_counts = df["sentence"].str.count('@')
df['bat'] = (at_counts > 0).astype(int)
df['nat'] = at_counts

# Retweet flag: 1 when the text contains '@rt' or 'rt@' (case-insensitive).
# NOTE(review): the usual retweet form "RT @user" has a space and is NOT
# matched by this pattern -- confirm whether that is intended.
df['rt'] = df["sentence"].str.contains(pat = '@rt|rt@',flags=re.IGNORECASE, regex = True).astype(int)

# Checks whether the sentence contains phone number
#df['phone'] = df["sentence"].str.contains(pat = '\(?([0-9]{3})\)?([ .-]?)([0-9]{3})\2([0-9]{4})',flags=re.IGNORECASE, regex = True).astype(int) 


# Lexical diversity (distinct tokens / total tokens) of each sentence.
df['dlex'] = df["sentence"].apply(lexical_diversity)

# Lower-case the text so the slang lookup below is case-insensitive.
df["sentence"] = df["sentence"].str.lower()

# List of US slang abbreviations (lower-cased to match the text).
slangList = ['ASAP','BBIAB','BBL','BBS','BF','BFF','BFFL','BRB','CYA','DS','FAQ','FB','FITBLR','FLBP','FML','FTFY','FTW','FYI','G2G','GF','GR8','GTFO','HBIC','HML','HRU','HTH','IDK','IGHT','IMO','IMHO','IMY','IRL','ISTG','JK','JMHO','KTHX','L8R','LMAO','LMFAO','LMK','LOL','MWF','NM','NOOB','NP','NSFW','OOAK','OFC','OMG','ORLY','OTOH','RN','ROFL','RUH','SFW','SOML','SOZ','STFU','TFTI','TIL','TMI','TTFN','TTYL','TWSS','U','W/','WB','W/O','WYD','WTH','WTF','WYM','WYSIWYG','Y','YMMV','YW','YWA']
slangList = [x.lower() for x in slangList]

# Happy emoticons, as regex fragments (raw strings keep the backslashes intact).
happy_emojis = [r':\)', r';\)', r'\(:']

# Sad emoticons, as regex fragments.
sad_emojis = [r':\(', r';\(', r'\):']

punctuation = ['.',',','...','?','!',':',';']    
#','-','+','*','_','=','/','','%',' &','{','}','[',']','(',')','

# Slang flag: 1 when the sentence contains any slang term as a whole word.
# The terms are plain text, so they are escaped before being joined.
slang_pattern = r'\b(?:{})\b'.format('|'.join(re.escape(term) for term in slangList))
mask = df['sentence'].str.contains(slang_pattern, regex=True)
df['slang'] = mask.astype(int)

# Happy-emoticon flag. No `\b` anchors here: emoticons are made of non-word
# characters, so `\b:\)\b` would only match when the emoticon is glued between
# two word characters (e.g. "a:)b") and would miss "nice :)".
happy_pattern = r'(?:{})'.format('|'.join(happy_emojis))
mask = df['sentence'].str.contains(happy_pattern, regex=True)
df['hemojis'] = mask.astype(int)

# Sad-emoticon flag (same reasoning as above for omitting `\b`).
sad_pattern = r'(?:{})'.format('|'.join(sad_emojis))
mask = df['sentence'].str.contains(sad_pattern, regex=True)
df['semojis'] = mask.astype(int)

# Hand-crafted feature matrix.
# NOTE(review): 'hemojis' and 'semojis' are computed above but not selected
# here -- confirm whether they should be part of the feature set.
features =  df[['nchars', 'nwords','bhash','nhash','blink','nlink','bat','nat','rt','slang','dlex']]



Loading 
Index(['tweet id', ' tweet', ' label'], dtype='object')

Number of sentences in the original dataset: 10,008

Index(['sentence', 'label'], dtype='object')
1    6138
0    3870
Name: label, dtype: int64


KeyError: "None of [Index(['nchars', 'nwords', 'bhash', 'nhash', 'blink', 'nlink', 'bat', 'nat',\n       'rt', 'slang', 'dlex'],\n      dtype='object')] are in the [columns]"

In [145]:
                        
### BERT

                        #### Doing all the text pre processing
        


# Report whether TensorFlow can see a GPU. This is informational only: the
# model below runs on PyTorch, and the PyTorch check that follows already
# falls back to the CPU, so a missing TensorFlow GPU must not abort the cell.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    print('GPU device not found')


# Pick the PyTorch device: CUDA when available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

#labels = df['label']
sentences = df['sentence']

# Regex clean-up steps, applied in order as (pattern, replacement).
CLEANUP_STEPS = [
    # Remove URLs, retweet markers and @mentions.
    (r'http(\S)+', r''),
    # NOTE(review): as a regex the dots match any character, so this removes
    # "http " plus the next three characters -- confirm whether a literal
    # "http ..." (truncated link) was meant.
    (r'http ...', r''),
    (r'(RT|rt)[ ]*@[ ]*[\S]+', r''),
    (r'@[\S]+', r''),
    # Removes an underscore together with the character that follows it.
    (r'_[\S]?', r''),
    # Collapse runs of two or more spaces. (`{2, }` with a space inside the
    # braces is not a quantifier in Python's `re`, hence `{2,}`.)
    (r'[ ]{2,}', r' '),
    # Decode the HTML entities for &, < and >.
    (r'&amp;?', r'and'),
    (r'&lt;', r'<'),
    (r'&gt;', r'>'),
    # Insert a space between words and adjacent punctuation marks.
    (r'([\w\d]+)([^\w\d ]+)', r'\1 \2'),
    (r'([^\w\d ]+)([\w\d]+)', r'\1 \2'),
]

processed = df['sentence']
for pattern, replacement in CLEANUP_STEPS:
    # regex=True is required: since pandas 2.0 `str.replace` treats the
    # pattern as a literal string by default.
    processed = processed.str.replace(pattern, replacement, regex=True)

# Lower-case and strip, then store as a real column (attribute assignment such
# as `df.ProcessedText = ...` does not create a DataFrame column).
df['ProcessedText'] = processed.str.lower().str.strip()

sentences = df['ProcessedText']

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize every sentence once, adding the `[CLS]` and `[SEP]` special tokens.
# The same token lists are used for both the length statistic and the padding.
tokenized = sentences.apply((lambda x: tokenizer.encode(x,add_special_tokens=True)))

# Longest tokenized sentence; every sequence is padded to this length.
max_len = max((len(ids) for ids in tokenized.values), default=0)
print('Max sentence length: ', max_len)

# Load pretrained model/tokenizer
model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')
model = model_class.from_pretrained(pretrained_weights)

# Move the model to the selected device. `model.to(device)` covers both the GPU
# and CPU cases; an unconditional `model.cuda()` would fail without CUDA.
model.to(device)
# Evaluation mode: the model is only used for feature extraction here.
model.eval()

# Padding: right-pad each token list with 0 up to max_len.
padded = np.array([ids + [0]*(max_len-len(ids)) for ids in tokenized.values])
# Masking: 1 for real tokens, 0 for padding positions.
attention_mask = np.where(padded != 0, 1, 0)

## Now deep learning !
####  DEEP LEARNING
# Build the tensors on the CPU; each batch is moved to `device` inside the
# loop, so the whole dataset never has to fit in GPU memory.
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)
#labels = torch.tensor(labels).to(device)
labels = torch.tensor(df["label"].values)

dataset = TensorDataset(input_ids, attention_mask, labels)

batch_size = 32

# Iterate sequentially so that the rows of the extracted features stay in the
# same order as the rows of `df`.
dataloader = DataLoader(dataset, sampler = SequentialSampler(dataset),batch_size = batch_size)

# Per-batch results are collected in lists and concatenated once at the end
# (DataFrame.append was removed in pandas 2.0 and grows quadratically).
feature_chunks = []
label_chunks = []

# Inference only: no gradients are needed.
with torch.no_grad():
    for batch in dataloader:
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2]
        last_hidden_states = model(b_input_ids,attention_mask = b_input_mask)
        # Keep only the output at position 0 of each sequence, i.e. the
        # `[CLS]` token added by the tokenizer, as the sentence embedding.
        cls_embeddings = last_hidden_states[0][:,0,:]
        feature_chunks.append(cls_embeddings.cpu().numpy())
        label_chunks.append(b_labels.cpu().numpy())

dfFeatures = pd.DataFrame(np.concatenate(feature_chunks))
dfLabels = pd.DataFrame(np.concatenate(label_chunks))

# Expose the full arrays under the names `features` / `labels` (previously
# these held only the last batch after the loop finished).
features = dfFeatures.to_numpy()
labels = dfLabels[0].to_numpy()
        
# Model 2. Train and Test Split
# The output from BERT is going to be input to SKLEARN

# Columns of `df` holding the hand-crafted surface features.
HANDCRAFTED_COLUMNS = ['nchars', 'nwords','bhash','nhash','blink','nlink','bat','nat','rt','slang','dlex']

# Candidate feature matrices. Column names are converted to strings because
# scikit-learn rejects frames whose column names mix integers and strings.
bert_df = dfFeatures.copy()
bert_df.columns = bert_df.columns.astype(str)
# reset_index so rows align positionally with the BERT features in the concat.
handcrafted_df = df[HANDCRAFTED_COLUMNS].reset_index(drop=True)
combine_df = pd.concat([bert_df, handcrafted_df],axis=1)

feature_sets = {
    'bert': bert_df,
    'handcrafted': handcrafted_df,
    'combined': combine_df,
}
# Which feature matrix to train on; change this instead of re-running
# alternative split statements.
FEATURE_SET = 'bert'

# 1-D label vector (a single-column DataFrame triggers shape warnings in fit).
target = dfLabels.values.ravel()

train_features, test_features, train_labels, test_labels = train_test_split(feature_sets[FEATURE_SET], target, test_size=0.33, random_state=42)

# Classifiers to compare.
# If you've chosen to do the gridsearch, you can plug the value of C into the model declaration (e.g. LogisticRegression(C=5.2)).
lr_clf = LogisticRegression()
dt_clf = DecisionTreeClassifier()
rf_clf = RandomForestClassifier()
ab_clf = AdaBoostClassifier(n_estimators=100, random_state=0)
nb_clf = GaussianNB()
nn_clf = MLPClassifier(random_state=1, max_iter=300)
svm_clf = svm.SVC(gamma=0.001, C=100.)

classifiers = {
    'Logistic Regression': lr_clf,
    'Decision Tree': dt_clf,
    'Random Forest': rf_clf,
    'AdaBoost': ab_clf,
    'Gaussian Naive Bayes': nb_clf,
    'MLP': nn_clf,
    'SVM': svm_clf,
}

# Evaluating Model #2
# Fit each classifier and report its metrics on the held-out test set, so
# every model is evaluated rather than only the last one predicted.
for clf_name, clf in classifiers.items():
    clf.fit(train_features, train_labels)
    y_pred = clf.predict(test_features)

    print('\n=== {} ==='.format(clf_name))
    print('Accuracy:', accuracy_score(test_labels, y_pred))
    print('F1 score:', f1_score(test_labels, y_pred, average='macro'))
    print('Recall:', recall_score(test_labels, y_pred, average='macro'))
    print('Precision:', precision_score(test_labels, y_pred, average='macro'))
    print('\n clasification report:\n', classification_report(test_labels,y_pred))
    print('\n confussion matrix:\n',confusion_matrix(test_labels, y_pred))

In [None]:
from sklearn.inspection import permutation_importance

def print_importances(importance):
    """Print one ``Feature: <index>, Score: <value>`` line per entry of ``importance``."""
    for i,v in enumerate(importance):
        print('Feature: %0d, Score: %.5f' % (i,v))


# Logistic regression: coefficients of the first (only, for binary) row.
print_importances(lr_clf.coef_[0])

# Decision tree: impurity-based importances.
print_importances(dt_clf.feature_importances_)

# Random forest: impurity-based importances.
print_importances(rf_clf.feature_importances_)

# Random forest: permutation importance on the test set, computed once and
# reused for the box plot below.
results = permutation_importance(rf_clf, test_features, test_labels, n_repeats=10,random_state=42, n_jobs=2,scoring='accuracy')
importance = results.importances_mean
print_importances(importance)


### RF feature importance###
importances = rf_clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf_clf.estimators_],axis=0)
indices = np.argsort(importances)[::-1]
n_features = train_features.shape[1]
for f in range(n_features):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# NOTE(review): `plt` is not imported in any visible cell; the imports cell
# needs `import matplotlib.pyplot as plt` for the plots below to run.
# Plot the impurity-based feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(n_features), importances[indices],color="r", yerr=std[indices], align="center")
plt.xticks(range(n_features), indices)
plt.xlim([-1, n_features])
plt.show()


# Box plot of the permutation importances, sorted by mean importance.
result = results
sorted_idx = result.importances_mean.argsort()
# Use the column names when the test features are a DataFrame, otherwise the
# positional feature indices.
feature_names = np.asarray(getattr(test_features, 'columns', np.arange(n_features)))
fig, ax = plt.subplots()
ax.boxplot(result.importances[sorted_idx].T,vert=False, labels=feature_names[sorted_idx])
ax.set_title("Permutation Importances (test set)")
fig.tight_layout()
plt.show()

NameError: name 'dfFeatures' is not defined

In [None]:


## We try to combine the features 
#import pandas as pd
#
#df1 = pd.DataFrame({'A': ['A0', 'A1'],
#                    'B': ['B0', 'B1'],
#                    'C': ['A4', 'A5'],
#                   })


#df2 = pd.DataFrame({'D': ['B4', 'B5']})
#result = pd.concat([df1, df2],axis=1)
#result
#df = pd.DataFrame(data=sentences, columns=["features"])


#metrics.accuracy_score(y_true, y_pred, \*[, …]) Accuracy classification score.
#metrics.auc(x, y) Compute Area Under the Curve (AUC) using the trapezoidal rule
#metrics.average_precision_score(y_true, …) Compute average precision (AP) from prediction scores
#metrics.balanced_accuracy_score(y_true, …) Compute the balanced accuracy
#metrics.brier_score_loss(y_true, y_prob, \*) Compute the Brier score.
#metrics.classification_report(y_true, y_pred, \*) Build a text report showing the main classification metrics.
#metrics.cohen_kappa_score(y1, y2, \*[, …]) Cohen’s kappa: a statistic that measures inter-annotator agreement.
#metrics.confusion_matrix(y_true, y_pred, \*) Compute confusion matrix to evaluate the accuracy of a classification.
#metrics.dcg_score(y_true, y_score, \*[, k, …])Compute Discounted Cumulative Gain.
#metrics.f1_score(y_true, y_pred, \*[, …]) Compute the F1 score, also known as balanced F-score or F-measure
#metrics.fbeta_score(y_true, y_pred, \*, beta) Compute the F-beta score
#metrics.hamming_loss(y_true, y_pred, \*[, …])Compute the average Hamming loss.
#metrics.hinge_loss(y_true, pred_decision, \*)Average hinge loss (non-regularized)
#metrics.jaccard_score(y_true, y_pred, \*[, …])Jaccard similarity coefficient score
#metrics.log_loss(y_true, y_pred, \*[, eps, …]) Log loss, aka logistic loss or cross-entropy loss.
#metrics.matthews_corrcoef(y_true, y_pred, \*)Compute the Matthews correlation coefficient (MCC)
#metrics.multilabel_confusion_matrix(y_true, …)Compute a confusion matrix for each class or sample
#metrics.ndcg_score(y_true, y_score, \*[, k, …])Compute Normalized Discounted Cumulative Gain.
#metrics.precision_recall_curve(y_true, …)Compute precision-recall pairs for different probability thresholds
#metrics.precision_recall_fscore_support(…)Compute precision, recall, F-measure and support for each class
#metrics.precision_score(y_true, y_pred, \*)Compute the precision
#metrics.recall_score(y_true, y_pred, \*[, …])Compute the recall
#metrics.roc_auc_score(y_true, y_score, \*[, …])Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores.
#metrics.roc_curve(y_true, y_score, \*[, …])Compute Receiver operating characteristic (ROC)
#metrics.zero_one_loss(y_true, y_pred, \*[, …])Zero-one classification loss.


Unnamed: 0,A,B,C,D
0,A0,B0,A4,B4
1,A1,B1,A5,B5


In [None]:
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler


# NOTE(review): `df` is reloaded from a different CSV here, but `dataset`
# below is not rebuilt from it in this cell -- confirm which data the
# fine-tuning is meant to use.
df = pd.read_csv("/home/joao/2012_Sandy_Hurricane-ontopic_offtopic.csv", encoding='utf-8')

# Seed PyTorch's global RNG so the random split and the shuffled training
# order are reproducible across runs.
torch.manual_seed(42)

# Create a 90-10 train-validation split. Calculate the number of samples to include in each set.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))


# The DataLoader needs to know our batch size for training, so we specify it here. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32.
batch_size = 32

# Training samples are drawn in random order.
train_dataloader = DataLoader(train_dataset,sampler = RandomSampler(train_dataset), batch_size = batch_size )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(val_dataset,sampler = SequentialSampler(val_dataset),batch_size = batch_size)



# Materialise the model's (name, tensor) parameter pairs as a list.
params = list(model.named_parameters())


print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# (heading, parameter slice) for each section, printed in this order.
param_sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)

for heading, section in param_sections:
    print(heading)
    # One line per parameter: its name and its shape as a tuple.
    for p in section:
        param_name, param_shape = p[0], tuple(p[1].size())
        print("{:<55} {:>12}".format(param_name, str(param_shape)))

  


# Build the sequence-classification model once, before training (it was
# previously re-created inside a batch loop), and move it to the device.
# `ppb` is the `transformers` module, so no extra import is needed.
model = ppb.BertForSequenceClassification.from_pretrained("bert-base-uncased",num_labels = 2,output_attentions = False,output_hidden_states = True,)
model.to(device)

# Optimizer for the fine-tuning updates.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# For each epoch...
for epoch_i in range(0, 2):
    model.train()
    total_loss = 0.0
    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device,dtype=torch.int64)
        b_input_mask = batch[1].to(device,dtype=torch.int64)
        b_labels = batch[2].to(device,dtype=torch.int64)
        # Clear gradients left over from the previous step.
        model.zero_grad()
        OUTPUT = model(b_input_ids,token_type_ids=None, attention_mask=b_input_mask,labels=b_labels)
        # With labels given and output_hidden_states=True the output has three
        # entries (loss, logits, hidden states); index it rather than
        # unpacking into two names.
        loss, logits = OUTPUT[0], OUTPUT[1]
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # max(..., 1) avoids a division by zero when the loader is empty.
    print('Epoch {:}: average training loss {:.4f}'.format(epoch_i + 1, total_loss / max(len(train_dataloader), 1)))
      