In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import axis
import regex as re
import contractions
import num2words

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB


import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package wordnet to /Users/grandhi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/grandhi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/grandhi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/grandhi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/grandhi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [1]:
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [2]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Load data
data = pd.read_csv(
    "./data/nlp-getting-started/train.csv",)

# Encoding target variable
#data["target"] = np.where(data["target"] == "spam", 1, 0)

df = data.copy()


In [7]:
#filling null values with some number

df['keyword'].fillna('', inplace=True)
df['location'].fillna('',inplace=True)

# dropping the duplicates
df.drop_duplicates(subset ='text',keep=False, inplace=True)


# converting to strings

df['keyword'] = df['keyword'].astype('object')
df['location'] = df['location'].astype('object')
df['text'] = df['text'].astype('str')

# adding the columns keyword and text
df['text_final'] = df['keyword'] + ' ' + df['text']


In [8]:
#Contractions
df['text_final_1'] = df['text_final'].apply(lambda x: [contractions.fix(word) for word in x.split(' ')])

#joining back the list of items into one string
df['text_final_1'] = [' '.join(map(str, l)) for l in df['text_final_1']]

# Noise Cleaning - spacing, special characters, lowercasing 

df['text_final_1'] = df['text_final_1'].str.lower()
df['text_final_1'] = df['text_final_1'].apply(lambda x: re.sub(r'[^\w\d\s\']+', '', x))
df['text_final_2'] = df['text_final_1'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))

#nltk tokenization

df['text_final_1'] = df['text_final_1'].apply(word_tokenize)
df['text_final_2'] = df['text_final_2'].apply(word_tokenize)

# remove stop words

stop_words = set(stopwords.words('english'))

df['text_final_1'] = df['text_final_1'].apply(lambda x: [word for word in x if word not in stop_words])
df['text_final_1'] = [' '.join(map(str, l)) for l in df['text_final_1']]

df['text_final_2'] = df['text_final_2'].apply(lambda x: [word for word in x if word not in stop_words])
df['text_final_2'] = [' '.join(map(str, l)) for l in df['text_final_2']]

def remove_url(text):
    url = re.compile(r'(https?:\/\/)(\s)*(www\.)?(\s)*((\w|\s)+\.)*([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)*')
    return url.sub(r'', text)

df['text_final_url'] = df['text_final_2'].apply(lambda x: remove_url(x))



In [21]:
df.sample(4)

Unnamed: 0,id,keyword,location,text,target,text_final,text_final_1,text_final_2,text_final_url
5267,7529,oil%20spill,,@TroySlaby22 slicker than an oil spill,0,oil%20spill @TroySlaby22 slicker than an oil s...,oil20spill troyslaby22 slicker oil spill,oil spill troyslaby slicker oil spill,oil spill troyslaby slicker oil spill
6775,9707,tornado,,Pretty teen Hayden Ryan poses and strips out o...,0,tornado Pretty teen Hayden Ryan poses and stri...,tornado pretty teen hayden ryan poses strips p...,tornado pretty teen hayden ryan poses strips p...,tornado pretty teen hayden ryan poses strips p...
6352,9082,structural%20failure,,Slums are a manifestation state failure to pro...,1,structural%20failure Slums are a manifestation...,structural20failure slums manifestation state ...,structural failure slums manifestation state f...,structural failure slums manifestation state f...
4487,6383,hostages,The Universe,Cont'd- #Sinjar: referring to a 40-pg document...,1,hostages Cont'd- #Sinjar: referring to a 40-pg...,hostages cont 'd sinjar referring 40pg documen...,hostages cont sinjar referring pg document gro...,hostages cont sinjar referring pg document gro...


In [None]:
df.sample(4)

In [9]:
tokenized = df['text_final_2'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [10]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [11]:
np.array(padded).shape

(7434, 51)

In [12]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(7434, 51)

In [13]:
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [14]:
features = last_hidden_states[0][:,0,:].numpy()

In [16]:
labels = df["target"]

In [17]:
# Split dataset between train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=0)

In [18]:
clf = LogisticRegression()
clf.fit(X_train,y_train)

LogisticRegression()

In [20]:

predictions = clf.predict(X_test)

print("AUC = {:.3f}".format(roc_auc_score(y_test, predictions)))
print("accuracy = {:.3f}".format(accuracy_score(y_test, predictions)))
print("precision = {:.3f}".format(precision_score(y_test, predictions)))
print("recall = {:.3f}".format(recall_score(y_test, predictions)))

AUC = 0.794
accuracy = 0.807
precision = 0.812
recall = 0.707


In [19]:
clf.score(X_test, y_test)

0.8074233458848843