In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import axis
import regex as re
import contractions
import num2words

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB


import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package wordnet to /Users/grandhi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/grandhi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/grandhi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/grandhi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/grandhi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load data
data = pd.read_csv(
    "./data/nlp-getting-started/train.csv",)

# Encoding target variable
#data["target"] = np.where(data["target"] == "spam", 1, 0)

df = data.copy()


In [3]:
#filling null values with some number

df['keyword'].fillna('', inplace=True)
df['location'].fillna('',inplace=True)

# dropping the duplicates
df.drop_duplicates(subset ='text',keep=False, inplace=True)


# converting to strings

df['keyword'] = df['keyword'].astype('object')
df['location'] = df['location'].astype('object')
df['text'] = df['text'].astype('object')

# adding the columns keyword and text
df['text_final'] = df['keyword'] + ' ' + df['text']


In [4]:
def remove_url(text):
    url = re.compile(r'(https?:\/\/)(\s)*(www\.)?(\s)*((\w|\s)+\.)*([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)*')
    return url.sub(r'', text)

df['text_final'] = df['text_final'].apply(lambda x: remove_url(x))


#Contractions
df['text_final_1'] = df['text_final'].apply(lambda x: [contractions.fix(word) for word in x.split(' ')])

#joining back the list of items into one string
df['text_final_1'] = [' '.join(map(str, l)) for l in df['text_final_1']]

# Noise Cleaning - spacing, special characters, lowercasing 

df['text_final_1'] = df['text_final_1'].str.lower()
df['text_final_1'] = df['text_final_1'].apply(lambda x: re.sub(r'[^\w\d\s\']+', '', x))
df['text_final_2'] = df['text_final_1'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))

#nltk tokenization

df['text_final_1'] = df['text_final_1'].apply(word_tokenize)
df['text_final_2'] = df['text_final_2'].apply(word_tokenize)

# remove stop words

stop_words = set(stopwords.words('english'))

df['text_final_1'] = df['text_final_1'].apply(lambda x: [word for word in x if word not in stop_words])
df['text_final_1'] = [' '.join(map(str, l)) for l in df['text_final_1']]

df['text_final_2'] = df['text_final_2'].apply(lambda x: [word for word in x if word not in stop_words])
df['text_final_2'] = [' '.join(map(str, l)) for l in df['text_final_2']]


# lemmatization

lemma = nltk.WordNetLemmatizer()

df['text_final_2'] = df['text_final_2'].apply(lambda x: [lemma.lemmatize(word) for word in x ])
df['text_final_2'] = [''.join(map(str, l)) for l in df['text_final_2']]




In [10]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
from transformers import pipeline

#https://huggingface.co/cardiffnlp/twitter-roberta-base-emotion?text=I+like+you.+I+love+you

In [25]:
task='emotion'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL, from_tf = True)

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.


In [27]:
tweets = list(df['text'])

classifier = pipeline('sentiment-analysis', tokenizer=tokenizer, model=model)


In [None]:
output = classifier(tweets)

output

In [42]:
output

[{'label': 'sadness', 'score': 0.6136348247528076},
 {'label': 'sadness', 'score': 0.5521221160888672},
 {'label': 'joy', 'score': 0.5969904661178589},
 {'label': 'sadness', 'score': 0.6366748809814453},
 {'label': 'sadness', 'score': 0.6902940273284912},
 {'label': 'sadness', 'score': 0.7561469674110413},
 {'label': 'sadness', 'score': 0.7713411450386047},
 {'label': 'optimism', 'score': 0.387606143951416},
 {'label': 'sadness', 'score': 0.5119816064834595},
 {'label': 'sadness', 'score': 0.8248550295829773},
 {'label': 'sadness', 'score': 0.7488895058631897},
 {'label': 'joy', 'score': 0.5495973825454712},
 {'label': 'sadness', 'score': 0.8562643527984619},
 {'label': 'sadness', 'score': 0.7035607099533081},
 {'label': 'sadness', 'score': 0.6558110117912292},
 {'label': 'sadness', 'score': 0.5572502017021179},
 {'label': 'optimism', 'score': 0.8916151523590088},
 {'label': 'optimism', 'score': 0.941937267780304},
 {'label': 'optimism', 'score': 0.5904171466827393},
 {'label': 'optimi

In [34]:
classifier('i like u')

[{'label': 'optimism', 'score': 0.7951059341430664}]

In [47]:
new_list = pd.DataFrame(output)
new_list.label.value_counts()

joy         2626
sadness     2505
optimism    1792
anger        511
Name: label, dtype: int64

In [None]:
tweets_pro = list(df['text_final_2'])

output_new = classifier(tweets_pro)

new_list_pro = pd.DataFrame(output_new)
new_list_pro

In [48]:
new_list_pro.label.value_counts()

sadness     3398
joy         2603
optimism    1291
anger        142
Name: label, dtype: int64

In [38]:
classifier1 = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base")
classifier1("I love this!")

#https://huggingface.co/j-hartmann/emotion-english-distilroberta-base?text=its+an+annoying+day

[{'label': 'joy', 'score': 0.9771686792373657}]

In [39]:
output1 = classifier1(tweets)

output1

[{'label': 'fear', 'score': 0.9719893336296082},
 {'label': 'neutral', 'score': 0.35063210129737854},
 {'label': 'neutral', 'score': 0.6025216579437256},
 {'label': 'fear', 'score': 0.9800271391868591},
 {'label': 'fear', 'score': 0.96886146068573},
 {'label': 'fear', 'score': 0.8475469946861267},
 {'label': 'fear', 'score': 0.9152116179466248},
 {'label': 'fear', 'score': 0.8990243077278137},
 {'label': 'fear', 'score': 0.5699889659881592},
 {'label': 'fear', 'score': 0.974253237247467},
 {'label': 'sadness', 'score': 0.9121819138526917},
 {'label': 'fear', 'score': 0.8117707371711731},
 {'label': 'fear', 'score': 0.889666736125946},
 {'label': 'fear', 'score': 0.8315337300300598},
 {'label': 'fear', 'score': 0.9804948568344116},
 {'label': 'surprise', 'score': 0.6559241414070129},
 {'label': 'joy', 'score': 0.9817855954170227},
 {'label': 'joy', 'score': 0.9687780141830444},
 {'label': 'surprise', 'score': 0.9201586842536926},
 {'label': 'surprise', 'score': 0.6738577485084534},
 {'l

In [46]:
df1 = pd.DataFrame(output1)
df1.label.value_counts()

fear        2398
neutral     1396
sadness     1176
anger       1024
surprise     772
joy          548
disgust      120
Name: label, dtype: int64

In [41]:
output2 = classifier1(tweets_pro)
output2

[{'label': 'joy', 'score': 0.7274469137191772},
 {'label': 'neutral', 'score': 0.34410372376441956},
 {'label': 'fear', 'score': 0.7801875472068787},
 {'label': 'fear', 'score': 0.36957961320877075},
 {'label': 'anger', 'score': 0.7951268553733826},
 {'label': 'sadness', 'score': 0.48581069707870483},
 {'label': 'sadness', 'score': 0.5292112231254578},
 {'label': 'fear', 'score': 0.6212357878684998},
 {'label': 'fear', 'score': 0.9545875191688538},
 {'label': 'fear', 'score': 0.979130208492279},
 {'label': 'sadness', 'score': 0.8599289059638977},
 {'label': 'sadness', 'score': 0.5431112051010132},
 {'label': 'sadness', 'score': 0.7133536338806152},
 {'label': 'neutral', 'score': 0.37676534056663513},
 {'label': 'anger', 'score': 0.5269209146499634},
 {'label': 'neutral', 'score': 0.8009167313575745},
 {'label': 'joy', 'score': 0.5709410309791565},
 {'label': 'joy', 'score': 0.7741117477416992},
 {'label': 'neutral', 'score': 0.5626007914543152},
 {'label': 'surprise', 'score': 0.499575

In [45]:
df2 = pd.DataFrame(output2)
df2.label.value_counts()

neutral     1687
fear        1638
sadness     1632
anger       1366
joy          514
surprise     503
disgust       94
Name: label, dtype: int64