In [1]:
#importing libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import TfidfVectorizer # for text vectorizing
from sklearn.model_selection import train_test_split # for splitting the data

# for trainig and saving th model
from sklearn.metrics import roc_auc_score 
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
import pickle

#  librariesfor cleaning the text
import neattext as nt
import neattext.functions as nfx
import re
import string

In [2]:
#importing the training data
traindf = pd.read_csv("training.csv")

In [3]:
traindf

Unnamed: 0,ID,Tweet,Labels
0,1,NO JOKE I WILL HOP ON A PLANE RN! (Well after ...,0 10
1,2,BanMediaHouse whose is responsible for spreadi...,6
2,3,Im waiting for someone to say to me that all t...,3 4
3,4,He is a liar. Proven day night. Time again. Li...,6
4,5,"NEW: U.S. CoronaVirus death toll reaches 4,000...",8
...,...,...,...
4995,4996,Life Insurance? I wonder if policies are payin...,4 5 7
4996,4997,My cousin passed away from the corona virus to...,4 5
4997,4998,I guess Mother Nature really hates us. Yellows...,3 5 9 10
4998,4999,So question for the day that isnt related to c...,9 10


In [4]:
# applying the splitting in the labels column for further use
traindf['Labels'] = traindf['Labels'].apply(lambda x: [int(i) for i in x.split()] )

In [5]:
# defining the definition of each labels in the dictionary format
# this dict we are going to use for creating the extra columns for training over model
class_map = {
    "optimistic": 0,
    "thankful": 1,
    "empathetic": 2,
    "pessimistic": 3,
    "anxious": 4,
    "sad": 5,
    "annoyed": 6,
    "denial": 7,
    "surprise": 8,
    "official_report": 9,
    "joking": 10
}

In [6]:
# this function we are using for creating/ adding the columns and populating them on the basis of the labels
for k,v in class_map.items():
    traindf[k]=traindf['Labels'].apply(lambda x: 1 if v  in x else 0)
    traindf[k]=traindf[k].astype(float) # converting the numbers into float

In [7]:
traindf # new dataframe looks like this

Unnamed: 0,ID,Tweet,Labels,optimistic,thankful,empathetic,pessimistic,anxious,sad,annoyed,denial,surprise,official_report,joking
0,1,NO JOKE I WILL HOP ON A PLANE RN! (Well after ...,"[0, 10]",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,BanMediaHouse whose is responsible for spreadi...,[6],0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,3,Im waiting for someone to say to me that all t...,"[3, 4]",0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,He is a liar. Proven day night. Time again. Li...,[6],0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,5,"NEW: U.S. CoronaVirus death toll reaches 4,000...",[8],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,Life Insurance? I wonder if policies are payin...,"[4, 5, 7]",0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
4996,4997,My cousin passed away from the corona virus to...,"[4, 5]",0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4997,4998,I guess Mother Nature really hates us. Yellows...,"[3, 5, 9, 10]",0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
4998,4999,So question for the day that isnt related to c...,"[9, 10]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [8]:
1 in [1,2]

True

In [9]:
# defining a function for cleaning the tweets removing some specific words and punctuations
def  clean_text(text):
    text =  text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"\r", "", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    text = text.translate(str.maketrans('', '', string.punctuation)) 
    text = re.sub("(\\W)"," ",text) 
    text = re.sub('\S*\d\S*\s*','', text)
    
    return text
traindf['Tweet'] = traindf['Tweet'].apply(lambda x:clean_text(x))

In [10]:
# now using neattext for removing the stopwards from the tweets which are creating noise in the data

traindf['Tweet'].apply(lambda x:nt.TextFrame(x).noise_scan())
traindf['Tweet'].apply(lambda x:nt.TextExtractor(x).extract_stopwords())
traindf['Tweet'].apply(nfx.remove_stopwords)
traindf['Tweet']= traindf['Tweet'].apply(nfx.remove_stopwords)

In [11]:
traindf.isna().sum() #checking for null values in the data

ID                 0
Tweet              0
Labels             0
optimistic         0
thankful           0
empathetic         0
pessimistic        0
anxious            0
sad                0
annoyed            0
denial             0
surprise           0
official_report    0
joking             0
dtype: int64

In [12]:
# Dividing the data into input and output variables
X =  traindf.Tweet
y =  traindf.drop(['ID','Labels','Tweet'],axis = 1)

In [13]:
# splitting the data into train and validation 
X_train,X_valid, y_train,y_valid= train_test_split(X,y,test_size=0.2, random_state=1)

In [14]:
X_train

1233    today babys birthday corona virus celebrate co...
1056    coronavirus created schools stop april fools d...
1686    mr president lie seriousness coronavirus threa...
187     wild pandemic big april fools day prank aprilf...
3840    thailand confirms new coronavirus cases deaths...
                              ...                        
2895            let know keeps coronavirus away worth try
2763    tory lanes french battle infected corona messy...
905     tomorrow day millions contract coronavirus rea...
3980                wanna beat corona ass beach w friends
235     breaking tablighi jamat donates coronavirus pa...
Name: Tweet, Length: 4000, dtype: object

In [15]:
y_train

Unnamed: 0,optimistic,thankful,empathetic,pessimistic,anxious,sad,annoyed,denial,surprise,official_report,joking
1233,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1686,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
187,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3840,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2895,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2763,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
905,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3980,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [16]:
# defining the word vectorizier for converting the tweets into vectors
word_vectorizer = TfidfVectorizer(
    strip_accents='unicode',     
    analyzer='word',            
    token_pattern=r'\w{1,}',    
    ngram_range=(1, 3),         
    stop_words='english',
    sublinear_tf=True)

word_vectorizer.fit(X_train)    
train_word_features = word_vectorizer.transform(X_train)
X_train_transformed = word_vectorizer.transform(X_train)
X_valid_transformed = word_vectorizer.transform(X_valid)

In [17]:
#Saving vectorizer

pickle.dump(word_vectorizer, open("vectorizer.pkl", "wb"))

In [18]:

# Testing phase
tf1 = pickle.load(open("vectorizer.pkl", 'rb'))

# Create new tfidfVectorizer with old vocabulary

word_vectorizer1 = TfidfVectorizer(
    strip_accents='unicode',     
    analyzer='word',            
    token_pattern=r'\w{1,}',    
    ngram_range=(1, 3),         
    stop_words='english',
    sublinear_tf=True,
    vocabulary = tf1.vocabulary_)
X_valid_transformed = word_vectorizer1.fit_transform(X_valid)

In [19]:
print(X_valid_transformed)

  (0, 52149)	0.29072600101391444
  (0, 50525)	0.6218691546594884
  (0, 47791)	0.36728594052516356
  (0, 15422)	0.34664711953282834
  (0, 8876)	0.3206452801726044
  (0, 8701)	0.0943876144226659
  (0, 6327)	0.34664711953282834
  (0, 6106)	0.20446837480625968
  (1, 50786)	0.3579064045278051
  (1, 48878)	0.2896867468650635
  (1, 36457)	0.29551441584492866
  (1, 29381)	0.48327211945119364
  (1, 15371)	0.48327211945119364
  (1, 15370)	0.48327211945119364
  (2, 56651)	0.3079049134512218
  (2, 56631)	0.27832688835516683
  (2, 50810)	0.2906028779246404
  (2, 50786)	0.228031239697661
  (2, 49592)	0.23922686598940832
  (2, 39295)	0.3079049134512218
  (2, 39037)	0.2906028779246404
  (2, 39007)	0.26102485282858545
  (2, 34754)	0.3079049134512218
  (2, 34744)	0.27832688835516683
  (2, 34575)	0.16806236359399995
  :	:
  (998, 26172)	0.16944220890333828
  (998, 24169)	0.16944220890333828
  (998, 24168)	0.16944220890333828
  (998, 24167)	0.16944220890333828
  (998, 10451)	0.16944220890333828
  (998, 89

In [20]:
y_train

Unnamed: 0,optimistic,thankful,empathetic,pessimistic,anxious,sad,annoyed,denial,surprise,official_report,joking
1233,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1686,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
187,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3840,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2895,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2763,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
905,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3980,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [21]:
# defining the classifier here we are using SGDclassifier 
# Stochastic Gradient Descent (SGD) is a simple yet efficient optimization algorithm used to find the values of parameters/coefficients of  functions that minimize a cost function.
# OneVsRestClassifier is the heuristic method for using binary classification algorithms for multi-class classification

classifier = OneVsRestClassifier(SGDClassifier(random_state=0,loss='log',alpha=0.00001,penalty='elasticnet'))
classifier.fit(X_train_transformed, y_train.values)


y_train_pred_proba = classifier.predict_proba(X_train_transformed)
y_valid_pred_proba = classifier.predict_proba(X_valid_transformed)


roc_auc_score_train = roc_auc_score(y_train, y_train_pred_proba,average='weighted')
roc_auc_score_test = roc_auc_score(y_valid, y_valid_pred_proba,average='weighted')

print("ROC AUC Score Train:", roc_auc_score_train)
print("ROC AUC Score Test:", roc_auc_score_test)

y_train_pred_proba = classifier.predict_proba(X_train_transformed)
y_valid_pred_proba = classifier.predict_proba(X_valid_transformed)


roc_auc_score_train = roc_auc_score(y_train, y_train_pred_proba,average='weighted')
roc_auc_score_test = roc_auc_score(y_valid, y_valid_pred_proba,average='weighted')

print("ROC AUC Score Train:", roc_auc_score_train)
print("ROC AUC Score Test:", roc_auc_score_test)

ROC AUC Score Train: 0.9999559543477646
ROC AUC Score Test: 0.7249252332202116
ROC AUC Score Train: 0.9999559543477646
ROC AUC Score Test: 0.7249252332202116


In [22]:
# save the model to disk
pickle.dump(classifier, open("model.sav", 'wb'))

 
# load the model from disk
loaded_model = pickle.load(open("model.sav", 'rb'))

# Testing for converting it into flask model

In [23]:
y_valid_pred_proba[1]

array([0.08643883, 0.06702587, 0.00199414, 0.16072289, 0.01187621,
       0.00663967, 0.62903415, 0.44444518, 0.02572794, 0.62336515,
       0.09330877])

In [24]:
[k for k,v in dict(zip(class_map.keys(),  y_valid_pred_proba[1] )).items() if v >=0.5]


['annoyed', 'official_report']

In [25]:
loaded_model.predict_proba(X_valid_transformed[1])

array([[0.08643883, 0.06702587, 0.00199414, 0.16072289, 0.01187621,
        0.00663967, 0.62903415, 0.44444518, 0.02572794, 0.62336515,
        0.09330877]])

In [26]:
class_map.keys()

dict_keys(['optimistic', 'thankful', 'empathetic', 'pessimistic', 'anxious', 'sad', 'annoyed', 'denial', 'surprise', 'official_report', 'joking'])

In [27]:
y_valid_pred_proba[1]

array([0.08643883, 0.06702587, 0.00199414, 0.16072289, 0.01187621,
       0.00663967, 0.62903415, 0.44444518, 0.02572794, 0.62336515,
       0.09330877])

In [28]:
texts="Fact:- The words Corona and Dalgona rhymes perfectly. Maybe this is the reason behind the trend dalgonaCoffee"

In [29]:

# importing the vectorizer
tf1 = pickle.load(open("vectorizer.pkl", 'rb'))
# importing the model
loaded_model = pickle.load(open("model.sav", 'rb'))

# defining a function for cleaning the tweets removing some specific words and punctuations
def  clean_text(text):
    text =  text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"\r", "", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    text = text.translate(str.maketrans('', '', string.punctuation)) 
    text = re.sub("(\\W)"," ",text) 
    text = re.sub('\S*\d\S*\s*','', text)
    
    return text

# defining the definition of each labels in the dictionary format
# this dict we are going to use for creating the extra columns for training over model
class_map = {
    "optimistic": 0,
    "thankful": 1,
    "empathetic": 2,
    "pessimistic": 3,
    "anxious": 4,
    "sad": 5,
    "annoyed": 6,
    "denial": 7,
    "surprise": 8,
    "official_report": 9,
    "joking": 10}

word_vectorizer1 = TfidfVectorizer(
            strip_accents='unicode',     
            analyzer='word',            
            token_pattern=r'\w{1,}',    
            ngram_range=(1, 3),         
            stop_words='english',
            sublinear_tf=True,
            vocabulary = tf1.vocabulary_)


def submit(texts):
    texts = clean_text(texts)
    text_trans = word_vectorizer1.fit_transform([texts])
    y_pred = loaded_model.predict_proba(text_trans)
    predictions = [k for k,v in dict(zip(class_map.keys(),  y_pred[0] )).items() if v >=0.5]
    try:
        predictions = [k for k,v in dict(zip(class_map.keys(),  y_pred[0] )).items() if v >=0.5]
    except:
        predictions = "No labels are associated with it"

    return predictions

print(submit(texts))

['official_report', 'joking']
