# **BUILDING THE CLASSIFIER MODEL BY APPLYING NLP**
## Steps (from scraping the data to building the model)
*   Scrape reviews of Google Play Store apps
*   Text Preprocessing
*   Text clustering
*   Fine tune a pretrained XLNet model
*   Generate Predictions

# Scrape Google Playstore reviews using google-play-scraper and MongoDB







In [None]:
import pandas as pd

# for scraping app info and reviews from Google Play
from google_play_scraper import app, Sort, reviews

# for pretty printing data structures
from pprint import pprint

# for storing in MongoDB
import pymongo
from pymongo import MongoClient

# for keeping track of timing
import datetime as dt
from tzlocal import get_localzone

# for building in wait times
import random
import time

In [None]:
# Connect to the MongoDB server running on this machine (default port)
client = MongoClient('localhost', 27017)

# Project database, and inside it the collection that receives the scraped app reviews
app_proj_db = client['app_proj_db']
review_collection = app_proj_db['review_collection']

In [None]:
# Apps to scrape, as (short display name, Google Play package id) pairs.
# The two parallel lists below are derived from this single table so that a
# name can never drift out of step with its id.
_APPS = [
    ('SBI', 'com.freedomrewardz'),
    ('UNION', 'com.unionrewardz'),
    ('CBOI', 'com.centrewardz'),
    ('BOI', 'com.boistarrewardz'),
    ('CORP', 'com.corprewardz'),
    ('IOB', 'com.iobrewardz'),
    ('LVB', 'com.lvbrewardz'),
    ('KVB', 'com.kvbrewardz'),
]
app_names = [name for name, _ in _APPS]
app_ids = [package_id for _, package_id in _APPS]

In [None]:
# Loop through app_ids and app_names to scrape every review of each app.
# Reviews are fetched newest-first in batches of `count`, tagged with the app
# name/id, and written to `review_collection` every 100 batches and once more
# at the end of each app.
for app_name, app_id in zip(app_names, app_ids):

    # Get the starting time
    start = dt.datetime.now(tz=get_localzone())
    fmt = "%m/%d/%y - %T %p"
    print('-------------------------------------------------------------------------')
    print(f'{app_name} started at {start.strftime(fmt)}')
    print()

    # Reviews scraped for this app that have not been written to MongoDB yet
    app_reviews = []

    # IDs of every review seen so far for this app. Using a set makes the
    # "did this batch add anything new?" check proportional to the batch
    # size, instead of rebuilding sets from an ever-growing list per batch.
    seen_review_ids = set()

    # Number of reviews to scrape per batch
    count = 200

    # To keep track of how many batches have been completed
    batch_num = 0

    # Retrieve the first batch of reviews (and the continuation token)
    rvws, token = reviews(
        app_id,            # found in app's url
        lang='en',         # defaults to 'en'
        country='us',      # defaults to 'us'
        sort=Sort.NEWEST,  # start with most recent
        count=count        # batch size
    )

    # Tag each review with the app it belongs to
    for r in rvws:
        r['app_name'] = app_name
        r['app_id'] = app_id

    app_reviews.extend(rvws)
    seen_review_ids.update(r['reviewId'] for r in rvws)

    batch_num += 1
    print(f'Batch {batch_num} completed.')

    # Wait 1 to 5 seconds to start next batch
    time.sleep(random.randint(1, 5))

    # Loop through at most max number of further batches
    for batch in range(4999):
        rvws, token = reviews(
            app_id,
            lang='en',
            country='us',
            sort=Sort.NEWEST,
            count=count,
            # using token obtained from previous batch
            continuation_token=token
        )

        # Tag each review with the app it belongs to
        for r in rvws:
            r['app_name'] = app_name
            r['app_id'] = app_id

        # Add the list of review dicts to app_reviews list
        app_reviews.extend(rvws)

        # Increase batch count by one
        batch_num += 1

        # Break loop and stop scraping for current app if most recent batch
        # did not add any unique reviews
        num_seen_before = len(seen_review_ids)
        seen_review_ids.update(r['reviewId'] for r in rvws)
        if len(seen_review_ids) == num_seen_before:
            print(f'No reviews left to scrape. Completed {batch_num} batches.\n')
            break

        # At every 100th batch
        if batch_num % 100 == 0:

            # print update on number of batches
            print(f'Batch {batch_num} completed.')

            # insert reviews into collection (never empty here: this batch
            # contributed at least one new review, otherwise we would have
            # left the loop above)
            review_collection.insert_many(app_reviews)

            # print update about num reviews inserted
            store_time = dt.datetime.now(tz=get_localzone())
            print(f"""
            Successfully inserted {len(app_reviews)} {app_name} 
            reviews into collection at {store_time.strftime(fmt)}.\n
            """)

            # empty our list for next round of 100 batches
            app_reviews = []

        # Wait 1 to 5 seconds to start next batch
        time.sleep(random.randint(1, 5))

    # Print update when max number of batches has been reached
    # OR when last batch didn't add any unique reviews
    print(f'Done scraping {app_name}.')
    print(f'Scraped a total of {len(seen_review_ids)} unique reviews.\n')

    # Insert remaining reviews into collection. insert_many() rejects an
    # empty list, which happens for an app without reviews or when the final
    # batch right after a periodic flush comes back empty.
    if app_reviews:
        review_collection.insert_many(app_reviews)

    # Get end time
    end = dt.datetime.now(tz=get_localzone())

    # Print ending output for app
    print(f"""
    Successfully inserted all {app_name} reviews into collection
    at {end.strftime(fmt)}.\n
    """)
    print(f'Time taken to scrape reviews for {app_name}: {end-start}')
    print('-----------------------------------------------------------------------')
    print('\n')

    # Wait 1 to 5 seconds to start scraping next app
    time.sleep(random.randint(1, 5))

In [None]:
# Pull every stored review document out of MongoDB and load them into a DataFrame
all_review_docs = list(review_collection.find({}))
app_reviews_df = pd.DataFrame(all_review_docs)

In [None]:
# Split the 'at' timestamp into separate 'date' and 'time' text columns

# Work from the string form of each timestamp
at_text = app_reviews_df['at'].astype(str)

# Characters 0-10 become 'date', characters 11-19 become 'time'
app_reviews_df['date'] = at_text.str[0:11]
app_reviews_df['time'] = at_text.str[11:20]

# The combined timestamp column is replaced by the two new columns
app_reviews_df.drop(columns=['at'], inplace=True)

In [None]:
# Parse the 'date' strings into timestamps so they can be compared
app_reviews_df['date'] = pd.to_datetime(app_reviews_df['date'])

# Keep reviews dated after start_date and on or before end_date
start_date = '01-01-2019'
end_date = '03-31-2021'
review_dates = app_reviews_df['date']
mask = (review_dates > start_date) & (review_dates <= end_date)
app_reviews_df = app_reviews_df.loc[mask]

# Drop the 'repliedAt' column
app_reviews_df = app_reviews_df.drop(columns=['repliedAt'])

app_reviews_df.head()

In [None]:
# you might get some duplicates. To remove duplicates:
# (rows sharing a 'reviewId' are collapsed to their first occurrence)
app_reviews_df = app_reviews_df.drop_duplicates(subset=['reviewId'])

In [None]:
# Write the final dataframe to CSV, with a header row and without the index column
app_reviews_df.to_csv('sbi_reviews_final.csv', index=None, header=True)

# Text Preprocessing

In [None]:
# Load the scraped reviews back from the CSV written above
sample=pd.read_csv("sbi_reviews_final.csv")

In [None]:
# Print the position and text of the first review that contains an HTML tag
import re

html_tag_pattern = '<.*?>'
for i, sent in enumerate(sample['content'].values):
    if re.findall(html_tag_pattern, sent):
        print(i)
        print(sent)
        break

In [None]:
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus
nltk.download('stopwords')

# English Snowball stemmer and English stop-word set used by the cleaning loop
sno = nltk.stem.SnowballStemmer('english')
stop = set(stopwords.words('english'))


def cleanhtml(sentence):
    """Return `sentence` with every HTML tag (`<...>`) replaced by one space."""
    return re.sub('<.*?>', ' ', sentence)
def cleanpunc(sentence):
    """
    Strip punctuation from `sentence`.

    The characters ? | ! ' " # are deleted; the characters . , ) ( / are
    each replaced by a single space.
    """
    without_marks = re.sub(r'[?|!\'"#]', '', sentence)
    return re.sub(r'[.|,)(/]', ' ', without_marks)

In [None]:
# Clean every review: strip HTML tags and punctuation, keep only alphabetic
# tokens longer than two characters that are not stop words, stem them, and
# collect the stems per review and per sentiment.
final_string = []        # one cleaned, space-joined byte string per review
all_positive_words = []  # stems from reviews whose 'Sentiment' is 'positive'
all_negative_words = []  # stems from reviews whose 'Sentiment' is 'negative'

# Read both columns once up front instead of re-reading the 'Sentiment'
# array for every single word. Missing review texts (NaN after the CSV round
# trip) are turned into empty strings so that cleanhtml() is never handed a
# float.
contents = sample['content'].fillna('').astype(str).values
sentiments = sample['Sentiment'].values

for sent, sentiment in zip(contents, sentiments):
    filtered_sentence = []
    sent = cleanhtml(sent)  # remove HTML tags
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            if not cleaned_words.isalpha() or len(cleaned_words) <= 2:
                continue
            lowered = cleaned_words.lower()
            if lowered in stop:
                continue
            # Stems are stored as UTF-8 bytes; the 'CleanedText' column
            # built from final_string is decoded back to str afterwards
            s = sno.stem(lowered).encode('utf8')
            filtered_sentence.append(s)
            if sentiment == 'positive':
                all_positive_words.append(s)
            elif sentiment == 'negative':
                all_negative_words.append(s)

    # final string of cleaned words for this review
    final_string.append(b" ".join(filtered_sentence))

In [None]:
# Add the pre-processed review text as a 'CleanedText' column, decoding each
# UTF-8 byte string produced by the cleaning loop back into a regular str
sample['CleanedText'] = [cleaned.decode("utf-8") for cleaned in final_string]

# Text Clustering

Text clustering is done using TF-IDF features and k-means clustering, and the final clusters are then refined using an n-gram method: build the n-grams for each cluster and keep only the words that belong to that particular category, i.e. the words that are most frequent in that cluster.

Any other method can be used for text clustering depending on different datasets and any desired number of categories can be formed for that dataset.

Here, seven categories were made after clustering and then stored into a csv called 'clustered_data.csv' which is provided in the repository for direct use. It shows the format in which the clustered data was made by doing some data wrangling.


# Building the Classifier Model
A pretrained **XLNet model** is used for classification and is fine-tuned by training it on the clustered data. After preprocessing the data and training the model, a function is defined that returns predictions for new data together with the probability of each label.

In [None]:
# import libraries
import numpy as np
import pandas as pd
import os
import math
import torch
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, XLNetTokenizer, XLNetModel, XLNetLMHeadModel, XLNetConfig
from transformers import XLNetTokenizerFast
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tqdm import tqdm, trange
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split

## Import Dataset

In [None]:
# Load the clustered reviews: one row per review, one 0/1 column per category
df = pd.read_csv('clustered_data.csv')
df.head()

Unnamed: 0,reviewId,content,Problem in recharge,Problem in reward/redeem points,Problem in registration/login/username/password,Problem with customer care service,Other complaints,Bad comments,Appreciation
0,gp:AOqpTOGke5oHGTtgbNCNSurTU8c4h9j0aJgkGSKqvlO...,"for mobile recharge, this is very excellent app.",0,0,0,0,0,0,1
1,gp:AOqpTOFGUaCTHYLrzFzaAusdm-wc1Rm9cUyHjgzzuwY...,awesome app for recharge and collect point any...,0,0,0,0,0,0,1
2,gp:AOqpTOFldCmXCDu4Ji5GGBZheX1k057zvjenXeydSCg...,smooth and trouble free recharge.,0,0,0,0,0,0,1
3,gp:AOqpTOHZne9EjZEQd8AWRd4rKr-Jmzk_nCQPb2wOqJp...,very nice app for revard and to use it to rech...,0,0,0,0,0,0,1
4,gp:AOqpTOHlQooz_z4BGtqsPAlx99zdOvJOt6Nj3OBCkEA...,good for mobile recharge,0,0,0,0,0,0,1


In [None]:
# Split the data: hold out 5% of the rows as the test set.
# A fixed random_state makes the split reproducible between runs, matching
# the train/validation split that also uses random_state=42.
train, test = train_test_split(df, test_size=0.05, random_state=42)
train.shape, test.shape

((14747, 9), (777, 9))

In [None]:
# Index the training rows by their review id
train = train.set_index('reviewId')
train.head()

Unnamed: 0_level_0,content,Problem in recharge,Problem in reward/redeem points,Problem in registration/login/username/password,Problem with customer care service,Other complaints,Bad comments,Appreciation
reviewId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
gp:AOqpTOH8jCaAwdk9-qE_MmxJ97tAXE30k37mUBiYpR7cmyyDyw0HMuUrD-URMKP3q-9kr3LXDshnFqO-hgDFRyA,"waste of using this app , could not activate m...",0,0,0,0,0,1,0
gp:AOqpTOEjgWcte_Y0Afg-qIeTJ-rUJnc_r__g3_6OjfGBqcGCT-1xrVtGzQV7YnxxsY8gel4ZJvGAKR3buKSrCU0,good,0,0,0,0,0,0,1
gp:AOqpTOGniFIBHNn4xutfeb3zYgfS1_nomysuhv6iXaa0LLb_qq9bGJdWUQrn4J1xlSJY050ED2c9GirVxiOSROE,not good,0,0,0,0,0,1,0
gp:AOqpTOGQoTMiP_dpiv7tnjkvRfkqQDeR5_Ffcz7vqNMlz59LaZ0AJjltwYf8TfFnBvW2iAutxdozt7F8u3Va7UU,mobile recharge is not done while reward point...,1,1,0,0,0,0,0
gp:AOqpTOHJsvpGEYcklDS7kYRCSjFeO7dWTCRLSvFHxOVsUUgcvzanxIvIpdRJGEsKw8axauDb_2y2s8BpnhiziR0,"worst experience, terrible performance by the ...",0,0,0,0,0,1,0


In [None]:
# Keep only the review id and text of the held-out rows, indexed by review id
test = test[['reviewId', 'content']].set_index('reviewId')
test.head()

Unnamed: 0_level_0,content
reviewId,Unnamed: 1_level_1
gp:AOqpTOGTADrOKHFXYgD5jwLGK1Ult0gcnjTCj3fqrKk4AEjJiPK90jg-o56Vf6X_Mtfbcibv6dEw3b5IqXR5mz0,good boy
gp:AOqpTOHG20z1AXXSPa5cRQQXNS3Mri57rOo9JNu0MZBZxnx8wgNl6oecPTvUEDZBJ1ix3ovWeYvsMgNUlNfdMJc,very nice
gp:AOqpTOEjMUAO5sa_j77QG0hp75avoD2FwOjevIlNWcXhLDyl6RZHpKyuO15V_kambTEUfczaPvSeQKTdYwfwVNI,bad app
gp:AOqpTOHxMcsJGLebsXniRuZk71y546Y-tCV0ME-gaZby9COr_vTk1c6Axe-T2jeejxwKsEaAHbbs7onwe3EQLPU,worst app
gp:AOqpTOFGo_lltI1X7gImeKrUe7tnKJT0Nst6IIujSWrg9exW6eF5BA-b50g5XRXnVJBlbuyTSNhoOyonPZ-4Yzo,i have successfully registered but when i try ...


In [None]:
# (rows, columns) of the training frame
train.shape

(14747, 8)

In [None]:
# (rows, columns) of the test frame
test.shape

(777, 1)

# Preprocess Data

In [None]:
# Load the fast XLNet tokenizer for the 'xlnet-base-cased' checkpoint.
# NOTE(review): do_lower_case=True is passed together with a *cased*
# checkpoint — confirm that lower-casing the input is intended.
tokenizer = XLNetTokenizerFast.from_pretrained('xlnet-base-cased', do_lower_case=True)

In [None]:
# Raw review texts of both splits as arrays, ready for tokenization
train_text_list = train["content"].values
test_text_list = test["content"].values

In [None]:
def tokenize_inputs(text_list, tokenizer, num_embeddings=512):
    """
    Convert raw texts into a fixed-width matrix of token ids.

    Each text is tokenized, truncated to ``num_embeddings - 2`` tokens so
    that the special tokens added by
    ``tokenizer.build_inputs_with_special_tokens`` still fit, converted to
    ids, and right-padded with 0 up to ``num_embeddings``.

    Padding is done with NumPy so that the function does not depend on
    ``keras.preprocessing.sequence.pad_sequences``, an import path that is
    not available in every Keras release.

    Args:
        text_list: iterable of strings to tokenize.
        tokenizer: object providing ``tokenize``, ``convert_tokens_to_ids``
            and ``build_inputs_with_special_tokens``.
        num_embeddings: width of every output row (maximum sequence length
            including special tokens).

    Returns:
        numpy.ndarray of shape ``(len(text_list), num_embeddings)`` holding
        integer token ids, zero-padded at the end of each row.
    """
    texts = list(text_list)
    input_ids = np.zeros((len(texts), num_embeddings), dtype=np.int64)
    for row, text in enumerate(texts):
        # leave room for the special tokens appended below
        tokens = tokenizer.tokenize(text)[:num_embeddings - 2]
        ids = tokenizer.convert_tokens_to_ids(tokens)
        # let the tokenizer add its own special tokens around the sequence
        ids = tokenizer.build_inputs_with_special_tokens(ids)
        # truncate at the end if still too long, then pad at the end with 0
        ids = ids[:num_embeddings]
        input_ids[row, :len(ids)] = ids
    return input_ids

def create_attn_masks(input_ids):
    """
    Build one attention mask per sequence of token ids.

    Every id greater than 0 is marked 1.0 (attend) and every other id 0.0
    (ignore), so the zero padding added during tokenization is masked out.
    Returns a list of lists of floats with the same layout as `input_ids`.
    """
    return [[float(token_id > 0) for token_id in seq] for seq in input_ids]

In [None]:
# Token-id matrix for the training texts, every row padded/truncated to 250 ids
train_input_ids = tokenize_inputs(train_text_list, tokenizer, num_embeddings=250)
train_input_ids

array([[3419,   20,  381, ...,    0,    0,    0],
       [ 195,    4,    3, ...,    0,    0,    0],
       [  50,  195,    4, ...,    0,    0,    0],
       ...,
       [  36,   26,   23, ...,    0,    0,    0],
       [  36,  172, 2101, ...,    0,    0,    0],
       [  50,  195,    4, ...,    0,    0,    0]])

In [None]:
# Token-id matrix for the test texts, every row padded/truncated to 250 ids
test_input_ids = tokenize_inputs(test_text_list, tokenizer, num_embeddings=250)
test_input_ids

array([[ 195, 2001,    4, ...,    0,    0,    0],
       [ 172, 2101,    4, ...,    0,    0,    0],
       [ 948, 5523,    4, ...,    0,    0,    0],
       ...,
       [  17,  150,   17, ...,    0,    0,    0],
       [ 312, 5523,   21, ...,    0,    0,    0],
       [ 195,    4,    3, ...,    0,    0,    0]])

In [None]:
# Attention masks for the training ids (1.0 where the id is > 0, else 0.0)
train_attention_masks = create_attn_masks(train_input_ids)
# train_attention_masks

In [None]:
# Attention masks for the test ids (1.0 where the id is > 0, else 0.0)
test_attention_masks = create_attn_masks(test_input_ids)
# test_attention_masks

In [None]:
# Store the token ids ('features') and attention masks ('masks') next to each
# review, one Python list per row, so later splits keep them aligned
train["features"] = train_input_ids.tolist()
train["masks"] = train_attention_masks

test["features"] = test_input_ids.tolist()
test["masks"] = test_attention_masks

In [None]:
# Preview the training frame with the new 'features' and 'masks' columns
train.head()

Unnamed: 0_level_0,content,Problem in recharge,Problem in reward/redeem points,Problem in registration/login/username/password,Problem with customer care service,Other complaints,Bad comments,Appreciation,features,masks
reviewId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
gp:AOqpTOH8jCaAwdk9-qE_MmxJ97tAXE30k37mUBiYpR7cmyyDyw0HMuUrD-URMKP3q-9kr3LXDshnFqO-hgDFRyA,"waste of using this app , could not activate m...",0,0,0,0,0,1,0,"[3419, 20, 381, 52, 5523, 17, 19, 121, 50, 177...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
gp:AOqpTOEjgWcte_Y0Afg-qIeTJ-rUJnc_r__g3_6OjfGBqcGCT-1xrVtGzQV7YnxxsY8gel4ZJvGAKR3buKSrCU0,good,0,0,0,0,0,0,1,"[195, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
gp:AOqpTOGniFIBHNn4xutfeb3zYgfS1_nomysuhv6iXaa0LLb_qq9bGJdWUQrn4J1xlSJY050ED2c9GirVxiOSROE,not good,0,0,0,0,0,1,0,"[50, 195, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
gp:AOqpTOGQoTMiP_dpiv7tnjkvRfkqQDeR5_Ffcz7vqNMlz59LaZ0AJjltwYf8TfFnBvW2iAutxdozt7F8u3Va7UU,mobile recharge is not done while reward point...,1,1,0,0,0,0,0,"[2487, 23140, 27, 50, 588, 171, 8614, 424, 186...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
gp:AOqpTOHJsvpGEYcklDS7kYRCSjFeO7dWTCRLSvFHxOVsUUgcvzanxIvIpdRJGEsKw8axauDb_2y2s8BpnhiziR0,"worst experience, terrible performance by the ...",0,0,0,0,0,1,0,"[2598, 656, 19, 6518, 922, 37, 18, 5523, 9, 4,...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."


In [None]:
# Preview the test frame with the new 'features' and 'masks' columns
test.head()

Unnamed: 0_level_0,content,features,masks
reviewId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
gp:AOqpTOGTADrOKHFXYgD5jwLGK1Ult0gcnjTCj3fqrKk4AEjJiPK90jg-o56Vf6X_Mtfbcibv6dEw3b5IqXR5mz0,good boy,"[195, 2001, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
gp:AOqpTOHG20z1AXXSPa5cRQQXNS3Mri57rOo9JNu0MZBZxnx8wgNl6oecPTvUEDZBJ1ix3ovWeYvsMgNUlNfdMJc,very nice,"[172, 2101, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
gp:AOqpTOEjMUAO5sa_j77QG0hp75avoD2FwOjevIlNWcXhLDyl6RZHpKyuO15V_kambTEUfczaPvSeQKTdYwfwVNI,bad app,"[948, 5523, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
gp:AOqpTOHxMcsJGLebsXniRuZk71y546Y-tCV0ME-gaZby9COr_vTk1c6Axe-T2jeejxwKsEaAHbbs7onwe3EQLPU,worst app,"[2598, 5523, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
gp:AOqpTOFGo_lltI1X7gImeKrUe7tnKJT0Nst6IIujSWrg9exW6eF5BA-b50g5XRXnVJBlbuyTSNhoOyonPZ-4Yzo,i have successfully registered but when i try ...,"[17, 150, 47, 3918, 2815, 57, 90, 17, 150, 714...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."


# Train, Valid Split

In [None]:
# Split into train and valid: 20% of the training rows become the
# validation set; random_state=42 fixes the shuffle
train, valid = train_test_split(train, test_size=0.2, random_state=42)

In [None]:
# Names of the seven 0/1 target columns, in the order the model predicts them
label_cols = [
    'Problem in recharge',
    'Problem in reward/redeem points',
    'Problem in registration/login/username/password',
    'Problem with customer care service',
    'Other complaints',
    'Bad comments',
    'Appreciation',
]

# Training split: token ids, attention masks and label rows as plain lists
X_train = train["features"].values.tolist()
train_masks = train["masks"].values.tolist()
Y_train = train[label_cols].values.tolist()

# Validation split: same three pieces
X_valid = valid["features"].values.tolist()
valid_masks = valid["masks"].values.tolist()
Y_valid = valid[label_cols].values.tolist()

# Create Dataloaders

In [None]:
# Turn the Python lists into torch tensors, the datatype the model and the
# dataloaders work with: ids keep the inferred integer dtype, labels become
# float32 and attention masks become long.
X_train, X_valid = (torch.tensor(ids) for ids in (X_train, X_valid))

Y_train, Y_valid = (
    torch.tensor(rows, dtype=torch.float32) for rows in (Y_train, Y_valid)
)

train_masks, valid_masks = (
    torch.tensor(rows, dtype=torch.long) for rows in (train_masks, valid_masks)
)

In [None]:
# Number of reviews per batch, for both training and validation
batch_size = 16

# Training batches are drawn in random order
train_data = TensorDataset(X_train, train_masks, Y_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=batch_size,
)

# Validation batches are read in their stored order
validation_data = TensorDataset(X_valid, valid_masks, Y_valid)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(
    validation_data,
    sampler=validation_sampler,
    batch_size=batch_size,
)

In [None]:
def train(model, num_epochs,
          optimizer,
          train_dataloader, valid_dataloader,
          model_save_path,
          train_loss_set=None, valid_loss_set=None,
          lowest_eval_loss=None, start_epoch=0,
          device="cpu"
          ):
    """
    Train the model and save a checkpoint whenever the validation loss improves.

    Args:
        model: module called as ``model(input_ids, attention_mask=..., labels=...)``
            and returning a scalar loss tensor.
        num_epochs: number of epochs to run in this call.
        optimizer: optimizer already bound to the model's parameters.
        train_dataloader: yields ``(input_ids, attention_mask, labels)`` batches.
        valid_dataloader: same layout, used for the validation loss.
        model_save_path: where ``save_model`` writes the best checkpoint.
        train_loss_set: per-epoch training losses from an earlier run to
            extend; a fresh list is created when omitted.
        valid_loss_set: per-epoch validation losses from an earlier run to
            extend; a fresh list is created when omitted.
        lowest_eval_loss: best validation loss so far, or None when starting
            fresh (the first epoch is then always saved).
        start_epoch: offset added to the epoch number stored in checkpoints.
        device: device the model and every batch are moved to.

    Returns:
        Tuple ``(model, train_loss_set, valid_loss_set)``. The model is in
        its state after the last epoch, which is not necessarily the saved
        best one.

    NOTE: in the notebook this definition rebinds the name ``train``, which
    earlier cells use for the training DataFrame.
    """
    # Fresh lists per call: mutable default arguments would be shared by
    # every call that omits them and silently accumulate loss history.
    if train_loss_set is None:
        train_loss_set = []
    if valid_loss_set is None:
        valid_loss_set = []

    model.to(device)

    # trange is a tqdm wrapper around the normal python range
    for i in trange(num_epochs, desc="Epoch"):
        # epoch number when continuing training from a saved model
        actual_epoch = start_epoch + i

        # ---- Training ----
        model.train()

        tr_loss = 0
        num_train_samples = 0

        for batch in train_dataloader:
            # Move the batch to the target device and unpack it
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
            # Clear out the gradients (by default they accumulate)
            optimizer.zero_grad()
            # Forward pass
            loss = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            tr_loss += loss.item()
            num_train_samples += b_labels.size(0)
            # Backward pass and parameter update
            loss.backward()
            optimizer.step()

        # NOTE(review): this sums one loss value per batch and divides by the
        # number of samples; if the model returns a batch-mean loss the
        # reported figure is roughly mean loss / batch size. Kept as is so
        # the values stay comparable with losses stored in checkpoints.
        epoch_train_loss = tr_loss / num_train_samples
        train_loss_set.append(epoch_train_loss)

        print("Train loss: {}".format(epoch_train_loss))

        # ---- Validation ----
        model.eval()

        eval_loss = 0
        num_eval_samples = 0

        for batch in valid_dataloader:
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
            # No gradients needed for validation: saves memory and time
            with torch.no_grad():
                loss = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
                eval_loss += loss.item()
                num_eval_samples += b_labels.size(0)

        epoch_eval_loss = eval_loss / num_eval_samples
        valid_loss_set.append(epoch_eval_loss)

        print("Valid loss: {}".format(epoch_eval_loss))

        # Save on the first evaluated epoch and on every later improvement
        if lowest_eval_loss is None or epoch_eval_loss < lowest_eval_loss:
            lowest_eval_loss = epoch_eval_loss
            save_model(model, model_save_path, actual_epoch,
                       lowest_eval_loss, train_loss_set, valid_loss_set)
        print("\n")

    return model, train_loss_set, valid_loss_set


def save_model(model, save_path, epochs, lowest_eval_loss, train_loss_hist, valid_loss_hist):
    """
    Write a training checkpoint to `save_path`.

    The checkpoint is a dict holding the epoch number, the lowest validation
    loss, the model's state_dict and both loss histories.
    """
    # If the model is wrapped in an object exposing `.module`, save the
    # wrapped module's weights
    model_to_save = getattr(model, 'module', model)

    checkpoint = {
        'epochs': epochs,
        'lowest_eval_loss': lowest_eval_loss,
        'state_dict': model_to_save.state_dict(),
        'train_loss_hist': train_loss_hist,
        'valid_loss_hist': valid_loss_hist,
    }
    torch.save(checkpoint, save_path)

    message = "Saving model at epoch {} with validation loss of {}"
    print(message.format(epochs, lowest_eval_loss))
  
def load_model(save_path):
    """
    Rebuild the classifier from a file written during training.

    Two file layouts are accepted:

    * the checkpoint dict written by ``save_model`` (keys ``state_dict``,
      ``epochs``, ``lowest_eval_loss``, ``train_loss_hist``,
      ``valid_loss_hist``);
    * a bare ``state_dict``, as written by
      ``torch.save(model.state_dict(), path)``. No training metadata exists
      in that case, so epoch 0, no best loss and empty histories are returned.

    Returns:
        Tuple ``(model, epochs, lowest_eval_loss, train_loss_hist,
        valid_loss_hist)``. The model is returned on the CPU.
    """
    # map_location="cpu" lets a checkpoint saved from a GPU run be opened on
    # a machine without CUDA; train() and generate_predictions() move the
    # model to their target device themselves.
    checkpoint = torch.load(save_path, map_location="cpu")

    if "state_dict" in checkpoint:
        model_state_dict = checkpoint["state_dict"]
        epochs = checkpoint["epochs"]
        lowest_eval_loss = checkpoint["lowest_eval_loss"]
        train_loss_hist = checkpoint["train_loss_hist"]
        valid_loss_hist = checkpoint["valid_loss_hist"]
    else:
        model_state_dict = checkpoint
        epochs = 0
        lowest_eval_loss = None
        train_loss_hist = []
        valid_loss_hist = []

    # The number of labels is the number of rows of the classifier weight
    num_labels = model_state_dict["classifier.weight"].size()[0]
    model = XLNetForMultiLabelSequenceClassification(num_labels=num_labels)
    model.load_state_dict(model_state_dict)

    return model, epochs, lowest_eval_loss, train_loss_hist, valid_loss_hist

# Train Model from Scratch

In [None]:
# train model from scratch

# Release unused GPU memory cached by PyTorch before the model is built
torch.cuda.empty_cache()

In [None]:
#config = XLNetConfig()
        
class XLNetForMultiLabelSequenceClassification(torch.nn.Module):
    """
    Pretrained 'xlnet-base-cased' encoder followed by a linear layer that
    outputs one logit per label (multi-label classification).
    """

    def __init__(self, num_labels=2):
        super().__init__()
        self.num_labels = num_labels
        self.xlnet = XLNetModel.from_pretrained('xlnet-base-cased')
        # 768 input features -> one logit per label
        self.classifier = torch.nn.Linear(768, num_labels)
        torch.nn.init.xavier_normal_(self.classifier.weight)

    def forward(self, input_ids, token_type_ids=None,
                attention_mask=None, labels=None):
        """
        Return the BCE-with-logits loss when `labels` is given, otherwise
        the raw logits.
        """
        encoder_outputs = self.xlnet(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Average the encoder's output vectors over the sequence, then classify
        pooled = self.pool_hidden_state(encoder_outputs)
        logits = self.classifier(pooled)

        if labels is None:
            return logits

        flat_shape = (-1, self.num_labels)
        criterion = BCEWithLogitsLoss()
        return criterion(logits.view(flat_shape), labels.view(flat_shape))

    def _set_xlnet_trainable(self, trainable):
        """Switch gradient tracking on or off for every XLNet parameter."""
        for param in self.xlnet.parameters():
            param.requires_grad = trainable

    def freeze_xlnet_decoder(self):
        """
        Freeze XLNet weight parameters. They will not be updated during training.
        """
        self._set_xlnet_trainable(False)

    def unfreeze_xlnet_decoder(self):
        """
        Unfreeze XLNet weight parameters. They will be updated during training.
        """
        self._set_xlnet_trainable(True)

    def pool_hidden_state(self, last_hidden_state):
        """
        Take the first element of the encoder output and average it over
        dimension 1, giving a single mean vector per input.
        """
        return last_hidden_state[0].mean(dim=1)
    
# One output logit per label column: the label count is the length of a single label row
model = XLNetForMultiLabelSequenceClassification(num_labels=len(Y_train[0]))

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetModel: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# AdamW over every model parameter with a 2e-5 learning rate and 0.01 weight
# decay; correct_bias=False switches off Adam's bias correction
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01, correct_bias=False)

In [None]:
# Mount Google Drive in Colab so that checkpoints can be saved under /content/gdrive
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Fine-tune for three epochs on the GPU; the checkpoint with the lowest
# validation loss is written to Google Drive
num_epochs = 3

model_save_name = 'classifier_model1.pt'
model_save_path = f"/content/gdrive/My Drive/{model_save_name}"
model, train_loss_set, valid_loss_set = train(
    model=model,
    num_epochs=num_epochs,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    valid_dataloader=validation_dataloader,
    model_save_path=model_save_path,
    device="cuda",
)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Train loss: 0.0054599517184351146
Valid loss: 0.003236652824167415


Epoch:  33%|███▎      | 1/3 [15:20<30:41, 920.52s/it]

Saving model at epoch 0 with validation loss of 0.003236652824167415


Train loss: 0.0022549303461320367
Valid loss: 0.0028260064589883736


Epoch:  67%|██████▋   | 2/3 [30:38<15:19, 919.73s/it]

Saving model at epoch 1 with validation loss of 0.0028260064589883736


Train loss: 0.0014024759660813371


Epoch: 100%|██████████| 3/3 [45:54<00:00, 918.25s/it]

Valid loss: 0.003301545290242618







# Train Model From Previous Checkpoint

In [None]:
# train model from previous checkpoint
# Reload the saved model together with its epoch number, best validation
# loss and both loss histories
model_save_name = 'classifier_model1.pt'
model_save_path = F"/content/gdrive/My Drive/{model_save_name}"
model, start_epoch, lowest_eval_loss, train_loss_hist, valid_loss_hist = load_model(model_save_path)

In [None]:
# New optimizer bound to the parameters of the reloaded model (same settings as before)
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01, correct_bias=False)

In [None]:
# Continue training for three more epochs, carrying over the loss histories,
# the best validation loss and the epoch offset from the checkpoint
num_epochs = 3
model, train_loss_set, valid_loss_set = train(
    model=model,
    num_epochs=num_epochs,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    valid_dataloader=validation_dataloader,
    model_save_path=model_save_path,
    train_loss_set=train_loss_hist,
    valid_loss_set=valid_loss_hist,
    lowest_eval_loss=lowest_eval_loss,
    start_epoch=start_epoch,
    device="cuda",
)

In [None]:
# save the model
# NOTE(review): this writes the bare state_dict of the model as it is after
# the last epoch to model_save_path, replacing the dict-style checkpoint
# (weights plus epoch and loss history) that save_model wrote to the same
# path — confirm the overwrite is intended or use a separate file name.
torch. save(model. state_dict(), model_save_path)

# Get Predictions

In [None]:
# get predictions
def generate_predictions(model, df, num_labels, device="cpu", batch_size=32):
    """Return sigmoid probabilities for every row of ``df``.

    The frame is walked in consecutive slices of ``batch_size`` rows. Each
    slice's "features" and "masks" columns are turned into tensors, moved to
    ``device`` and passed to ``model``; the sigmoid of the returned logits is
    collected and stacked into one array.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
            Its output must be a tensor with ``num_labels`` columns, since it
            is stacked into an array of that width.
        df: DataFrame with a "features" and a "masks" column whose cells are
            equal-length sequences of numbers.
        num_labels: Number of columns in the returned array.
        device: Device the model and the batch tensors are moved to.
        batch_size: Number of rows sent to the model per forward pass.

    Returns:
        A float64 NumPy array of shape ``(len(df), num_labels)``. An empty
        ``df`` yields an array of shape ``(0, num_labels)``.
    """
    num_iter = math.ceil(df.shape[0] / batch_size)

    model.to(device)
    model.eval()

    # Seed the list with an empty float64 array: it fixes the result dtype and
    # keeps the (0, num_labels) shape when df has no rows. Stacking once at
    # the end avoids re-copying the accumulated array on every batch.
    batches = [np.empty((0, num_labels))]

    # Inference only, so gradient tracking is disabled for the whole loop.
    with torch.no_grad():
        for i in range(num_iter):
            df_subset = df.iloc[i * batch_size:(i + 1) * batch_size, :]
            X = torch.tensor(df_subset["features"].values.tolist()).to(device)
            masks = torch.tensor(
                df_subset["masks"].values.tolist(), dtype=torch.long
            ).to(device)

            logits = model(input_ids=X, attention_mask=masks)
            batches.append(logits.sigmoid().detach().cpu().numpy())

    return np.vstack(batches)

In [None]:
# Score the whole `test` frame on the GPU in batches of 32, then keep three
# decimals of each probability.
num_labels = len(label_cols)
pred_probs = np.round(
    generate_predictions(model, test, num_labels, device="cuda", batch_size=32),
    3,
)
# pred_probs

array([[3.26418785e-07, 7.21927762e-08, 5.00244113e-08, ...,
        4.55935492e-07, 7.14609314e-08, 1.00000000e+00],
       [3.10080992e-07, 6.26785166e-08, 5.39943521e-08, ...,
        3.66178853e-07, 5.96945142e-08, 1.00000000e+00],
       [5.12417091e-06, 8.35645210e-07, 1.46626360e-06, ...,
        6.24060294e-06, 9.99872923e-01, 8.27477925e-05],
       ...,
       [1.15612900e-04, 8.55474151e-04, 1.63673230e-05, ...,
        1.95052169e-04, 1.19782962e-05, 9.99762237e-01],
       [1.29337241e-06, 5.01019429e-07, 1.80744493e-07, ...,
        1.65683559e-06, 2.84675764e-07, 9.99999762e-01],
       [2.73788032e-07, 6.32528128e-08, 4.51179254e-08, ...,
        3.32330586e-07, 5.56356845e-08, 1.00000000e+00]])

In [None]:
label_cols = [
    'Problem in recharge',
    'Problem in reward/redeem points',
    'Problem in registration/login/username/password',
    'Problem with customer care service',
    'Other complaints',
    'Bad comments',
    'Appreciation',
]

# Column i of pred_probs is stored on `test` under the i-th label name.
for col_idx, label in enumerate(label_cols):
    test[label] = pred_probs[:, col_idx]

In [None]:
# Preview the first five rows, now including the per-label probability columns.
test.head(n=5)

Unnamed: 0_level_0,content,features,masks,Problem in recharge,Problem in reward/redeem points,Problem in registration/login/username/password,Problem with customer care service,Other complaints,Bad comments,Appreciation
reviewId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
gp:AOqpTOGTADrOKHFXYgD5jwLGK1Ult0gcnjTCj3fqrKk4AEjJiPK90jg-o56Vf6X_Mtfbcibv6dEw3b5IqXR5mz0,good boy,"[195, 2001, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3.264188e-07,7.219278e-08,5.002441e-08,4.08284e-09,4.559355e-07,7.146093e-08,1.0
gp:AOqpTOHG20z1AXXSPa5cRQQXNS3Mri57rOo9JNu0MZBZxnx8wgNl6oecPTvUEDZBJ1ix3ovWeYvsMgNUlNfdMJc,very nice,"[172, 2101, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3.10081e-07,6.267852e-08,5.399435e-08,4.083798e-09,3.661789e-07,5.969451e-08,1.0
gp:AOqpTOEjMUAO5sa_j77QG0hp75avoD2FwOjevIlNWcXhLDyl6RZHpKyuO15V_kambTEUfczaPvSeQKTdYwfwVNI,bad app,"[948, 5523, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5.124171e-06,8.356452e-07,1.466264e-06,1.589592e-07,6.240603e-06,0.9998729,8.3e-05
gp:AOqpTOHxMcsJGLebsXniRuZk71y546Y-tCV0ME-gaZby9COr_vTk1c6Axe-T2jeejxwKsEaAHbbs7onwe3EQLPU,worst app,"[2598, 5523, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3.317411e-06,1.43077e-06,1.063038e-06,5.784522e-08,1.083658e-06,0.9999573,2.8e-05
gp:AOqpTOFGo_lltI1X7gImeKrUe7tnKJT0Nst6IIujSWrg9exW6eF5BA-b50g5XRXnVJBlbuyTSNhoOyonPZ-4Yzo,i have successfully registered but when i try ...,"[17, 150, 47, 3918, 2815, 57, 90, 17, 150, 714...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",0.001100581,0.9970652,0.9960038,0.0003026004,0.8967006,0.0001224051,0.000364


In [None]:
# test.to_csv('xlnet_classifier.csv')

# Get results for a single comment

In [None]:
# Free-text review to classify in the cells below.
comment = (
    "recharge was bad and the customer care did not respond "
    "and the app was not working"
)

In [None]:
# Wrap the single comment in a one-row DataFrame, then tokenize it, build its
# attention mask and store both as the columns generate_predictions reads.
data = [[comment]]
df1 = pd.DataFrame(data, columns=['content'])

df1_text_list = df1["content"].values
df1_input_ids = tokenize_inputs(df1_text_list, tokenizer, num_embeddings=250)
df1_attention_masks = create_attn_masks(df1_input_ids)
df1["features"] = df1_input_ids.tolist()
df1["masks"] = df1_attention_masks

# Predict on the GPU with a batch of one and keep three decimals.
num_labels = len(label_cols)
pred_probs = np.round(
    generate_predictions(model, df1, num_labels, device="cuda", batch_size=1),
    3,
)

# Flatten the 2-D result into a plain list holding one probability per label.
probsList = list(pred_probs.ravel())
probsList

[0.999, 0.001, 0.0, 1.0, 0.932, 0.0, 0.001]

In [None]:
# Minimum probability for a label to be reported for the comment.
THRESHOLD = 0.5

# Pair each label with its probability and keep those not below the threshold.
# Going through zip() instead of seven hand-indexed comparisons keeps this
# correct if the number of labels changes.
selected = [
    (label, prediction)
    for label, prediction in zip(label_cols, probsList)
    if not prediction < THRESHOLD
]

if not selected:
    # Report through print, like the per-label lines below, so this branch
    # does not depend on a Streamlit `st` object existing in the notebook.
    print("Your text does not belong to any category!")
else:
    for label, prediction in selected:
        print(f"{label}: {prediction}")

Problem in recharge: 0.999
Problem with customer care service: 1.0
Other complaints: 0.932
