In [17]:
# Importing pre-requisite Libraries.
# ------------------------------------------------------------------------------------------------------------------------------
import pandas as pd                                                     # Pandas offers data structures and operations for manipulating numerical tables and time series.
import numpy as np                                                      # NumPy offers support for large, multi-dimensional arrays and matrices, along with a large collection 
                                                                        # of high-level mathematical functions to operate on these arrays.
import pickle                                                           # Using Pickle to convert a Python object into a byte stream to store it in a file/database, 
                                                                        # maintain program state across sessions, making it easier to work with in deployments. 
from datetime import datetime

# Importing Sklearn Models and the required Libraries for ML operations.
# ------------------------------------------------------------------------------------------------------------------------------
from sklearn.linear_model import LogisticRegression                     # The logistic model is a statistical model that models the probability of one event taking place 
                                                                        # by having the log-odds for the event be a linear combination of one or more independent variables.                      
from sklearn.neighbors import KNeighborsClassifier                      # K-nearest neighbors is a supervised learning algorithm used for classification and regression tasks.
from sklearn import tree                                                # Importing the tree module from scikit-learn (sklearn) library for decision tree-based machine learning models.
from sklearn.tree import DecisionTreeClassifier                         # DecisionTreeClassifier is used for creating decision tree models, which are a type of supervised learning algorithm used for both classification and regression tasks.
from sklearn.ensemble import RandomForestClassifier                     # Random Forest Classifier is an ensemble learning method that combines multiple decision trees to create a more robust and accurate model.
from sklearn.svm import SVC                                             # The SVC class provides an implementation of the SVM algorithm for classification tasks. It can handle both linear and non-linear data by finding the optimal hyperplane or kernel function to separate classes.
from imblearn.over_sampling import SMOTE                                # The SMOTE function is used to generate synthetic examples of the minority class in imbalanced datasets. It creates synthetic examples by interpolating between instances of the minority class, effectively increasing the number of minority class samples in the dataset.

# Importing the metrics module for evaluating model performance.
# ------------------------------------------------------------------------------------------------------------------------------    
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, f1_score         
                                                                        # Create a custom scoring function using the make_scorer function for model evaluation during cross-validation.
                                                                        # Precision, recall, accuracy, and F1-score are metrics used for evaluating classification models.
                                                                        # These metrics provide insights into different aspects of model performance, such as the ability to correctly classify positive and negative samples, overall accuracy, and the trade-off between precision and recall.
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, r2_score, mean_squared_error
                                                                        # The mean_absolute_percentage_error, mean_absolute_error, r2_score, and mean_squared_error functions are commonly used metrics for evaluating regression models.
import statsmodels.api as sm                                            # The statsmodels.api module is a Python library for conducting statistical analysis, including linear regression, logistic regression, time series analysis, and more.
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, cross_val_score, KFold
                                                                        # The train_test_split function is commonly used to split the data into training and testing sets for model evaluation and validation.
                                                                        # The cross_val_score function is used for cross-validation, which is a technique for assessing the performance of a model by training and evaluating it on multiple subsets of data.
                                                                        # The RandomizedSearchCV function is used for hyperparameter tuning using randomized search, which is a technique for finding optimal hyperparameter values for a model by sampling from a distribution of possible values.
                                                                        # The KFold function is used to create K-fold cross-validation splits, where K is the number of folds. It can be used in conjunction with cross_val_score for performing K-fold cross-validation.
from sklearn.metrics import confusion_matrix, plot_confusion_matrix     # The confusion_matrix function is used to compute the confusion matrix, which is a table that describes the performance of a classification model by showing the counts of true positive, true negative, false positive, and false negative predictions.
                                                                        # The plot_confusion_matrix function is used to plot the confusion matrix for visualization purposes, making it easier to interpret and analyze the performance of a classification model.
from sklearn import metrics                                             # Importing error metrics here.

# Importing libraries for NLP operations.
# ------------------------------------------------------------------------------------------------------------------------------
import spacy                                                            # The spacy library is a popular tool for natural language processing (NLP) in Python. It provides various NLP functionalities, such as tokenization, part-of-speech tagging, named entity recognition, dependency parsing, and more.
import gensim.downloader as api                                         # The api module in Gensim allows for easy access to pre-trained word embeddings, which are word representation vectors learned from large corpora of text. These pre-trained word embeddings can be used as features in natural language processing (NLP) and text mining tasks, such as word similarity, document classification, and text generation.

from gensim.models import KeyedVectors
# Load pre-trained word vectors from the local binary word2vec file 'word2vec-google-news-300.bin'.
# The file name indicates the 300-dimensional Google News vectors; the file must exist in the working directory.
wv = KeyedVectors.load_word2vec_format('word2vec-google-news-300.bin', binary=True)
# Load the spaCy pipeline stored under the name/path "spacy_model_updated_v2".
# NOTE(review): an earlier comment said this was "en_core_web_lg"; presumably this is a customised pipeline derived from it — TODO confirm.
nlp = spacy.load("spacy_model_updated_v2")

# ------------------------------------------------------------------------------------------------------------------------------
import warnings
warnings.simplefilter("ignore")                                         # Suppress all warnings from here on (the author reports having examined each of them beforehand).
pd.set_option('display.max_columns', None)                              # Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_rows', None)                                 # Show every row when a DataFrame is displayed (no row truncation).
pd.options.display.float_format = '{:.2f}'.format                       # Display floats with two decimals instead of scientific notation.

In [4]:
# ------------------------------------------------------------------------------------------------------------------------------

# Preprocessing the free-text data by lemmatizing and removing stop words and punctuation. This function returns the processed
# free text as a single line of lemmas joined by spaces.
#   review (str): input text to be normalized.
#   lowercase (bool): flag indicating whether to convert the text to lowercase.
#   remove_stopwords (bool): flag indicating whether to remove stopwords.
#   punctuations_rm (bool): flag indicating whether to remove punctuation.

def normalize(review, lowercase, remove_stopwords, punctuations_rm):
    """Lemmatize a piece of free text, optionally filtering tokens.

    Args:
        review (str): Input text to be normalized.
        lowercase (bool): If true, the text is lowercased before it is parsed.
        remove_stopwords (bool): If true, tokens flagged ``is_stop`` are dropped.
        punctuations_rm (bool): If true, tokens flagged ``is_punct`` are dropped.

    Returns:
        str: The lemmas of the kept tokens, joined by single spaces.
    """
    if lowercase:
        review = review.lower()

    lemmas = []
    for token in nlp(review):
        # Each flag filters independently. Previously, remove_stopwords=False
        # discarded every token and the function returned an empty string.
        if remove_stopwords and token.is_stop:
            continue
        if punctuations_rm and token.is_punct:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)


# ------------------------------------------------------------------------------------------------------------------------------

# Function to generate word embeddings using pre-trained word vectors

def generate_word_embeddings(sent):
    """Return the average pre-trained word vector of a sentence.

    Args:
        sent (str): Sentence to embed; it is split on whitespace.

    Returns:
        numpy.ndarray: Element-wise mean of the vectors of all words found in
        the global ``wv``. If no word is found, a zero vector is returned.
    """
    known_vectors = [wv[word] for word in sent.split() if word in wv]
    if known_vectors:
        return np.mean(known_vectors, axis=0)
    # Size the fallback from the loaded embeddings so it always matches the
    # non-empty case; 300 is kept as the default when the size is not exposed.
    dimension = getattr(wv, "vector_size", 300)
    return np.zeros((dimension,))
    
    
# ------------------------------------------------------------------------------------------------------------------------------

# Function to create a new vector that keeps a check on the type of ner tags used in the sentences and mark it as 1 on encountering the respective ner tag, to further feed this as a feature to the model.

def generate_ner_tags(doc):
    """Build a 0/1 presence vector over the known NER labels for one text.

    Args:
        doc (str): Text to run through the global ``nlp`` pipeline.

    Returns:
        list[int]: One entry per distinct label in the global ``labels`` (in
        first-seen order); 1 if an entity with that label was found, else 0.
    """
    # dict.fromkeys collapses duplicates and preserves order, like a label -> flag mapping.
    ordered_labels = list(dict.fromkeys(labels))
    labels_found = {entity.label_ for entity in nlp(doc).ents}
    return [1 if label in labels_found else 0 for label in ordered_labels]



# ------------------------------------------------------------------------------------------------------------------------------

# Function to pre-process the given sentence and return a numpy array of features

    
def testing(sent):
    # Calling the normalize function to preprocess the sentence by removing stopwords and punctuations, and making it lowercase.
    sent = normalize(sent, lowercase=True, remove_stopwords=True, punctuations_rm=True)
    # Counting the number of tokens in the processed sentence using the nlp object, to pass it as a feature to the model.
    sent_length = len(nlp(sent))
    # Generating the word embeddings for the processed sentence using the generate_word_embeddings function.
    sent_vector = generate_word_embeddings(sent)
    # Generating named entity recognition (NER) tags for the processed sentence using the generate_ner_tags function
    ner_tag = generate_ner_tags(sent)
    # Reshaping the extracted features to have the required shape for concatenation
    x3 = np.reshape(np.array(sent_length), (-1, 1))
    x1 = np.reshape(np.array(sent_vector), (-1, 1))
    x2 = np.reshape(np.array(ner_tag), (-1, 1))
    # Concatenating the features horizontally using the concatenate function
    test = np.concatenate((x1, x2, x3),axis=0)
    # Transposing the array to have the shape (1, n) where n is the total number of features
    return test.T

# ------------------------------------------------------------------------------------------------------------------------------

# The predictions_final function takes a sentence as input, applies pre-trained models to predict the categories for the input
# sentence, and returns a list of the top three predicted categories. The function checks the predicted probability of the sentence 
# belonging to each category using a threshold, sorts the predicted categories by their probabilities in descending order, 
# and returns the top three predicted categories. If no categories were predicted, the function returns ["Others"].

def predictions_final(sent,
                      categories=("Product", "Cost", "Sales", "Customer Service", "Training"),
                      top_n=3):
    """Return the most probable categories for one feature row.

    For every category, the classifier stored at ``models[category][0]`` is
    asked for the positive-class probability (column 1 of ``predict_proba``).
    A category is kept only when that probability is strictly greater than
    the threshold stored at ``models[category][1]``.

    Args:
        sent: Single-row feature array accepted by the classifiers'
            ``predict_proba`` (as produced by ``testing``).
        categories: Category names to evaluate, in tie-breaking order.
            Defaults to the five categories this notebook has always used.
        top_n (int): Maximum number of categories to return. Defaults to 3.

    Returns:
        list[str]: Up to ``top_n`` category names, highest probability first,
        or ``["Others"]`` when no category passes its threshold.

    Raises:
        KeyError: If a requested category is missing from ``models``.
        ValueError: If ``sent`` holds more than one row (the probability must
            be a single value).
    """
    passing = {}
    for category in categories:
        entry = models[category]
        classifier, threshold = entry[0], entry[1]
        # Call predict_proba once per category; .item() insists on exactly one row.
        probability = classifier.predict_proba(sent)[:, 1].item()
        if probability > threshold:
            passing[category] = probability

    if not passing:
        return ["Others"]

    # sorted() is stable, so equal probabilities keep the order of `categories`.
    ranked = sorted(passing, key=passing.get, reverse=True)
    return ranked[:top_n]
    
# ------------------------------------------------------------------------------------------------------------------------------


In [5]:
# Load the raw NPS survey export from the working directory; every later cell mutates this `df` in place.
df = pd.read_csv("NPS Data for ASU Team.csv")

In [6]:
# Removing (in place) every row whose free-text feedback column 'How_can_we_improve_your_experience__c' is null,
# since there is no text to classify for those rows. Only this column is checked in this cell.
df.drop(df[df['How_can_we_improve_your_experience__c'].isnull()].index, axis=0, inplace=True)

In [7]:
# Dropping (in place) rows where 'Primary_Category_1__c' is already filled in, so that only rows
# without a primary category remain; that column is populated from the model predictions further below.
df.drop(df[df['Primary_Category_1__c'].notnull()].index, axis=0, inplace=True)

In [8]:
# SECURITY NOTE: pickle.load can execute arbitrary code, so only load a 'model' file from a trusted source.
# The context manager closes the file handle as soon as loading finishes (it was previously left open).
with open('model', 'rb') as pickle_in:                                         # Opening the pickled classifiers in read-binary mode.
    models = pickle.load(pickle_in)                                            # Mapping used as models[category] -> (classifier, probability threshold).

In [9]:
# Entity labels known to the pipeline's 'ner' component; generate_ner_tags reads this global to build one 0/1 flag per label.
labels = list(nlp.get_pipe('ner').labels)

In [11]:
# Build the model input for every remaining row: `testing` is applied to each value of the
# 'How_can_we_improve_your_experience__c' column, and the resulting single-row (1, n) feature array
# is stored in the new 'processed' column.
df['processed'] = df['How_can_we_improve_your_experience__c'].apply(testing)

In [12]:
# The predictions_final function is applied to each feature array in the 'processed' column.
# The resulting 'prediction' column holds, per row, a list of up to three predicted categories, or ["Others"] when none passes its threshold.
df['prediction'] = df['processed'].apply(predictions_final)

In [14]:
# Spread each prediction list across the three category columns.
# The lambda takes a list x (which always has at least one element, since predictions_final returns ["Others"] at minimum)
# and returns a pandas Series of exactly three values; positions beyond len(x) are filled with None.
df[['Primary_Category_1__c', 'Primary_Category_2__c', 'Primary_Category_3__c']] = df['prediction'].apply(lambda x: pd.Series([x[0], x[1] if len(x) > 1 else None, x[2] if len(x) > 2 else None]))

In [16]:
# Dropping the intermediate 'prediction' and 'processed' helper columns (in place) so they are not written to the output file.
df.drop(['prediction','processed'], inplace=True, axis=1)

In [18]:
# Saving all the predictions to a CSV file in the working directory; the file name carries the current
# local timestamp (YYYY-MM-DD_HH-MM-SS), so each run writes a new file.
# NOTE(review): `index=` is not passed, so pandas' default applies and the DataFrame index is presumably written as the first column — confirm this is wanted.
df.to_csv('Categorized_data_{}.csv'.format(datetime.now().strftime("%Y-%m-%d_%H-%M-%S")))