# Imports

## Libraries & Packages

In [29]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentiment_scorer = SentimentIntensityAnalyzer()

import re
import string
import spacy

## Data

In [2]:
PATH = "../Data/"

In [3]:
df = pd.read_csv(PATH + "CASE & SURVEY.csv")

In [4]:
drop_cols = ["DURATION_HRS", "RESOLVER_USERNAME",
             "RESOLVER_NAME", "CASE_OWNER_S_MANAGER_EMAIL__C"
            ]

df.drop(columns=drop_cols, inplace=True)

# EDA

In [5]:
df.head()

Unnamed: 0,CASE_NUMBER,CASE_SYMPTOM,CASE_DIAGNOSIS,CASE_STATUS,CASE_ORIGIN,CASE_CREATED_DATETIME,CASE_CLOSED_DATETIME,ISESCALATED,GROUPING,EFFORT_RATING__C,RATING__C,ADDITIONAL_COMMENTS__C,NPS_POSITIVE,NPS_NEUTRAL,NPS_NEGATIVE
0,13673869,Application Issue,Application How to Use,Closed,Contact Us,2024-06-24T08:48:04.000Z,2024-06-25T06:41:54.000Z,False,TEAM 1,Strongly Agree,10.0,,1,0,0
1,13668140,Application Issue,Application Malfunction,Closed,Phone,2024-06-20T20:04:30.000Z,2024-06-20T22:33:02.000Z,False,TEAM 1,Strongly Agree,10.0,,1,0,0
2,13674782,Login Failure,Application Malfunction,Closed,Phone,2024-06-24T12:29:30.000Z,2024-06-24T18:41:58.000Z,False,TEAM 3,Agree,9.0,,1,0,0
3,13669283,How To Use,Content Explanation,Closed,Contact Us,2024-06-21T07:30:45.000Z,2024-06-24T05:48:06.000Z,False,TEAM 3,Somewhat Agree,6.0,,0,1,0
4,13674563,Information Request,Application Malfunction,Closed,Phone,2024-06-24T11:43:59.000Z,2024-06-24T16:59:58.000Z,False,TEAM 1,Agree,10.0,,1,0,0


In [6]:
df["RATING__C"].value_counts()

RATING__C
10.0    2551
9.0      822
8.0      405
7.0      223
5.0      146
6.0      107
0.0       91
3.0       49
4.0       45
1.0       43
2.0       43
Name: count, dtype: int64

In [7]:
df.groupby(["RATING__C"])["NPS_POSITIVE"].sum()

RATING__C
0.0        0
1.0        0
2.0        0
3.0        0
4.0        0
5.0        0
6.0        0
7.0        0
8.0      405
9.0      822
10.0    2551
Name: NPS_POSITIVE, dtype: int64

In [8]:
df.groupby(["RATING__C"])["NPS_NEGATIVE"].sum()

RATING__C
0.0     91
1.0     43
2.0     43
3.0     49
4.0     45
5.0      0
6.0      0
7.0      0
8.0      0
9.0      0
10.0     0
Name: NPS_NEGATIVE, dtype: int64

In [9]:
df["CASE_SYMPTOM"].value_counts()

CASE_SYMPTOM
Application Issue      1656
How To Use              774
Content Explanation     359
Login Failure           357
Content Not Found       317
Information Request     272
Content Integrity       238
Password Assistance     219
Add-in Missing          214
Connectivity            102
Content Addition          8
Content Slowness          6
Technical                 4
Name: count, dtype: int64

In [10]:
df["CASE_DIAGNOSIS"].value_counts()

CASE_DIAGNOSIS
Application/Desktop/Malfunction             792
Content Explanation                         567
Application Malfunction                     505
Content Search                              447
Application/Desktop/How to Use              318
Application How to Use                      278
Application/Desktop/Crash                   232
Manual Password Reset                       169
Password Reset                              112
Application/Desktop/Func. NA                107
Application/Desktop/Install                  75
Application Crash                            75
Content Not Available                        71
Entitlement Confirm ID/Perm                  62
Application Func. Not Available              62
Application/Infrastructure/How to Use        57
Application/Desktop/File Conversion          55
Application/Desktop/How to Use               54
Application Install                          47
Self-password Reset Issue                    46
Content Incorrect        

In [11]:
df["CASE_STATUS"].value_counts()

CASE_STATUS
Closed       4501
Cancelled      23
Rejected        1
Delivered       1
Name: count, dtype: int64

# Feature Engineering

### Convert Timezone

In [12]:
df["CASE_CREATED_DATETIME"] = pd.to_datetime(df["CASE_CREATED_DATETIME"], utc=True).dt.tz_localize(None)
df["CASE_CLOSED_DATETIME"] = pd.to_datetime(df["CASE_CLOSED_DATETIME"], utc=True).dt.tz_localize(None)

### Get Duration
- Total Seconds
- Total Minutes

In [13]:
df["DURATION_INSECONDS"] = (df["CASE_CLOSED_DATETIME"] - df["CASE_CREATED_DATETIME"]) / pd.Timedelta(seconds=1)
df["DURATION_INMINUTES"] = (df["CASE_CLOSED_DATETIME"] - df["CASE_CREATED_DATETIME"]) / pd.Timedelta(minutes=1)

### NPS

In [14]:
eval_bins = [0, 4, 7, np.inf]
eval_labels = [-1, 0, 1]

df["NPS"] = \
pd.cut(df["RATING__C"],
       bins=eval_bins,
       labels=eval_labels,
       include_lowest=True
      )

### Removing aggregated columns

In [15]:
drop_cols = ["RATING__C", "NPS_POSITIVE",
             "NPS_NEUTRAL", "NPS_NEGATIVE"
            ]

df.drop(columns=drop_cols, inplace=True)

### Regex Cleanup

In [16]:
def clean_text(text):
    if type(text) == float:
        return text
    else:
        #format
        text = text.lower()
        #expanding contractions
        text = re.sub(r"i'm","i am",text)
        text = re.sub(r"he's","he is",text)
        text = re.sub(r"she's","she is",text)
        text = re.sub(r"wouldn't","would not",text)
        text = re.sub(r"wasn't","was not",text)
        text = re.sub(r"hasn't","has not",text)
        text = re.sub(r"haven't","have not",text)
        text = re.sub(r"aren't","are not",text)
        text = re.sub(r"couldn't","could not",text)
        text = re.sub(r"can't","cannot",text)
        #regex
        #remove symbols
        text = re.sub(r"[-()\"#/@;:<>{}=~|.?,]", "", text)
    return text

In [17]:
df[df["CASE_NUMBER"] == 13671561]

Unnamed: 0,CASE_NUMBER,CASE_SYMPTOM,CASE_DIAGNOSIS,CASE_STATUS,CASE_ORIGIN,CASE_CREATED_DATETIME,CASE_CLOSED_DATETIME,ISESCALATED,GROUPING,EFFORT_RATING__C,ADDITIONAL_COMMENTS__C,DURATION_INSECONDS,DURATION_INMINUTES,NPS
19,13671561,Information Request,Content Not Available,Closed,Chat,2024-06-21 19:37:12,2024-06-24 22:14:50,False,TEAM 2,Agree,.,268658.0,4477.633333,1


#### Before Regex Cleanup

In [18]:
df[df["CASE_NUMBER"] == 13671561]["ADDITIONAL_COMMENTS__C"].values[0]

'.'

#### After Regex Cleanup

In [19]:
df[df["CASE_NUMBER"] == 13671561]["ADDITIONAL_COMMENTS__C"].apply(clean_text)

19    
Name: ADDITIONAL_COMMENTS__C, dtype: object

## Text Cleaning

### Handling Contractors

In [21]:
def decontractor(sentence):
    """
    Expanding contractions
    """

    if type(sentence) != str:
        return sentence

    else:
        sentence = re.sub(r"n\'t", " not", sentence)
        sentence = re.sub(r"\'re", " are", sentence)
        sentence = re.sub(r"\'s", " is", sentence)
        sentence = re.sub(r"\'d", " would", sentence)
        sentence = re.sub(r"\'ll", " will", sentence)
        sentence = re.sub(r"\'t", " not", sentence)
        sentence = re.sub(r"\'ve", " have", sentence)
        sentence = re.sub(r"\'m", " am", sentence)
        return sentence

### Removing white spaces

In [22]:
def clean_text(text):
    """
    remove punctuation and special characters
    lower case all the text as well
    """

    if type(text) != str:
        return text
        
    else:
    
        # Create a translation table to 
        # !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r\x0b\x0c
        translator = str.maketrans('', '', string.punctuation + string.printable.replace(' ','')[62:])
        text = text.translate(translator)
        
        #clean_text = re.sub(r'[^\w\s]', '', text) # Alternative route using re
        #text = re.sub(r'http\S+', '', text)
    
        # Remove leading and trailing whitespace
        text = text.strip()
        text = text.lower()
    
        return text

#### Before and After

In [23]:
df["ADDITIONAL_COMMENTS__C"][4523]

"It took 2 weeks for my query to be answered and the resolution didn't answer the question at all"

In [24]:
clean_text(decontractor(df["ADDITIONAL_COMMENTS__C"][4523]))

'it took 2 weeks for my query to be answered and the resolution did not answer the question at all'

### Apply Text Cleaning

In [25]:
df["ADDITIONAL_COMMENTS__C"] = df["ADDITIONAL_COMMENTS__C"].apply(lambda x: clean_text(decontractor(x)))

### Lemmatizing

In [26]:
def remove_stop_words(text):
    if type(text) != str:
        return text

    else:
        # Tokenizing text
        tokens = nltk.word_tokenize(text)
        # Remove stop words
        clean_tokens = [token for token in tokens if token not in stop_words]
        # Putting back the words together
        clean_text = " ".join(clean_tokens)

        return clean_text

In [27]:
df["LEMMATIZED_COMMENTS"] = df["ADDITIONAL_COMMENTS__C"].apply(remove_stop_words)

In [28]:
df[["ADDITIONAL_COMMENTS__C", "LEMMATIZED_COMMENTS"]].dropna()

Unnamed: 0,ADDITIONAL_COMMENTS__C,LEMMATIZED_COMMENTS
5,very helpful thank you,helpful thank
7,lseg is people were very professional and help...,lseg people professional helpful appreciated e...
9,your colleague was very patient and helpful,colleague patient helpful
10,excellent service,excellent service
11,i was immediately told that my query was not p...,immediately told query possible done pushing f...
...,...,...
4514,timely response was received after my initial ...,timely response received initial query continu...
4521,could be faster,could faster
4522,thank you very much,thank much
4523,it took 2 weeks for my query to be answered an...,took 2 weeks query answered resolution answer ...


### Sentiment Scoring

In [30]:
df["ADDITIONAL_COMMENTS__C"][5]

'very helpful  thank you'

In [40]:
sentiment_scorer.polarity_scores(df["ADDITIONAL_COMMENTS__C"][5])

{'neg': 0.0, 'neu': 0.254, 'pos': 0.746, 'compound': 0.707}

In [33]:
sentiment_scorer.polarity_scores(df["LEMMATIZED_COMMENTS"][5])

{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.6486}

In [78]:
def vader_scorer(series_text, prefix='', drop_text=True):

    """
    Accepts Pandas Series string
    """
    
    text_collector = []
    neg_collector = []
    neu_collector = []
    pos_collector = []
    compound_collector = []

    for text in series_text:
    
        if type(text) != str:
            text_collector.append(text)
            neg_collector.append(np.nan)
            neu_collector.append(np.nan)
            pos_collector.append(np.nan)
            compound_collector.append(np.nan)
    
        else:
            text_collector.append(text)
            
            # Score Sentiment
            sentiment_scores = sentiment_scorer.polarity_scores(text)
            neg_collector.append(sentiment_scores['neg'])
            neu_collector.append(sentiment_scores['neu'])
            pos_collector.append(sentiment_scores['pos'])
            compound_collector.append(sentiment_scores['compound'])

    scored_dataframe = pd.concat([pd.Series(text_collector, name=prefix+'text'),
                                 pd.Series(neg_collector, name=prefix+'neg'),
                                 pd.Series(neu_collector, name=prefix+'neu'),
                                 pd.Series(pos_collector, name=prefix+'pos'),
                                 pd.Series(compound_collector, name=prefix+'compound')]
                                 ,axis=1
                                )

    if drop_text == True:
        scored_dataframe.drop(columns=scored_dataframe.columns[0], axis=1, inplace=True)
    
    return scored_dataframe

In [81]:
df_vader = \
pd.concat([vader_scorer(df["ADDITIONAL_COMMENTS__C"],prefix='non-lemma_'),
           vader_scorer(df["LEMMATIZED_COMMENTS"],prefix='lemma_')
          ]
          ,axis=1
         )

In [102]:
df_vader

Unnamed: 0,non-lemma_neg,non-lemma_neu,non-lemma_pos,non-lemma_compound,lemma_neg,lemma_neu,lemma_pos,lemma_compound
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,,,,,,,,
4,,,,,,,,
...,...,...,...,...,...,...,...,...
4521,0.0,1.000,0.000,0.0000,0.0,1.000,0.000,0.0000
4522,0.0,0.545,0.455,0.3612,0.0,0.286,0.714,0.3612
4523,0.0,1.000,0.000,0.0000,0.0,1.000,0.000,0.0000
4524,0.0,0.625,0.375,0.6369,0.0,0.488,0.512,0.6369


In [116]:
df_vader = \
pd.concat([df_vader.loc[:, df_vader.columns.str.contains('neg')].sum(axis=1)/2,
           df_vader.loc[:, df_vader.columns.str.contains('neu')].sum(axis=1)/2,
           df_vader.loc[:, df_vader.columns.str.contains('pos')].sum(axis=1)/2,
           df_vader.loc[:, df_vader.columns.str.contains('compound')].sum(axis=1)/2
          ]
          ,axis=1
         ).rename(columns={0:'neg', 1:'neu', 2:'pos', 3:'compound'})

In [117]:
df_vader

Unnamed: 0,neg,neu,pos,compound
0,0.0,0.0000,0.0000,0.0000
1,0.0,0.0000,0.0000,0.0000
2,0.0,0.0000,0.0000,0.0000
3,0.0,0.0000,0.0000,0.0000
4,0.0,0.0000,0.0000,0.0000
...,...,...,...,...
4521,0.0,1.0000,0.0000,0.0000
4522,0.0,0.4155,0.5845,0.3612
4523,0.0,1.0000,0.0000,0.0000
4524,0.0,0.5565,0.4435,0.6369


In [118]:
df_vader.loc[4522]

neg         0.0000
neu         0.4155
pos         0.5845
compound    0.3612
Name: 4522, dtype: float64

In [119]:
df["ADDITIONAL_COMMENTS__C"].loc[4522]

'thank you very much'

In [120]:
df["LEMMATIZED_COMMENTS"].loc[4522]

'thank much'