# Imports

## Libraries & Packages

In [179]:
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import nltk
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

## Data

In [3]:
# Directory that holds the input CSV; the relative path is resolved against
# the notebook's working directory.
PATH = "../Data/"

In [10]:
# Load the "CASE & SURVEY" CSV into the DataFrame used by the rest of the notebook.
df = pd.read_csv(PATH + "CASE & SURVEY.csv")

In [12]:
# Columns removed from the working frame before any exploration.
drop_cols = [
    "DURATION_HRS",
    "RESOLVER_USERNAME",
    "RESOLVER_NAME",
    "CASE_OWNER_S_MANAGER_EMAIL__C",
]

# Mutates `df` in place; the listed columns no longer exist afterwards.
df.drop(drop_cols, axis="columns", inplace=True)

# EDA

In [15]:
# Preview the first five rows to sanity-check the remaining columns.
df.head()

Unnamed: 0,CASE_NUMBER,CASE_SYMPTOM,CASE_DIAGNOSIS,CASE_STATUS,CASE_ORIGIN,CASE_CREATED_DATETIME,CASE_CLOSED_DATETIME,ISESCALATED,GROUPING,EFFORT_RATING__C,RATING__C,ADDITIONAL_COMMENTS__C,NPS_POSITIVE,NPS_NEUTRAL,NPS_NEGATIVE
0,13673869,Application Issue,Application How to Use,Closed,Contact Us,2024-06-24T08:48:04.000Z,2024-06-25T06:41:54.000Z,False,TEAM 1,Strongly Agree,10.0,,1,0,0
1,13668140,Application Issue,Application Malfunction,Closed,Phone,2024-06-20T20:04:30.000Z,2024-06-20T22:33:02.000Z,False,TEAM 1,Strongly Agree,10.0,,1,0,0
2,13674782,Login Failure,Application Malfunction,Closed,Phone,2024-06-24T12:29:30.000Z,2024-06-24T18:41:58.000Z,False,TEAM 3,Agree,9.0,,1,0,0
3,13669283,How To Use,Content Explanation,Closed,Contact Us,2024-06-21T07:30:45.000Z,2024-06-24T05:48:06.000Z,False,TEAM 3,Somewhat Agree,6.0,,0,1,0
4,13674563,Information Request,Application Malfunction,Closed,Phone,2024-06-24T11:43:59.000Z,2024-06-24T16:59:58.000Z,False,TEAM 1,Agree,10.0,,1,0,0


In [17]:
# Frequency of each rating value, most common first.
df["RATING__C"].value_counts()

RATING__C
10.0    2551
9.0      822
8.0      405
7.0      223
5.0      146
6.0      107
0.0       91
3.0       49
4.0       45
1.0       43
2.0       43
Name: count, dtype: int64

In [19]:
# Sum of the NPS_POSITIVE flag per rating value: shows which ratings the
# source data counts as positive.
df.groupby(["RATING__C"])["NPS_POSITIVE"].sum()

RATING__C
0.0        0
1.0        0
2.0        0
3.0        0
4.0        0
5.0        0
6.0        0
7.0        0
8.0      405
9.0      822
10.0    2551
Name: NPS_POSITIVE, dtype: int64

In [21]:
# Sum of the NPS_NEGATIVE flag per rating value: shows which ratings the
# source data counts as negative.
df.groupby(["RATING__C"])["NPS_NEGATIVE"].sum()

RATING__C
0.0     91
1.0     43
2.0     43
3.0     49
4.0     45
5.0      0
6.0      0
7.0      0
8.0      0
9.0      0
10.0     0
Name: NPS_NEGATIVE, dtype: int64

In [23]:
# Number of cases per reported symptom, most common first.
df["CASE_SYMPTOM"].value_counts()

CASE_SYMPTOM
Application Issue      1656
How To Use              774
Content Explanation     359
Login Failure           357
Content Not Found       317
Information Request     272
Content Integrity       238
Password Assistance     219
Add-in Missing          214
Connectivity            102
Content Addition          8
Content Slowness          6
Technical                 4
Name: count, dtype: int64

In [25]:
# Number of cases per diagnosis label, most common first.
df["CASE_DIAGNOSIS"].value_counts()

CASE_DIAGNOSIS
Application/Desktop/Malfunction             792
Content Explanation                         567
Application Malfunction                     505
Content Search                              447
Application/Desktop/How to Use              318
Application How to Use                      278
Application/Desktop/Crash                   232
Manual Password Reset                       169
Password Reset                              112
Application/Desktop/Func. NA                107
Application/Desktop/Install                  75
Application Crash                            75
Content Not Available                        71
Entitlement Confirm ID/Perm                  62
Application Func. Not Available              62
Application/Infrastructure/How to Use        57
Application/Desktop/File Conversion          55
Application/Desktop/How to Use               54
Application Install                          47
Self-password Reset Issue                    46
Content Incorrect        

In [27]:
# Number of cases per status value, most common first.
df["CASE_STATUS"].value_counts()

CASE_STATUS
Closed       4501
Cancelled      23
Rejected        1
Delivered       1
Name: count, dtype: int64

# Feature Engineering

### Convert Timezone

In [29]:
# Parse both timestamp columns as UTC, then strip the timezone so the stored
# values are timezone-naive datetimes.
for ts_col in ("CASE_CREATED_DATETIME", "CASE_CLOSED_DATETIME"):
    parsed_utc = pd.to_datetime(df[ts_col], utc=True)
    df[ts_col] = parsed_utc.dt.tz_localize(None)

### Get Duration
- Total Seconds
- Total Minutes

In [32]:
# Time between case creation and case closure, computed once and then
# expressed in seconds and in minutes.
case_duration = df["CASE_CLOSED_DATETIME"] - df["CASE_CREATED_DATETIME"]
df["DURATION_INSECONDS"] = case_duration / pd.Timedelta(seconds=1)
df["DURATION_INMINUTES"] = case_duration / pd.Timedelta(minutes=1)

### NPS

In [35]:
# Bucket the rating into an NPS class:
#   [0, 4] -> -1, (4, 7] -> 0, (7, inf] -> 1
# `include_lowest=True` keeps a rating of exactly 0 inside the first bucket.
eval_bins = [0, 4, 7, np.inf]
eval_labels = [-1, 0, 1]

df["NPS"] = pd.cut(
    df["RATING__C"],
    eval_bins,
    labels=eval_labels,
    include_lowest=True,
)

### Regex Cleanup

In [38]:
# Pull up a single case by its number to inspect the full row.
df[df["CASE_NUMBER"] == 13671561]

Unnamed: 0,CASE_NUMBER,CASE_SYMPTOM,CASE_DIAGNOSIS,CASE_STATUS,CASE_ORIGIN,CASE_CREATED_DATETIME,CASE_CLOSED_DATETIME,ISESCALATED,GROUPING,EFFORT_RATING__C,RATING__C,ADDITIONAL_COMMENTS__C,NPS_POSITIVE,NPS_NEUTRAL,NPS_NEGATIVE,DURATION_INSECONDS,DURATION_INMINUTES,NPS
19,13671561,Information Request,Content Not Available,Closed,Chat,2024-06-21 19:37:12,2024-06-24 22:14:50,False,TEAM 2,Agree,9.0,.,1,0,0,268658.0,4477.633333,1


In [40]:
# Raw comment text of that case (first matching row).
df.loc[df["CASE_NUMBER"] == 13671561, "ADDITIONAL_COMMENTS__C"].iloc[0]

'.'

In [42]:
def clean_text(text):
    """
    Lower-case a comment, expand a fixed list of contractions and strip symbols.

    Parameters
    ----------
    text : object
        The comment to clean. Only ``str`` values are processed.

    Returns
    -------
    object
        The cleaned string, or ``text`` itself when it is not a string
        (NaN, None, numbers, ...).

    Notes
    -----
    Only the contractions listed below are expanded; others (for example
    "didn't") are left as they are, and apostrophes are not stripped.
    """
    # Pass every non-string through untouched. The previous float-only check
    # let None / ints reach ``.lower()`` and raise AttributeError.
    if not isinstance(text, str):
        return text

    # format
    text = text.lower()

    # expanding contractions (applied in this order)
    contractions = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"wouldn't", "would not"),
        (r"wasn't", "was not"),
        (r"hasn't", "has not"),
        (r"haven't", "have not"),
        (r"aren't", "are not"),
        (r"couldn't", "could not"),
        (r"can't", "cannot"),
    )
    for pattern, expansion in contractions:
        text = re.sub(pattern, expansion, text)

    # remove symbols
    return re.sub(r"[-()\"#/@;:<>{}=~|.?,]", "", text)

In [44]:
# Run the cleaner on that one case's comment to check the result.
df.loc[df["CASE_NUMBER"] == 13671561, "ADDITIONAL_COMMENTS__C"].apply(clean_text)

19    
Name: ADDITIONAL_COMMENTS__C, dtype: object

In [71]:
# Spot-check the cleaner on the comment stored at row 4523.
clean_text(df["ADDITIONAL_COMMENTS__C"][4523])

"it took 2 weeks for my query to be answered and the resolution didn't answer the question at all"

### Removing aggregated columns

In [45]:
# Rating and NPS flag columns to remove from the working frame.
drop_cols = [
    "RATING__C",
    "NPS_POSITIVE",
    "NPS_NEUTRAL",
    "NPS_NEGATIVE",
]

# Mutates `df` in place; the listed columns no longer exist afterwards.
df.drop(drop_cols, axis="columns", inplace=True)

## Text Cleaning

In [155]:
def decontractor(sentence):
    """
    Expand common English contractions in a sentence.

    Parameters
    ----------
    sentence : object
        Text to expand. Only ``str`` values are processed.

    Returns
    -------
    object
        The sentence with contractions expanded, or ``sentence`` itself when
        it is not a string (NaN, None, ...).

    Notes
    -----
    The rules are purely textual: ``'s`` always becomes " is", so possessives
    ("customer's") are expanded as well. Patterns are case-sensitive.
    """
    if not isinstance(sentence, str):
        return sentence

    # Typographic apostrophes would otherwise never match the rules below.
    sentence = sentence.replace("\u2019", "'")

    # Irregular negations must run before the generic n't rule, which would
    # otherwise produce "ca not", "wo not" and "sha not".
    sentence = re.sub(r"\b([Cc])an\'t", r"\1an not", sentence)
    sentence = re.sub(r"\b([Ww])on\'t", r"\1ill not", sentence)
    sentence = re.sub(r"\b([Ss])han\'t", r"\1hall not", sentence)

    # Generic rules, applied in order.
    sentence = re.sub(r"n\'t", " not", sentence)
    sentence = re.sub(r"\'re", " are", sentence)
    sentence = re.sub(r"\'s", " is", sentence)
    sentence = re.sub(r"\'d", " would", sentence)
    sentence = re.sub(r"\'ll", " will", sentence)
    sentence = re.sub(r"\'t", " not", sentence)
    sentence = re.sub(r"\'ve", " have", sentence)
    sentence = re.sub(r"\'m", " am", sentence)
    return sentence

In [181]:
# Translation table built once: deletes every ASCII punctuation character and
# maps every ASCII whitespace character (space, \t, \n, \r, \x0b, \x0c) to a
# plain space so that words separated by a tab or newline are not fused.
_CLEAN_TEXT_TABLE = str.maketrans(
    string.whitespace, " " * len(string.whitespace), string.punctuation
)


def clean_text(text):
    """
    Normalize a comment: lower-case it, drop URLs and punctuation, tidy spaces.

    Parameters
    ----------
    text : object
        The comment to clean. Only ``str`` values are processed.

    Returns
    -------
    object
        The cleaned string, or ``text`` itself when it is not a string
        (NaN, None, ...).
    """
    if not isinstance(text, str):
        return text

    text = text.lower()

    # Remove URLs while they are still intact (before punctuation is
    # stripped) and after lower-casing, so "HTTP://..." is caught as well.
    text = re.sub(r'http\S+', '', text)

    # Delete punctuation; turn tabs / newlines into spaces.
    text = text.translate(_CLEAN_TEXT_TABLE)

    # Collapse runs of whitespace and trim both ends.
    return " ".join(text.split())

### Before and After

In [160]:
# "Before": the comment at row 4523 as currently stored in the DataFrame.
df["ADDITIONAL_COMMENTS__C"][4523]

"It took 2 weeks for my query to be answered and the resolution didn't answer the question at all"

In [162]:
# "After": the same comment with contractions expanded first, then cleaned.
clean_text(decontractor(df["ADDITIONAL_COMMENTS__C"][4523]))

'it took 2 weeks for my query to be answered and the resolution did not answer the question at all'

## Apply Text Cleaning

In [172]:
# Expand contractions first, then clean; the column is overwritten with the result.
df["ADDITIONAL_COMMENTS__C"] = (
    df["ADDITIONAL_COMMENTS__C"]
    .apply(decontractor)
    .apply(clean_text)
)

## Lemmatizing

In [195]:
def remove_stop_words(text):
    """
    Strip stop words from a comment.

    The text is tokenized with ``nltk.word_tokenize``; tokens found in the
    module-level ``stop_words`` set are discarded and the remaining tokens are
    joined back together with single spaces. Values that are not ``str`` are
    returned unchanged.
    """
    if type(text) == str:
        kept_tokens = (
            word for word in nltk.word_tokenize(text) if word not in stop_words
        )
        return " ".join(kept_tokens)

    return text

In [197]:
# Store the stop-word-filtered comments in a new column.
# NOTE(review): despite the column name, this statement only removes stop
# words; no lemmatization is applied here — confirm the intended pipeline.
df["LEMMATIZED_COMMENTS"] = df["ADDITIONAL_COMMENTS__C"].apply(remove_stop_words)

In [199]:
# Show the resulting column; non-string entries such as NaN pass through
# remove_stop_words unchanged.
df["LEMMATIZED_COMMENTS"]

0                                                     NaN
1                                                     NaN
2                                                     NaN
3                                                     NaN
4                                                     NaN
                              ...                        
4521                                         could faster
4522                                           thank much
4523    took 2 weeks query answered resolution answer ...
4524                      paul massoni chris matheis best
4525                                                  NaN
Name: LEMMATIZED_COMMENTS, Length: 4526, dtype: object