In [759]:
import pandas as pd

In [760]:
# reading the data 

In [761]:
data = pd.read_csv("data_for_EDA.csv")

In [762]:
data.shape

(1547, 7)

In [763]:
# Earlier all description and summary was discarded

In [764]:
# In this analysis, we will try to extract useful info from the text data

In [765]:
# all columns are text data 
data.dtypes

Analytics_Category_value    object
Assignee_displayName        object
Issue_Type_name             object
Summary                     object
SN_Case_Number              object
u_category                  object
description                 object
dtype: object

In [766]:
data.isna().sum()

Analytics_Category_value     45
Assignee_displayName          0
Issue_Type_name               0
Summary                       0
SN_Case_Number                0
u_category                   97
description                 105
dtype: int64

In [767]:
# Unique counts can be seen in describe columns

In [768]:
data["description"].head(10)

0    Need data for the CTBRI divestiture. This will...
1    Agilify needs an urgent estimate for effort of...
2    Agilify needs an urgent estimate for effort of...
3    In partnership with The Resource Group, the P2...
4    Will you please get access for Scott Walters t...
5    Hello,\r\n\r\nCurrently there is a stored proc...
6    Hello,\r\n\r\nCurrently there is a stored proc...
7    Good afternoon,\r\n\r\nCan you please grant ac...
8    Good afternoon,\r\n\r\nCan you please grant ac...
9    Good morning.\r\r\nI am requesting changes to ...
Name: description, dtype: object

In [769]:
data["Summary"].head(10)

0              Create SSRS report with PPE Date prompt
1    Requirements: Recycles Duplicates BP automatio...
2    SQL: Recycles Duplicates BP automation product...
3                          Cost of Ownership Scorecard
4              Grant Access for Payroll Folder in SSRS
5    Migrate LCE Sus Recall SQL objects to new db i...
6    Migrate LCE Sus Recall stored proc from Automa...
7            LCE SSRS Report Access in 010 for Agilify
8    Grant 010 SSRS LCE Report Folder Access to Agi...
9    Changes to Existing Report: MSC_Associates_Detail
Name: Summary, dtype: object

In [770]:
# Checking for duplicate values

In [771]:
data.duplicated(keep = "first").value_counts()

False    1546
True        1
dtype: int64

In [772]:
# Now we will analyze the summary and description 

In [773]:
data = data.drop_duplicates(keep ="first")

In [774]:
data["Summary"][1123]

'Requirements: Add email notification to existing process'

In [775]:
data["description"][999]

nan

In [776]:
data.isna().sum()

Analytics_Category_value     45
Assignee_displayName          0
Issue_Type_name               0
Summary                       0
SN_Case_Number                0
u_category                   97
description                 105
dtype: int64

In [777]:
(data["description"][65])

nan

In [778]:
(data.isna().sum()/data.shape[0]*100).round(2)

Analytics_Category_value    2.91
Assignee_displayName        0.00
Issue_Type_name             0.00
Summary                     0.00
SN_Case_Number              0.00
u_category                  6.27
description                 6.79
dtype: float64

In [779]:
data.columns

Index(['Analytics_Category_value', 'Assignee_displayName', 'Issue_Type_name',
       'Summary', 'SN_Case_Number', 'u_category', 'description'],
      dtype='object')

In [780]:
data = data.dropna(subset = ["Analytics_Category_value", "u_category", "description"])

In [781]:
data = data.reset_index()

In [782]:
data.drop(["index"], axis =1, inplace = True)

In [783]:
data["Assignee_displayName"].replace(["Ancuta.Goia@ascension.org", "Deepa.Naidu@ascension.org"], 
                                     ["Ancuta Goia", "Deepa Naidu"], inplace =True)

In [786]:
data.shape

(1406, 7)

In [787]:
# Performing text preprocessing in "description" and "Summary" columns

In [788]:
# Expand Contractions in data

In [718]:
import re

In [719]:
def remove_punctuation(text):
    text = re.sub(r'(([\w]+)\@(\w+\.\w+)(\w+)?)',' ',text)      # Removes  email
    text = re.sub(r'[^\w\s]',' ',text)                          # Removes punctuation
    text = re.sub(r"\n|\r",' ', text)                           # Removes New Lines
    text = re.sub(r'[^a-zA-Z]',' ',text)                        # Remove Numeric Values
    return text

In [720]:
def filter_string(text):
    result = re.sub('(CONFIDENTIAL\sCOMMUNICATION:[\s\S]*)', '', text)  # Removes Confidential Communication Para
    result = re.sub('(CONFIDENTIALITY\sNOTICE:[\s\S]*)', '', result)  # Removes Confidentiality Notice Para
    result = re.sub(r'http\S+', '', result)                         # Removes Hyperlinks
    result = re.sub('[^A-Za-z]+', ' ', result)               # Removes Special Characters and selecting only alphabets
    return result

In [721]:
for x in range(0, data.shape[0]):
    textdata = filter_string(str(data["description"][x]))
    textdata = remove_punctuation(textdata)
    data.loc[x, ("description")] = textdata

In [722]:
## usage of chained indexing resulted in warning

In [723]:
# hence, loc was used instead

In [724]:
data["description"][999]

'Is there a way to see who has been given access to review these documents A list of the user s set up Thank you'

In [725]:
data["description"].value_counts().size

736

In [726]:
data["description"][662]

'Pega users are experiencing slowness with the Tableau report R in Radilo FA The slowness seems to be so severe that lines are no longer loading in Radilo or at least are taking a substantial amount of time Please add myself and Paige Horrigan to the watch list image cid bec b afa d d d e Kaylea Britton Pega Operations Lead ITIL Foundation Blue Prism Developer Certified Vincennes Road Indianapolis IN m e kaylea britton agilifyautomation com image cid image jpg D D D E cid image jpg D D D E cid image jpg D D D E cid image jpg D D D E '

In [727]:
for x in range(0, data.shape[0]):
    textdata = filter_string(str(data["Summary"][x]))
    textdata = remove_punctuation(textdata)
    data.loc[x, ("Summary")] = textdata

In [728]:
# Now we will remove stop words

In [729]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aditya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [730]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aditya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [731]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [732]:
stop_words = set(stopwords.words('english'))

In [733]:
for x in range(0, data.shape[0]):
    tokens = word_tokenize(str(data["description"][x]))
    resultlist = [i for i in tokens if not i in stop_words]
    result = " ".join(resultlist)
    data.loc[x, ("description")] = result

In [734]:
for x in range(0, data.shape[0]):
    tokens = word_tokenize(str(data["Summary"][x]))
    resultlist = [i for i in tokens if not i in stop_words]
    result = " ".join(resultlist)
    data.loc[x, ("Summary")] = result

In [735]:
data["description"][1134]

'Hello Analytics Team The existing Associate Listing report created MSC accounting team part project PMO requires modification The report link sent recipients listed Ryan Downey MSC Accounting Director requested following modification Currently user clicks report link sent email looks like SQL executed retrieve current information file associates listed report Even though report created monthly example associate changed business units last month Ryan would historical information previous business unit previous info associated employee associate reported six months ago Is possible change report retain historical information month report created Can filters added allow user select time period want see historical point time snap shot data versus current data system Please contact Doug Scherer clarify requirements ask questions Thank Doug Subject AssociateListing executed PM Report project PM The report accessible following address cid image png D C D DD Douglas Scherer Senior Business Ana

In [736]:
data["description"][662]

'Pega users experiencing slowness Tableau report R Radilo FA The slowness seems severe lines longer loading Radilo least taking substantial amount time Please add Paige Horrigan watch list image cid bec b afa e Kaylea Britton Pega Operations Lead ITIL Foundation Blue Prism Developer Certified Vincennes Road Indianapolis IN e kaylea britton agilifyautomation com image cid image jpg D D D E cid image jpg D D D E cid image jpg D D D E cid image jpg D D D E'

In [737]:
list1 = ["aa" , "bb" , "cc", "dd"]

In [738]:
str1 = " ".join(list1)

In [739]:
str1

'aa bb cc dd'

In [740]:
## Applying Stemming

In [741]:
from nltk.stem import PorterStemmer

In [742]:
stemmer = PorterStemmer()

In [743]:
for x in range(0, data.shape[0]):
    tokens = word_tokenize(str(data["description"][x]))
    result = ""
    for word in tokens:
        result += stemmer.stem(word) + " "
    data.loc[x, ("description")] = result

In [744]:
for x in range(0, data.shape[0]):
    tokens = word_tokenize(str(data["Summary"][x]))
    result = ""
    for word in tokens:
        result += stemmer.stem(word) +" "
    data.loc[x, ("Summary")] = result

In [745]:
data["description"][1]

'agilifi need urgent estim effort analyt work queri modif account payabl recycl autom thank paig cid bec b afa e paig horrigan georg deliveri client engag manag vincenn road indianapoli indiana e paig horrigan agilifyautom com cid imag jpg D D D E '

In [746]:
data["description"][662]

'pega user experienc slow tableau report R radilo FA the slow seem sever line longer load radilo least take substanti amount time pleas add paig horrigan watch list imag cid bec b afa e kaylea britton pega oper lead itil foundat blue prism develop certifi vincenn road indianapoli IN e kaylea britton agilifyautom com imag cid imag jpg D D D E cid imag jpg D D D E cid imag jpg D D D E cid imag jpg D D D E '

In [747]:
data["description"][1134]

'hello analyt team the exist associ list report creat msc account team part project pmo requir modif the report link sent recipi list ryan downey msc account director request follow modif current user click report link sent email look like sql execut retriev current inform file associ list report even though report creat monthli exampl associ chang busi unit last month ryan would histor inform previou busi unit previou info associ employe associ report six month ago Is possibl chang report retain histor inform month report creat can filter ad allow user select time period want see histor point time snap shot data versu current data system pleas contact doug scherer clarifi requir ask question thank doug subject associatelist execut PM report project PM the report access follow address cid imag png D C D DD dougla scherer senior busi analyst cspo csm vincenn circl indianapoli IN offic dougla scherer ascens org mailto dougla scherer ascens org root mission ascens serv client deliv valu e

In [748]:
# Stemming is working successfully

In [749]:
# Lemmatization using nltk

In [750]:
from nltk.stem import WordNetLemmatizer


In [751]:
lemmatizer=WordNetLemmatizer()

In [752]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Aditya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [753]:
# Performing lemmatization on description and summary columns

In [754]:
for x in range(0, data.shape[0]):
    tokens = word_tokenize(str(data["description"][x]))
    result = ""
    for word in tokens:
        result += lemmatizer.lemmatize(word) + " "
    data.loc[x, ("description")] = result

In [755]:
for x in range(0, data.shape[0]):
    tokens = word_tokenize(str(data["Summary"][x]))
    result = ""
    for word in tokens:
        result += lemmatizer.lemmatize(word) + " "
    data.loc[x, ("Summary")] = result

In [756]:
data["description"][662]

'pega user experienc slow tableau report R radilo FA the slow seem sever line longer load radilo least take substanti amount time plea add paig horrigan watch list imag cid bec b afa e kaylea britton pega oper lead itil foundat blue prism develop certifi vincenn road indianapoli IN e kaylea britton agilifyautom com imag cid imag jpg D D D E cid imag jpg D D D E cid imag jpg D D D E cid imag jpg D D D E '

In [757]:
data["description"][1134]

'hello analyt team the exist associ list report creat msc account team part project pmo requir modif the report link sent recipi list ryan downey msc account director request follow modif current user click report link sent email look like sql execut retriev current inform file associ list report even though report creat monthli exampl associ chang busi unit last month ryan would histor inform previou busi unit previou info associ employe associ report six month ago Is possibl chang report retain histor inform month report creat can filter ad allow user select time period want see histor point time snap shot data versu current data system plea contact doug scherer clarifi requir ask question thank doug subject associatelist execut PM report project PM the report access follow address cid imag png D C D DD dougla scherer senior busi analyst cspo csm vincenn circl indianapoli IN offic dougla scherer ascens org mailto dougla scherer ascens org root mission ascens serv client deliv valu en