In [1]:
#Import the external packages into Python to perform various analyses
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import punkt
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import json
import pymssql as pym
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#import custom functions
from cleanString import cleanString
from createStopWords import createStopWords
from fileDataLoad import fileDataLoad
from WarehouseDataExtract import WarehouseDataExtract

In [3]:
# Uses my test Star Wars dataset. For a real analysis you would use another source instead:
# an Excel or CSV file, or the AMS Warehouse via WarehouseDataExtract.
# filename = r'C:\Users\jpuryear1\Documents\Python Scripts\starwars_data.xlsx'
# sheetname = 'Sheet1'
# inputDF = fileDataLoad(filename,sheetname)

In [5]:
# Load the warehouse user id and password from a local JSON file so the
# credentials never appear in the notebook itself. The context manager closes
# the file handle as soon as the JSON has been parsed.
with open(r'c:\Users/jpuryear1/Documents/Python Scripts/DB_connection.json') as db_file:
    db_data = json.load(db_file)
# Extracts user id and password to variables
WH_USER = db_data['userid']
WH_PW = db_data['password']
# EAI code to be queried. Surrounding whitespace is stripped so a stray space
# cannot defeat the LIKE prefix match in the query below.
#print('Welcome to Incident Wordcloud Generation!')
eai_cd = input('Please type in the EAI code that you would like to generate a Top Terms List for: ').strip()
print(f'You are pulling data for {eai_cd}')
#print('Data Extract Commencing - Please note that it will take several minutes to pull the data')
# The code is interpolated into a T-SQL string literal, so double any single
# quote to stop the typed value from terminating the literal early.
# NOTE(review): if WarehouseDataExtract can accept bound query parameters,
# passing eai_cd that way would be preferable to string interpolation - confirm.
eai_cd_sql = eai_cd.replace("'", "''")
#Standard SQL to pull data for a given EAI Code
sql = f"""SELECT number, short_description, description, u_resolution_notes, u_incident_resolution_category, 
u_incident_resolution_subcateg, close_code, contact_type
  FROM [USBPMMetricsWhse].[dbo].[T_SRVNW_INCDN_SUM] a
  JOIN [USBPMMetricsWhse].[dbo].[T_SRVNW_AFCT_TASK_CI_SUM] b on a.number = b.task
  WHERE
  a.DW_REC_CUR_IND = 'Y'
  and b.ci_item like '{eai_cd_sql}%'
  and a.opened_at > '2019-01-01 00:00:00'"""
# runs the WarehouseDataExtract function and imports the data to a dataframe
inputDF = WarehouseDataExtract(WH_USER, WH_PW, sql)
# Build one free-text 'Summary' column from the three descriptive fields.
# Missing values are replaced with an empty string first; otherwise astype(str)
# would turn them into the literal words "None"/"nan", which would then be
# counted as terms later in the analysis.
inputDF['Summary'] = (
    inputDF['short_description'].fillna('').astype(str)
    + ' - ' + inputDF['description'].fillna('').astype(str)
    + ' - ' + inputDF['u_resolution_notes'].fillna('').astype(str)
)
print(f"There are {inputDF.shape[0]} observations and {inputDF.shape[1]} features in this dataset. \n")

Please type in the EAI code that you would like to generate a wordcloud for:  10966


You are pulling data for 10966


In [39]:
# Extract the Summary data from the dataframe into a list
TextReviewList = inputDF.loc[:, 'Summary'].tolist()
# Clean every summary string. A comprehension is used, and its variable is
# deliberately not named `str`: a notebook-level loop variable called `str`
# would replace the builtin for the rest of the session and break any later
# `.astype(str)` call when cells are re-run.
TextCleanList = [cleanString(summary_text) for summary_text in TextReviewList]
# Store the cleaned text next to the raw summary. A plain list is assigned by
# position, so row i of 'CleanText' is the cleaned form of row i of 'Summary'.
inputDF['CleanText'] = TextCleanList

In [40]:
# Create stopword list: use the "remove_words" tuple to add project-specific
# stop words to the list.
# NOTE(review): createStopWords is defined in another module; presumably it
# merges these words with a standard stop word list - confirm.
remove_words = ('entered', 'auto', 'ams', 'arm', 'metlife', 'l1', 'us', 'corporate', 'billing', 'system', 'byauto','aalert', 'f090', 'e90', 'gssp', 'platformentered'
               ,'com', 'https', 'gto', 'bmc')
stopset = createStopWords(remove_words)
# TfidfVectorizer turns a collection of text documents into a matrix of TF-IDF
# weights; words found in `stop_words` are excluded from the vocabulary.
# NOTE(review): recent scikit-learn releases validate that stop_words is a
# list (or 'english'/None); verify the type createStopWords returns.
vectorizer = TfidfVectorizer(stop_words=stopset, analyzer = 'word')
# fit_transform learns the vocabulary from the cleaned text and returns a
# sparse document-term matrix: one row per incident, one column per term.
tfidf_matrix = vectorizer.fit_transform(inputDF.loc[:,'CleanText'].tolist())
# Break out the distinct words (one per matrix column). get_feature_names()
# was removed in scikit-learn 1.2, so prefer its replacement and only fall
# back to the old name on installations that predate get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Densify once straight into the DataFrame instead of going through an
# intermediate numpy matrix and a nested Python list.
df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

In [41]:
# Turn the TF-IDF weights into 0/1 integer presence flags: 1 where the weight
# in df is greater than zero, 0 everywhere else.
df_boo = df.gt(0).astype(int)

In [60]:
# Sum each column of 0/1 flags to get, per term, the number of rows (incidents)
# in which that term appears. Every column of df_boo is a vocabulary term -
# there is no leading id column - so no column is skipped; slicing off the
# first column would silently leave the first term out of the counts.
terms = df_boo.sum(axis=0)

In [51]:
# Re-key the term labels and their counts onto a shared 0..n-1 integer index
# so that both pieces line up row by row.
row_ids = np.arange(len(terms))
termSeries = pd.Series(terms.index.values, index=row_ids)
count = pd.Series(list(terms), index=row_ids)
# Assemble the two-column frame, with the term column first and the count second.
term_df = pd.DataFrame(
    {"termSeries": termSeries, "count": count},
    columns=["termSeries", "count"],
)

In [52]:
# Order the terms from most to least frequent and renumber the rows from 0.
term_df = term_df.sort_values(by=["count"], ascending=False).reset_index(drop=True)
# Show the 25 terms that occur in the largest number of rows.
print(term_df.head(25))

   termSeries  count
0       error    355
1      online    323
2    platform    323
3         msg    318
4      public    317
5       topaz    317
6        step    314
7       10966    313
8    critical    312
9   keepalive    308
10  servicing    306
11       html    306
12    content    303
13  servi001a    302
14      match    302
15     impact    245
16     closed    237
17    manager    237
18     events    235
19   original    235
20     please    153
21      taken    114
22    actions    112
23    provide    111
24   incident    110


In [61]:
#term_df.head(25).plot(kind='barh')
#plt.show()