In [None]:
import datetime as dt

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression as lr
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split as tts

In [None]:
# Read the call-center CSV from its relative input path into the raw frame.
csv_path = '../input/call-center-data/Call Center Data.csv'
ccd = pd.read_csv(csv_path)

# Preview the first rows to confirm the file parsed as expected.
ccd.head()

In [None]:
# Display the dtype pandas assigned to each column of the loaded frame.
ccd.dtypes

In [None]:
# Summary statistics for the numeric columns. `describe` must be *called*;
# the bare attribute only displays the bound-method repr, not the statistics.
ccd.describe()

Removing the index data, since this is unnecessary for further analysis. 

In [None]:
# Remove the 'Index' column, which is not needed for the analysis.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# column is already gone and a plain drop would raise KeyError.
ccd = ccd.drop(columns=['Index'], errors='ignore')
ccd.head()

In order to review the correlation between all fields, some data needs to be manipulated.  There are several fields that are objects, and these need to be converted into numeric data (float) in order to determine how they relate to other fields in the dataframe. 

In [None]:
# Data Cleaning/Conversion

# Work on an independent copy so the frame loaded above stays untouched.
# A plain `ccd_time = ccd` only creates a second name for the same object,
# and re-running this cell would then operate on already-converted values.
ccd_time = ccd.copy()

# Columns holding average durations as text.
# NOTE(review): assumes values are formatted H:MM:SS — confirm against the CSV.
DURATION_COLUMNS = ['Answer Speed (AVG)', 'Talk Duration (AVG)', 'Waiting Time (AVG)']

# Parse each duration as a Timedelta and express it in minutes (float).
# Durations are elapsed time rather than clock time, so Timedelta is the
# natural type; it yields the same hour*60 + minute + second/60 value and
# also handles durations of 24 hours or more.
for column in DURATION_COLUMNS:
    ccd_time[column] = pd.to_timedelta(ccd_time[column]).dt.total_seconds() / 60

# Verifying the conversion: the three duration columns should now be float.
ccd_time.dtypes




To complete the cleaning of the data, the '%' sign needs to be removed from both the Answer Rate and Service Level (20 Seconds) fields.  The '%' sign is what prevents the conversion of these fields from object (above) to float (below). 

In [None]:
# Changing percentage columns -- Answer Rate and Service Level (20 Seconds) -- to a float type
PERCENT_COLUMNS = ['Answer Rate', 'Service Level (20 Seconds)']

for column in PERCENT_COLUMNS:
    # regex=False treats '%' as a literal character; without it the result
    # depends on the pandas version's default for `regex`, and some versions
    # emit a FutureWarning for single-character patterns.
    ccd_time[column] = (
        ccd_time[column]
        .str.replace('%', '', regex=False)
        .astype(float)
    )

# Validating all changes complete
ccd_time.dtypes

Now that the data has been cleaned, I'll look for null values in the dataset. 

In [None]:
# Count missing values per column (isna is the pandas alias of isnull).
ccd_time.isna().sum()

Since there are no null/nan values within the dataset, visualization of the data below starts with a high-level view of the various fields as a histogram.  This is helpful to determine where incoming call volume is going, and how the various fields are grouped (in terms of aggregate volume). 

In [None]:
# Switch matplotlib to its built-in grayscale style sheet.
plt.style.use('grayscale')

# Draw one histogram per column of the cleaned frame.
hist_options = {'figsize': (15, 10)}
ccd_time.hist(**hist_options)


Beyond the above, now that nulls have been checked and the incorrect formatting has been corrected, I can run a heatmap/correlogram to determine more meaningful relationships between the various data fields.  

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
# The diverging 'coolwarm' map is centred on 0 so positive and negative
# correlations are visually distinct.
# (The earlier bare sns.color_palette(...) call was dropped: its return value
# was discarded, so it never influenced the plot.)
plt.figure(figsize=(15, 10))
sns.heatmap(ccd_time.corr(), annot=True, cmap='coolwarm', center=0)

The below helps to clearly indicate that wait time, rather than call volume, is a key driver in abandonment in calls.   While there is a relationship between these fields generally, overall, if there are a lot of calls answered quickly, there is less of an impact on abandonment.  Additionally, there is a significant increase in abandonment when wait time exceeds 15 minutes and call volume exceeds 800 calls. 

In [None]:
#Linear Regression - Incoming Calls and Abandonment
# Single-feature least-squares fit with abandoned-call counts as the
# predictor (x) and incoming-call counts as the response (y).
# scikit-learn expects a 2-D feature matrix, hence the reshape to one column.
x = np.array(ccd_time['Abandoned Calls']).reshape(-1, 1)
y = np.array(ccd_time['Incoming Calls'])

# Fit once (the model was previously fitted twice in a row with identical
# inputs) and keep the in-sample R^2 for reference.
mod = lr().fit(x, y)
r_sq = mod.score(x, y)

# Fitted values, used to draw the regression line.
y_pred = mod.predict(x)

# Observations coloured by incoming-call volume, with the fitted line in red.
plt.figure(figsize=(15, 10))
plt.scatter(x, y, c=y)
plt.plot(x, y_pred, c="red")
plt.xlabel('Abandoned Calls')
plt.ylabel('Incoming Calls')
plt.title('Incoming Calls and Abandonment')

In [None]:
#Multiple Regression Analysis - Abandoned Calls
# Predict 'Abandoned Calls' from every other column of ccd_time.
# NOTE(review): if abandoned calls are simply incoming minus answered calls,
# the predictors determine the target exactly and the score below says little
# about real predictive power — confirm which columns belong in the model.
x1 = ccd_time.drop(['Abandoned Calls'], axis=1).values
y1 = ccd_time['Abandoned Calls'].values

# Hold out 20% of the rows for evaluation; random_state fixes the split so
# the results are reproducible.
x_train, x_test, y_train, y_test = tts(x1, y1, test_size=0.2, random_state=0)
model = lr()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

# Spot-check: prediction for a single hand-entered row. The seven values must
# follow the column order of ccd_time with 'Abandoned Calls' removed.
pred2 = model.predict([[217,204,94.01,0.0,0.0,0.0,76.28]])
print(pred2)

# Model quality on the held-out rows. The value is the R^2 score, not a
# classification accuracy; the variable name is kept unchanged.
# r2_score is imported with the other imports at the top of the notebook.
accuracy = r2_score(y_test, y_pred)
print(accuracy)

# Actual vs. predicted values for the held-out rows.
plt.figure(figsize=(15, 10))
plt.scatter(y_test, y_pred)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual vs. Predicted Abandonment')

In [None]:
# Average wait time against incoming-call volume, with each point coloured by
# its abandoned-call count.
wait_scatter_options = dict(
    x='Incoming Calls',
    y='Waiting Time (AVG)',
    c='Abandoned Calls',
    figsize=(15, 10),
    title='Abandoned Incoming Calls by Average Wait Time',
    sharex=False,  # presumably keeps the x-axis label visible next to the colour bar — verify
)
ccd_time.plot.scatter(**wait_scatter_options)

# Replace the default axis labels with friendlier wording.
plt.xlabel('Incoming Call Volume', size=12)
plt.ylabel('Average Wait Time', size=12)

Conversely, Service Level (SLA) and Average Speed of Answer (ASA) have an inverse relationship.  The below chart illustrates that as call volume increases, SLA decreases and ASA goes up.  In conjunction with the above graph, this clearly illustrates how these interact with high abandonment. 

In [None]:

# Answer speed against service level, with each point coloured by its
# abandoned-call count.
sla_scatter_options = dict(
    x='Service Level (20 Seconds)',
    y='Answer Speed (AVG)',
    c='Abandoned Calls',
    figsize=(15, 10),
    sharex=False,  # presumably keeps the x-axis label visible next to the colour bar — verify
    title='SLA, ASA, and Abandoned Calls',
)
ccd_time.plot.scatter(**sla_scatter_options)


In [None]:
# Distribution of incoming-call volume for each distinct average wait time.
plt.figure(figsize=(15, 10))
sns.boxplot(data=ccd_time, x='Waiting Time (AVG)', y='Incoming Calls')
plt.title('Wait Time and Incoming Calls')

# Hide the x tick labels (one box is drawn per distinct wait-time value).
plt.xticks([])



Fortunately, the vast majority of incoming calls are above 60% SLA.  At a 60% SLA, the average call is answered in less than 1 minute.  Combining the data from the below and above graphs, this means virtually no calls are being abandoned at a 60% SLA, although ancillary factors (e.g. customer experience) may still be affected even when the call is not abandoned. 

Limitations of this dataset prevent a complete analysis of the impact of staffing and its relationships with abandonment, volume, etc.  But based on the findings, next steps would be to dig further (with an expanded data set) to determine the root cause of calls falling beneath the 60% threshold, and to determine acceptable business KPIs for this metric (SLA) that balance customer experience, staffing, and cost. 

In [None]:
# Histogram of the service-level column, split into 10 bins.
plt.figure(figsize=(15, 10))
sns.histplot(data=ccd_time, x='Service Level (20 Seconds)', bins=10)
plt.title('Contacts By SLA', size=15)
