In [None]:
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# import required libraries
import numpy as np
import pandas as pd
import datetime as dt

# for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# for Interactive Shells
from IPython.display import display

# setting up the chart size and background
plt.rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')

# For text preprocessing
import nltk
import re
from bs4 import BeautifulSoup as bs
nltk.download('stopwords')
from nltk.corpus import stopwords


In [None]:
# Locations of the two Comcast complaint datasets (relative to the notebook).
CONSUMER_AFFAIRS_CSV = '../input/comcastcomplaints/comcast_consumeraffairs_complaints.csv'
FCC_2015_CSV = '../input/comcastcomplaints/comcast_fcc_complaints_2015.csv'

# Load each CSV into its own DataFrame.
consumer_affairs_complaints = pd.read_csv(CONSUMER_AFFAIRS_CSV)
complaint_2015 = pd.read_csv(FCC_2015_CSV)

In [None]:
# Preview the first rows of each dataset, each preceded by a caption.
display("Dataset : Comcast Consumer Affairs Complaints",
        consumer_affairs_complaints.head())
print()
display("Dataset : comcast fcc complaints 2015",
        complaint_2015.head())

In [None]:
# Report (rows, columns) for each dataset, each preceded by a caption.
display("Shape of Dataset : Comcast Consumer Affairs Complaints",
        consumer_affairs_complaints.shape)
print()
display("Shape of Dataset : comcast fcc complaints 2015",
        complaint_2015.shape)

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would emit a stray "None" after each summary.
display("Information of Dataset : Comcast Consumer Affairs Complaints")
consumer_affairs_complaints.info()

display("Information of Dataset : comcast fcc complaints 2015")
complaint_2015.info()

<div class="alert alert-danger" role="alert"> <p style="font-weight: bold;font-size:20px;color:#2025bd">Observation: </p></div>
<div class="alert alert-success" role="primary"><p style=";font-size:15px;color: #2025bd">There are two datasets.</p>
<p style=";font-size:15px;color: #2025bd">We can perform sentiment analysis on the "Comcast Consumer Affairs Complaints" dataset.</p> 
<p style=";font-size:15px;color: #2025bd">For "Comcast fcc complaints 2015" dataset we can Narrow down the customer complaints to 5 different Labels, which will lead to gain better insights on customer complaint without having to go through them one-by-one for what each customer is complaining about.</p> 
<p style=";font-size:15px;color: #2025bd">No null values.</p>
</div>


<div class="alert alert-danger" role="alert"> <p style="font-weight: bold;font-size:25px;color:#2025bd">Data Preprocessing </p></div>

In [None]:
# Parse the 'Date' column of the FCC 2015 complaints to datetime (overwriting it)
# and derive two helper columns from it:
#   month     - month number
#   month_day - 'MM-DD' string used for the daily trend chart
# NOTE(review): the date format is inferred by pandas; if the raw values are
# day-first (e.g. DD-MM-YY), confirm they are parsed as intended.
parsed_dates = pd.to_datetime(complaint_2015["Date"])
complaint_2015["Date"] = parsed_dates
complaint_2015['month'] = parsed_dates.dt.month
complaint_2015['month_day'] = parsed_dates.dt.strftime('%m-%d')


# Check the resulting column dtypes
display(complaint_2015.dtypes)

<div class="alert alert-success" role="primary"><p style="font-weight: bold;font-size:20px;color: #2025bd">Trend chart for the number of complaints at monthly granularity level.</p>
</div>

In [None]:
# Number of complaints per month (count of 'Customer Complaint' values).
monthly_complaints = (
    complaint_2015[['Customer Complaint', 'month']]
    .groupby('month')
    .count()
)

# Table: months ordered from most to fewest complaints.
display(
    monthly_complaints
    .sort_values(by='Customer Complaint', ascending=False)
    .style.background_gradient(cmap='Reds')
)

# Trend line of complaints per month.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.lineplot(data=monthly_complaints.reset_index(), x='month', y='Customer Complaint')
plt.title("Time Series for Complaints on Monthly basis", fontsize = 20)
plt.xlabel("Months")
plt.show()

<div class="alert alert-danger" role="primary"><p style="font-weight: bold; font-size:20px; color:#2025bd" > Observations: </p> 
</div>
<div class="alert alert-success" role="primary"><p style="font-size:15px; color:#2025bd" > The 6th month (June) saw the highest number of complaints from customers: 1281.</p> 
<p style="font-size:15px; color:#2025bd" > The 5th month (May) saw the lowest number of complaints from customers: 399.</p> 
    <p style="font-size:15px; color:#2025bd" > An enormous increase in complaints was observed from May to June: precisely 882 more complaints.</p>
</div>

<div class="alert alert-success" role="primary"><p style="font-weight: bold;font-size:20px;color: #2025bd">Trend chart for the number of complaints at daily granularity levels.</p>
</div>

In [None]:
# How many rows each ranking table shows. The headings are built from these
# values so the caption can no longer disagree with the number of rows displayed.
N_BUSIEST_DAYS = 5
N_QUIETEST_DAYS = 10

# Number of complaints per calendar day ('MM-DD').
daily_complaints = (
    complaint_2015[['Customer Complaint', 'month_day']]
    .groupby('month_day')
    .count()
    .reset_index()
)

display(f"{N_BUSIEST_DAYS} Days with most Complaints")
display(
    daily_complaints
    .sort_values(by='Customer Complaint', ascending=False)
    .head(N_BUSIEST_DAYS)
    .style.background_gradient('Blues')
)

display(f"{N_QUIETEST_DAYS} Days with Least Complaints")
display(
    daily_complaints
    .sort_values(by='Customer Complaint')
    .head(N_QUIETEST_DAYS)
    .style.background_gradient('Oranges')
)

# Trend line of complaints per day.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.lineplot(data=daily_complaints, x='month_day', y='Customer Complaint')
plt.title("Time Series for Complaints on Daily basis", fontsize = 20)
plt.xticks(fontsize = 10, rotation = 90)
plt.xlabel("Days")
plt.ylabel("Complaints")
plt.show()

<div class="alert alert-danger" role="primary"><p style="font-weight: bold; font-size:20px; color:#2025bd" > Observations: </p> 
</div>
<div class="alert alert-success" role="primary"><p style="font-size:15px; color:#2025bd" > The line chart above shows the trend of complaints over days, with each day displayed in the format month_day (MM-DD).</p> 
<p style="font-size:15px; color:#2025bd" > As expected from the monthly trend chart, days in the 6th month (June) have the maximum number of complaints.</p> 
    <p style="font-size:15px; color:#2025bd" > We can further observe that complaints were maximum during 23 June to 25 June. 507 complaints being maximum</p>
    <p style="font-size:15px; color:#2025bd" > Complaints were minimum on 11th April. Exact count being 5.</p>
</div>



<div class="alert alert-danger" role="primary"><p style="font-weight: bold; font-size:20px; color:#2025bd" >Create a new categorical variable with value as Open and Closed. Open & Pending is to be categorized as Open and Closed & Solved is to be categorized as Closed.
</p> 
</div>

In [None]:
# Collapse the status values into two categories: 'Solved' -> 'Closed' and
# 'Pending' -> 'Open' (other values are left unchanged).
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: chained in-place mutation
# does not update the parent DataFrame under pandas Copy-on-Write.
complaint_2015['Status'] = complaint_2015['Status'].replace({'Solved': 'Closed', 'Pending': 'Open'})

display('Unique Values in Column Status: ', complaint_2015['Status'].nunique())

# Frequency table of the recoded status. The count column is named explicitly
# because its default label differs between pandas versions (0 vs 'count').
(
    complaint_2015['Status']
    .value_counts()
    .rename_axis('Status')
    .reset_index(name='Count')
    .style.background_gradient('crest')
)


<div class="alert alert-danger" role="primary"><p style="font-weight: bold; font-size:20px; color:#2025bd" > Table with the frequency of complaint types. Which complaint types are maximum i.e., around internet, network issues, or across any other domains.</p> 
</div>

In [None]:
# Creating a function substitute the words such as won't to will not etc...
def decontracted(phrase):
    """Expand common English contractions in a sentence.

    Input: a raw (not yet preprocessed) sentence as a string.
    Output: the sentence with contractions expanded, e.g. "won't" -> "will not",
    "i'm" -> "i am", "they're" -> "they are"; a trailing "'s" is dropped.

    Typographic apostrophes (U+2018 / U+2019) are normalised to "'" first so
    that text such as "won’t" is expanded the same way as "won't".
    """
    # Normalise curly apostrophes so every rule below can match them.
    phrase = phrase.replace("\u2019", "'").replace("\u2018", "'")

    # Order matters: the irregular forms must run before the generic "n't" rule,
    # and "n't" before the bare "'t" fallback. "wouldn't" and the misspelling
    # "did't" need no rule of their own; "n't" and "'t" already expand them to
    # "would not" / "did not".
    rules = (
        (r"won't", "will not"),
        (r"can't", "can not"),
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'ll", " will"),
        (r"'s", ""),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
    for pattern, replacement in rules:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

# Quick check of decontracted() on two sample complaints
sample_complaints = (
    "comcast won't cancel my service",
    "comcast does not disclose the internet speed i'm paying for anywhere on my bill or online",
)
for sample in sample_complaints:
    display(decontracted(sample))

In [None]:
# creating a function for preprocessing complaints
def preprocess_complaints(comp, stop_words=None):
    """Clean a single complaint string.

    input:
        comp: the complaint text (string).
        stop_words: optional collection of words to drop. When omitted, the
            NLTK English stop-word list is loaded once for this call.
    output: the preprocessed complaint. Steps, in order: strip markup with
        BeautifulSoup, lower-case, expand contractions with decontracted(),
        replace every run of non-letter characters with a single space, and
        remove stop words.
    """
    # Build the stop-word set once per call instead of once per token.
    if stop_words is None:
        stop_words = set(stopwords.words('english'))

    comp = bs(comp, 'lxml').get_text()  # keep only the text content
    comp = comp.lower()  # lower-case the complaint
    comp = decontracted(comp)  # "won't" -> "will not", etc.
    comp = re.sub('[^A-Za-z]+', ' ', comp)  # drop digits, punctuation and other special characters
    return " ".join(word for word in comp.split() if word not in stop_words)

# Load the stop words a single time and reuse them for every row, then store
# the cleaned text in a new column of the original dataset.
english_stop_words = set(stopwords.words('english'))
complaint_2015['Preprocessed_complaints'] = complaint_2015['Customer Complaint'].apply(
    lambda text: preprocess_complaints(text, english_stop_words)
)


In [None]:
# Creating a function for assigning complaint_types to different complaints raised by the users.
def complaint_type(row):
    """Label a complaint by the first keyword group found in its cleaned text.

    row: a mapping/row exposing 'Preprocessed_complaints' (string).
    Returns one of 'Internet Issues', 'Network/Speed Issues',
    'Customer_Services Issues', 'Comcast Issues' or 'Others'.

    Keywords are matched as substrings and the groups are tried in the order
    listed below, so earlier groups take precedence.
    """
    text = row['Preprocessed_complaints']
    keyword_groups = (
        (("internet", "data", "cap"), 'Internet Issues'),
        (("network", "speed"), "Network/Speed Issues"),
        (("service", "bill", "customer"), "Customer_Services Issues"),
        (("comcast",), "Comcast Issues"),
    )
    for keywords, label in keyword_groups:
        if any(keyword in text for keyword in keywords):
            return label
    return "Others"
        
# Label every complaint with complaint_type() and store the result in a new column.
# apply(..., axis=1) already returns a Series aligned to the rows, so it is
# assigned directly (no to_frame() needed).
complaint_2015['Complaint_Type'] = complaint_2015.apply(complaint_type, axis=1)

# Let's check what we did till now
display(complaint_2015[['Customer Complaint', 'Preprocessed_complaints', 'Complaint_Type']].head())

# Table of complaint types with their counts, most frequent first
display(
    complaint_2015.groupby(by='Complaint_Type')[['Customer Complaint']]
    .count()
    .reset_index()
    .rename({'Customer Complaint': "Count"}, axis=1)
    .sort_values(by='Count', ascending=False)
    .style.background_gradient("copper_r")
)

# Count plot of the complaint types.
# The column is passed as x= with data=: in seaborn >= 0.12 the only positional
# argument is `data`, so a positional Series is no longer read as x.
sns.countplot(x='Complaint_Type', data=complaint_2015, palette='copper')
plt.title("Count of Types of complaints", fontsize = 20)
plt.xticks(fontsize = 15)
plt.xlabel(" ")
plt.ylabel("Complaints Counts")
plt.show()

<div class="alert alert-danger" role="primary"><p style="font-weight: bold; font-size:20px; color:#2025bd" > Observations: </p> 
</div>
<div class="alert alert-success" role="primary"><h3 style="font-size:15px; color:#2025bd" > 1. Above results, in order, shows table with  </h3>
<p style="font-size:15px; color:#2025bd" > - complaints, preprocessed complaints and complaint types. </p> 
<p style="font-size:15px; color:#2025bd" > - Complaint_types and Counts</p> 
<p style="font-size:15px; color:#2025bd" > - Countplot of Complaint_Type</p> 
    <p style="font-size:15px; color:#2025bd" > 2. We see that {Type:Internet issues} have the maximum complaints. Exact count being 751.</p>
    <p style="font-size:15px; color:#2025bd" > 3. {Type:Network/Speed issues} have the minimum complaints when compared to other types. Exact count 85.</p>
</div>

<div class="alert alert-danger" role="primary"><p style="font-weight: bold; font-size:20px; color:#2025bd" > {Type:Internet issues} consists of words: {internet, data, cap}. Hence we could say most of customers have problem around internet, data and cap(capacity).</p> 
</div>


In [None]:
# Finally we can even get percentage of the complaint types, by following percentage
def get_simple_topic_percentage(topic):
    """
    Return the percentage of rows of complaint_2015 whose 'Complaint_Type'
    contains `topic`, rounded to two decimals.

    topic: the complaint category label to look for. It is matched
        case-insensitively as literal text (regex=False), so characters such
        as '/', '.' or '(' in a label are never interpreted as a pattern.
    """
    complaint_types = complaint_2015['Complaint_Type']
    matches = complaint_types.str.contains(topic, case=False, regex=False)
    return round(int(matches.sum()) / len(complaint_types) * 100, 2)
    


# Map each complaint type to a one-element list holding its percentage.
# The order here fixes the row order of the table below.
complaint_topics = [
    'Internet Issues',
    'Customer_Services Issues',
    'Comcast Issues',
    'Others',
    'Network/Speed Issues',
]
percentage = {topic: [get_simple_topic_percentage(topic)] for topic in complaint_topics}

# Build a one-column table (one row per complaint type) and style it.
# Note: df_percentage holds the styled view (a Styler), not a plain DataFrame.
percentage_frame = pd.DataFrame(percentage).transpose()
percentage_frame = percentage_frame.rename({0: 'Percentage'}, axis =1)
df_percentage = percentage_frame.style.background_gradient("copper_r")


df_percentage

<div class="alert alert-danger" role="primary"><p style="font-weight: bold; font-size:20px; color:#2025bd" > State wise status of complaints in a stacked bar chart</p> 
</div>


In [None]:
# Total number of complaints per state, regardless of status.
complaints_per_state = complaint_2015.groupby(['State']).size()

display('Highest Complaints State wise:')
most_complaints = (
    complaints_per_state.sort_values(ascending = False)
    .to_frame()
    .head(6)
    .reset_index()
    .rename({0: "Count"}, axis = 1)
)
display(most_complaints.style.background_gradient('bone_r'))
print('\n')
display('Lowest Complaints State wise:')
fewest_complaints = (
    complaints_per_state.sort_values()
    .to_frame()
    .head(6)
    .reset_index()
    .rename({0: "Count"}, axis = 1)
)
display(fewest_complaints.style.background_gradient('Purples_r'))



<div class="alert alert-danger" role="primary"><p style="font-weight: bold; font-size:20px; color:#2025bd" > Observations: </p> 
</div>
<div class="alert alert-success" role="primary"><p style="font-size:15px; color:#2025bd" > 1. The tables above show the highest and lowest complaint counts state-wise, without taking into account whether a complaint is still open or already closed.  </p>
<p style="font-size:15px; color:#2025bd" >2. We see that Georgia, Florida and California have the highest numbers of complaints, with counts of 289, 240 and 220 respectively.</p> 
<p style="font-size:15px; color:#2025bd" >3. Nevada, Iowa, Montana, Rhode Island and the District of Columbia have the lowest numbers of complaints, with a count of 1 each.</p> 
<p style="font-size:15px; color:#2025bd" > 4. We need better tables which shows even the open and closed complaints separately to see what actually is going on behind these numbers.</p> 
</div>


In [None]:
# Complaints per state split by status (one column per status value).
# fill_value=0 fills state/status combinations with no complaints BEFORE any
# sorting. Sorting first would push states with no closed complaints (NaN) to
# the end, hiding them from the "lowest" table where they belong at the top.
# It also keeps the counts as integers.
Status_complaints = complaint_2015.groupby(["State", "Status"]).size().unstack(fill_value=0)

display('Highest Open and Closed Complaints State wise:')
display(
    Status_complaints.sort_values(by='Closed', ascending = False)
    .head(7)
    .style.background_gradient('bone_r')
)
print('\n')
display('Lowest Open and Closed Complaints State wise:')
display(
    Status_complaints.sort_values(by='Closed')
    .head(7)
    .style.background_gradient('Blues')
)
print('\n')

# Stacked bar chart: one bar per state, segments per status.
Status_complaints.plot(kind="bar", stacked=True, colormap='copper')
plt.title("Stacked bar chart for State wise status of complaints", fontsize = 20)
plt.xticks(fontsize = 15)
plt.xlabel("States")
plt.ylabel("Complaints_Stacked")
plt.show()

<div class="alert alert-danger" role="primary"><p style="font-weight: bold; font-size:20px; color:#2025bd" > Observations: </p> 
</div>
<div class="alert alert-success" role="primary"><p style="font-size:15px; color:#2025bd" > 1. Above two tables shows the Highest and Lowest Open and Closed complaints state wise. </p>
<p style="font-size:15px; color:#2025bd" >2. We see that Georgia is highest for both Open and Closed complaints. </p> 
<p style="font-size:15px; color:#2025bd" >3. Stacked Bar Chart clearly shows the closed complaints are more than open complaints overall. </p> 
<p style="font-size:15px; color:#2025bd" >4. We also observe that Florida has good closing rate in terms of complaints as they have 201 Closed complaints and only 39 open complaints.</p> 
</div>

<div class="alert alert-danger" role="primary"><p style="font-weight: bold; font-size:20px; color:#2025bd" > Which state has the maximum complaints</p> 
</div>

In [None]:
display('State with Highest Complaints:')
# Count complaints per state, order from most to fewest, keep only the top row.
state_totals = complaint_2015.groupby(['State']).size()
top_state = (
    state_totals.sort_values(ascending = False)
    .to_frame()
    .head(1)
    .reset_index()
    .rename({0: "Count"}, axis = 1)
)
display(top_state.style.background_gradient('Reds_r'))

In [None]:
display('State with Highest Open and Closed Complaints:')
# Complaints per state split by status, ordered by closed complaints
# (descending); only the first row is shown, with missing combinations as 0.
status_by_state = complaint_2015.groupby(["State","Status"]).size().unstack()
ranked_by_closed = status_by_state.sort_values(by='Closed', ascending = False)
top_by_closed = ranked_by_closed.fillna(0).head(1)
display(top_by_closed.style.background_gradient('Reds_r'))


<div class="alert alert-danger" role="primary"><p style="font-weight: bold; font-size:20px; color:#2025bd" > Which state has the highest percentage of unresolved complaints</p> 
<p style="font-weight: bold; font-size:20px; color:maroon" >Formula used : (open complaints in the state / total open complaints across all states) * 100</p> 
</div>

In [None]:
# Build a State x Status count table (missing combinations as 0) and keep
# only the state name and its number of open complaints.
counts_by_state = complaint_2015.groupby(['Status', 'State']).size().unstack().fillna(0).transpose()
complaints = counts_by_state.reset_index()[['State','Open']]

# Each state's share of ALL open complaints, in percent.
# NOTE(review): this is not the state's own open/total ratio; confirm this is
# the metric intended by "highest percentage of unresolved complaints".
total_open = complaints['Open'].sum()
complaints['percentage'] = (complaints['Open'] / total_open) * 100
display("State with highest percentage of unresolved complaints")
complaints.sort_values(by ='percentage', ascending = False ).head(1).style.background_gradient('Reds_r')