In [None]:
# necessary libraries 
import numpy as np 
import pandas as pd 
import datetime as dt

# for ploting and exploratory data analysis 
import seaborn as sns 
import matplotlib.pyplot as plt 

__Reading the data__

In [None]:
# Location of the complaints CSV, written as a plain absolute path
# (the doubled slashes in the earlier spelling carried no meaning).
DATA_PATH = "/kaggle/input/us-consumer-finance-complaints/consumer_complaints.csv"

# low_memory=False parses the file in a single pass so each column gets one inferred dtype.
data = pd.read_csv(DATA_PATH, low_memory=False)
data.head()

<h2> Preprocessing</h2>

__Check for number of records__ 

In [None]:
# (rows, columns) of the raw frame
n_rows, n_cols = data.shape
print((n_rows, n_cols))

__Check for unique entries across data__

In [None]:
# Number of distinct non-null values per column, computed in one pass.
for column_name, n_unique in data.nunique(dropna=True).items():
    print(column_name, ":", n_unique)

The column <em>"Complaint ID"</em> has all unique entries, we can use __Complaint ID__ as index

In [None]:
# Use complaint_id as the row index. verify_integrity=True raises if any id is
# duplicated, so the uniqueness this step relies on is checked rather than assumed.
data = data.set_index("complaint_id", verify_integrity=True)
data.head()

__Data Imputation__ 

In [None]:
def missing_summary(data):
    """Return the percentage of missing values in each column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    dict
        Maps every column label (in column order) to the share of null
        entries in that column, as a percentage rounded to two decimals.
        For a frame with zero rows every value is NaN.
    """
    total_rows = data.shape[0]
    if total_rows == 0:
        # No rows: the ratio is undefined; report NaN instead of dividing by zero.
        return {column: float("nan") for column in data.columns}

    missing_rows = data.isnull().sum()
    # Iterate label/value pairs instead of indexing the Series with integers:
    # integer keys on a label-indexed Series are a deprecated positional
    # fallback and are treated as labels when the columns are integers.
    # Scale to percent *before* rounding so no float artefacts leak out.
    return {
        column: round(float(n_missing) / total_rows * 100, 2)
        for column, n_missing in missing_rows.items()
    }

# Column label -> percentage of missing entries, for the raw frame.
missing_data = missing_summary(data=data)

__Summary Table for Missing Data__ <br> 
<ul><li>The percentage of missing values helps in making decisions such as removing or replacing the data.</li></ul>

In [None]:
# Turn the {column: percentage} mapping into a two-column table
# ("index", "Percentage Missing") and show the worst columns first.
missing_data = pd.Series(missing_data, name="Percentage Missing").reset_index()
missing_data.sort_values(by="Percentage Missing", ascending=False)

__Top column with missing data is__ <em>Tags</em>

In [None]:
# Frequency of each tag (missing values are not counted). Printed explicitly:
# a bare expression that is not the last line of a cell displays nothing.
print(data["tags"].value_counts())

plt.figure(figsize=(11, 6))
# Pass the Series as x=: the first positional parameter of countplot is `data`, not `x`.
sns.countplot(x=data["tags"])
plt.show()

__Background information__ 
<ul><li>Data that supports easier searching and sorting of complaints submitted by or on behalf of consumers</li>
    <li> The services for Older Americans and Servicemen differ from typical customer</li> 
    <li>The tags can't be removed from the data as it gives vital information about customer and the type of service they receive.</li>
</ul>
&nbsp;<a>https://www.consumerfinance.gov/practitioner-resources/resources-for-older-adults/</a>

__Plan of Action__ 
<ul><li>Replace missing values with category "Others"</li>
    <li>Each label to be encoded post visualization</li></ul> 

In [None]:
# Complaints without a tag get the explicit category "Others".
data["tags"] = data["tags"].fillna("Others")
data["tags"].head(10)

In [None]:
# Tag frequencies after imputation. Printed explicitly: a bare expression
# that is not the last line of a cell displays nothing.
print(data["tags"].value_counts())

plt.figure(figsize=(11, 6))
# Pass the Series as x=: the first positional parameter of countplot is `data`, not `x`.
sns.countplot(x=data["tags"])
plt.show()

<hr>

<h4>Consumer consent provided?</h4>
<br>
<strong>Background information</strong> 
<ul><li>Whether a consumer opted in to publish their complaint narrative</li> 
    <li>This column has no bearing on the analysis, mostly concerned with privacy</li>
    </ul>
    <strong> Plan of Action</strong> 
    <ul><li>This column is to be removed</li></ul>
    

In [None]:
# Distinct values recorded in the consent column (NaN included if present).
pd.unique(data["consumer_consent_provided"])

In [None]:
# Remove the consent column from the working frame.
data.drop(columns=["consumer_consent_provided"], inplace=True)

<hr>

__Consumer complaint narrative__
<ol><li> This is a transcript or summary of the complaint by the consumer, provides detailed description for complex cases</li> 
<li> Although this text can be processed and the top consumer grievances extracted from it, the same overview can be gathered from data within other fields like <em>"Issue" and "Sub-Issue"</em></li>
    <li>The objective is to perform Cluster analysis, extracting features from consumer narrative when 83% of data is missing is not ideal </li></ol>  
<strong>Plan of Action</strong> 
<ul>This column is to be removed</ul>

In [None]:
# Distinct complaint narratives (NaN included if present).
pd.unique(data["consumer_complaint_narrative"])

In [None]:
# Remove the free-text narrative column from the working frame.
data.drop(columns=["consumer_complaint_narrative"], inplace=True)

<hr>

<h4> Company public response</h4>
<br>
<strong>Background information</strong> 
<ul><li>A company response consists of one of ten standard responses</li> 
    <li>Company response for 78% of the cases is missing, implies that a bank doesn't issue a response for majority of the complaints</li>
    </ul>
    <strong> Plan of Action</strong> 
    <ul><li>Replace missing values with <em>"No Response"</em></li>
    <li>Encode each category to check if the <em>Company response</em> has any bearing on complaints being disputed by the consumer</li></ul>
    Note: No Response is different from cases where <em>"Company chooses not to provide a public response"</em> which is an official position of said company. 

In [None]:
# Count of each public response, most frequent first; missing values are excluded.
data["company_public_response"].value_counts(sort=True, ascending=False, dropna=True)

In [None]:
# Number of complaints that carry a (non-null) public response.
data["company_public_response"].notna().sum()

<h5>The following is a visual break up to complaints where a company issued a response</h5>

In [None]:
# Share of each public response among complaints that have one.
sizes = data["company_public_response"].value_counts()
# Take the labels from the counts themselves. A hand-written list is matched to the
# slices purely by position, so it mislabels them as soon as the frequency order
# (or the number of categories) differs from what was assumed.
labels = list(sizes.index)

cmap = plt.get_cmap("tab20c")
# One colour per slice, every second entry of the palette, wrapping around
# if there are more slices than the palette can supply.
colors = cmap((np.arange(len(sizes)) * 2) % cmap.N)

fig1, ax1 = plt.subplots()
fig1.set_size_inches(10, 10)
ax1.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True, colors=colors, labeldistance=1.05,
        textprops={'fontsize': 14})
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

In [None]:
# Replace null values in company_public_response with "No response".
# Assign the result back to the column: an inplace fillna on data[...] acts on an
# intermediate Series and leaves `data` untouched under pandas Copy-on-Write.
data["company_public_response"] = data["company_public_response"].fillna("No response")

In [None]:
# Confirm no nulls remain, then show the category counts.
response_col = data["company_public_response"]
print("Missing entries in column -", response_col.isna().sum())
print("Company public response")
response_col.value_counts()

<hr>

In [None]:
# Public responses for the complaints whose dispute flag is missing.
dispute_missing = data["consumer_disputed?"].isna()
data.loc[dispute_missing, "company_public_response"].value_counts()

In [None]:
# Rows that have no state recorded.
data.loc[data["state"].isna()]

<hr>

<h4> Location: State and Zipcode</h4>
<br>
<strong>Background information</strong> 
<ul><li>One percent of the total complaints do not have the location information</li><li>Location includes all 50 states and military, commonwealths, and territories in United States of America</li><a>https://www.50states.com/abbreviations.htm</a> </ul>
    <strong> Plan of Action</strong> 
    <ul><li>Further analysis is needed to determine whether the location of the consumer plays a role in resolving a complaint</li>
    <li><strong><em>If we can prove complaints are treated equally regardless of their Location, then Location information is not needed for analysis</em></strong></li>
        <li>Remove less than one percent of data with missing location</li></ul>

In [None]:
# Number of rows with no state recorded.
data["state"].isna().sum()

In [None]:
# Complaints per state (non-null `product` entries), largest first.
Statewise_Product_complaints = (
    data.groupby("state")[["product"]]
    .count()
    .sort_values(by="product", ascending=False)
)

# Horizontal bar chart: one bar per state.
plt.figure(figsize=(19, 19))
sns.barplot(
    data=Statewise_Product_complaints,
    x="product",
    y=Statewise_Product_complaints.index,
    palette="Blues_d",
)
plt.title("Statewise Complaints")
plt.show()

In [None]:
# Per-state product mix: rows are states, each row sums to 1.
Products_Across_State = pd.crosstab(
    index=data["state"],
    columns=data["product"],
    normalize="index",
)

In [None]:
# Products as rows, states as columns, each column summing to 1 (product mix per state).
# Built directly from `data` instead of transposing the previous result in place, so
# re-running this cell cannot flip the table back to the other orientation.
Products_Across_State = pd.crosstab(data["product"], data["state"], normalize="columns")
Products_Across_State.head()

In [None]:
def _sparse_ticklabels(labels, max_labels=10):
    """Keep every step-th label (step = len(labels) // max_labels, at least 1) and blank the rest."""
    # max(1, ...) guards against a zero slice step when there are fewer than max_labels labels.
    step = max(1, len(labels) // max_labels)
    return [label if position % step == 0 else '' for position, label in enumerate(labels)]


plt.figure(figsize=(20, 20))
yticks = _sparse_ticklabels(Products_Across_State.index)
xticks = _sparse_ticklabels(Products_Across_State.columns)

sns.heatmap(Products_Across_State, yticklabels=yticks, xticklabels=xticks, square=True,
            cbar_kws={'fraction': 0.01}, cmap='OrRd', linewidth=1.5)

# This sets the yticks "upright" with 0, as opposed to sideways with 90.
plt.yticks(rotation=0)

plt.show()

<ul>
    <li>The heatmap of proportion of <em>financial Products</em> across all states is similar except for Mortgage </li>
    <li>Further proof can be obtained with a Chi-squared test, to find out whether all the groups are similar or not</li>
</ul>

In [None]:
from scipy.stats import chi2_contingency

# chi2_contingency expects observed frequencies (counts). Feeding it normalised
# proportions shrinks the statistic arbitrarily and makes the p-value meaningless,
# so the test is run on the raw product-by-state count table.
product_state_counts = pd.crosstab(data["product"], data["state"])

# e_table holds the expected frequencies under independence (rows: products, columns: states).
chi_stat, p_value, dof, e_table = chi2_contingency(product_state_counts)
print("Chi Statistic = ", round(chi_stat, 3))
print("P-value =", p_value)

<ul>
   <li>The p-value indicates whether we must reject the null hypothesis, i.e. whether the usage of products across different states may not be the same</li>
<li><strong>Given the similar proportions, we would expect the test to find that the groups are similar and that the variables are independent (fail to reject the null hypothesis, or H0).</strong></li>
    <li><em>All financial products are used similarly across all states</em></li>
    </ul>
    

In [None]:
# First row of the expected-frequency table from the latest chi2_contingency call.
e_table[0:1, :]

<ul>
<li>If all products are used irrespective of states, logically it follows that <strong>issues arising from these products are also similar across states</strong></li>
</ul>

In [None]:
# How often each issue occurs, most frequent first.
data["issue"].value_counts()

In [None]:
# Observed counts of every issue per state (rows: states, columns: issues).
Statewise_Issues = pd.crosstab(index=data["state"], columns=data["issue"])
Statewise_Issues

In [None]:
# Chi-squared test of independence between state and issue, on the observed counts.
issue_test = chi2_contingency(Statewise_Issues)
chi_stat, p_value, dof, e_table = issue_test
print("Chi Statistic = ", chi_stat)
print("P-value =", p_value)

<ul><li>Contrary to our initial assumption, location is not an independent factor when considering the Issue of the complaint</li></ul>
<h5>Verdict</h5> 
The State column is vital for data analysis, it cannot be removed

<h4> Location: Zipcode</h4>
<br>
<strong>Background information</strong> 
<ul><li>The granularity of location can be analyzed on the basis of State; localization to a specific zipcode can be eliminated to simplify the analysis</li> </ul>
    <strong> Plan of Action</strong> 
    <ul><li>Zipcodes are to be removed</li></ul>

In [None]:
# State-level granularity is kept; the zipcode column is removed.
data.drop(columns=["zipcode"], inplace=True)
data.columns

In [None]:
# Parse both date columns from strings into datetimes.
for date_column in ("date_received", "date_sent_to_company"):
    data[date_column] = pd.to_datetime(data[date_column])

In [None]:
# Time between date_received and date_sent_to_company, as a timedelta.
data["Forwarding_time"] = data["date_sent_to_company"].sub(data["date_received"])
data.head()

<h4> Dates - Received and Sent to the company</h4>
<br>
<strong>Background information</strong> 
<ul><li>These dates indicate the date at which the complaint has been forwarded to CFPB and not the date at which complaint has been filed with the Bank or the organization. </li> 
    <li>The Timely Response? column indicates whether the company has responded or not; the date at which a third party is made aware of the complaint has no bearing on the complaint itself. </li>
<li>Furthermore, our derived column Forwarding time shows that these dates have no effect on the resolution or timely response variables</li></ul>
    <strong> Plan of Action</strong> 
    <ul><li>All dates are to be removed</li></ul>

In [None]:
# Remove both raw date columns and the derived forwarding time.
date_columns = ["date_received", "date_sent_to_company", "Forwarding_time"]
data.drop(columns=date_columns, inplace=True)
data.columns

<h4>Sub-Product and Sub-Issue</h4>
<br>
<strong>Background information</strong> 
<ul><li>The specificity of product and issue is available for less than 40 percent of the data </li> 
    <li>Simplification of these categories can be achieved by grouping, which is already done with Product and Issue</li></ul>
    <strong> Plan of Action</strong> 
    <ul><li>Both columns are to be removed</li></ul>

In [None]:
# Keep the coarse product/issue columns; remove their sub-level counterparts.
data.drop(columns=["sub_product", "sub_issue"], inplace=True)
data.columns

<h4>Consumer Disputed? </h4>
<br>
<strong>Background information</strong> 
<ul><li>6 percent of the data is missing</li> </ul>

<strong> Plan of Action</strong>
<ul><li>Imputation can be done by filling the most common responses</li></ul>

In [None]:
# Impute missing dispute flags with the most frequent answer actually observed.
# mode() returns a Series, so take its first entry; fall back to "No" only if the
# column holds no non-null value at all.
observed_modes = data["consumer_disputed?"].mode()
common_response = observed_modes.iloc[0] if not observed_modes.empty else "No"

# Assign back to the column: an inplace fillna on data[...] acts on an intermediate
# Series and leaves `data` untouched under pandas Copy-on-Write.
data["consumer_disputed?"] = data["consumer_disputed?"].fillna(common_response)

<h3> Timely Response per Product</h3>

In [None]:
# Number of complaints per product among those answered in time.
plt.figure(figsize=(21, 6))
timely_products = data.loc[data["timely_response"] == "Yes", "product"]
# Pass the Series as x=: the first positional parameter of countplot is `data`, not `x`.
chart = sns.countplot(x=timely_products, palette='Set1')
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right', fontweight='light', fontsize='x-large')
plt.title("Timely Response? Yes")
plt.show()

In [None]:
# Number of complaints per product among those NOT answered in time.
plt.figure(figsize=(21, 6))
late_products = data.loc[data["timely_response"] == "No", "product"]
# Pass the Series as x=: the first positional parameter of countplot is `data`, not `x`.
chart = sns.countplot(x=late_products, palette='Set1')
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right', fontweight='light', fontsize='x-large')
plt.title("Timely Response? No")
plt.show()

<h4>Submitted Via </h4>
<br>
<strong>Background information</strong> 
<ul><li>Indicates the medium through which the consumer contacted the CFPB; it has no bearing on the analysis</li> </ul>

<strong> Plan of Action</strong>
<ul><li>Column to be removed</li></ul>

In [None]:
# Share of all complaints received through each submission channel.
submitted = data["submitted_via"].value_counts()
for channel, count in submitted.items():
    print(channel, round(count / data.shape[0], 4))

In [None]:
# Complaints without a timely response, per channel, as a share of ALL complaints
# (the denominator is the full frame, not the subset).
not_timely = data[data["timely_response"] == "No"]
submitted = not_timely["submitted_via"].value_counts()
for channel, count in submitted.items():
    print(channel, round(count / data.shape[0], 4))

In [None]:
# Complaints with a timely response, per channel, as a share of ALL complaints
# (the denominator is the full frame, not the subset).
timely = data[data["timely_response"] == "Yes"]
submitted = timely["submitted_via"].value_counts()
for channel, count in submitted.items():
    print(channel, round(count / data.shape[0], 4))

In [None]:
# Remove the submission-channel column.
data.drop(columns=["submitted_via"], inplace=True)

In [None]:
# Re-evaluate the missing-data percentages on the reduced frame.
missing_data = pd.Series(missing_summary(data), name="Percentage Missing").reset_index()
missing_data.sort_values(by="Percentage Missing", ascending=False)

In [None]:
# Drop every row that still contains at least one null value.
data.dropna(axis=0, how="any", inplace=True)

In [None]:
# Re-evaluate the missing-data percentages after dropping incomplete rows.
missing_data = pd.Series(missing_summary(data), name="Percentage Missing").reset_index()
missing_data.sort_values(by="Percentage Missing", ascending=False)

In [None]:
# Encode the data for clustering: make every column categorical, then expand
# each category into its own indicator column.
cols = data.columns
data = data.astype({col: 'category' for col in cols})

encoded_data = pd.get_dummies(data[cols], columns=cols)

<h4>Creating a random sample of the encoded dataset</h4> 
<p> For reduced computation and easier clustering <br>Typical sampling problems, such as class imbalance and obtaining a representative sample, are addressed by converting each category into encoded features</p>

In [None]:
# (rows, indicator columns) of the one-hot encoded frame.
# NOTE(review): only the shape is inspected here; this cell draws no random sample from encoded_data.
encoded_data.shape

<h4>Kmeans Algorithm</h4> 
<em>The Elbow Method to find the number of Optimal Clusters</em>

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# The silhouette coefficient needs pairwise distances, which is quadratic in the
# number of rows; it is therefore estimated on a fixed-size random subset.
SILHOUETTE_SAMPLE_SIZE = 10000
cluster_range = range(2, 11)

wcss = []   # within-cluster sum of squares (inertia) for each k
score = []  # sampled silhouette score for each k

for i in cluster_range:
    km = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    cluster_labels = km.fit_predict(encoded_data)
    wcss.append(km.inertia_)
    sil_scr = silhouette_score(
        encoded_data,
        cluster_labels,
        # never request more rows than exist
        sample_size=min(SILHOUETTE_SAMPLE_SIZE, len(encoded_data)),
        # fixed seed so the sampled score is reproducible across runs
        random_state=0,
    )
    score.append(sil_scr)

# Elbow plot: inertia against the number of clusters.
plt.plot(list(cluster_range), wcss)
plt.title('The Elbow Method', fontsize=20)
plt.xlabel('No. of Clusters')
plt.ylabel('wcss')
plt.show()