In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
# import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.linear_model import PassiveAggressiveClassifier
from xgboost import XGBClassifier


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below /kaggle/input, then print each one.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')

In [None]:
# Read the consumer-complaint CSV into the working DataFrame `data`.
COMPLAINTS_CSV_PATH = '../input/consumer-complaint-database/rows.csv'
data = pd.read_csv(COMPLAINTS_CSV_PATH)

# Analysing the Dataset

In [None]:
# Show the first five rows of the freshly loaded dataset.
data.head(n=5)

In [None]:
# Structural overview of the dataset: rank, shape and the list of columns.
column_names = data.columns
print("Number of dimensions in dataset : ", data.ndim)
print("Dimensions of dataset : ", data.shape)
print("Number of Features in the dataset: ", len(column_names))
print("List of features in the dataset: ")
for position, column_name in enumerate(column_names, start=1):
    # One tab-indented, 1-based numbered line per column.
    print("\t" + str(position) + ". " + column_name)

# Non-null counts per column give a first idea of where values are missing.
non_null_counts = data.count()
print("Number of elements in each feature of dataset:\n", non_null_counts)

# Explicit count of missing values in each column.
missing_counts = data.isna().sum()
print("\n\nCount of NaN values in each column:\n ", missing_counts)

# Data Cleaning

In [None]:
# Dropping some less relevant columns

# column_list = ['State','ZIP code', 'Tags', 'Consumer consent provided?']
# data.drop(columns = column_list,inplace = True)
# data.dropna(inplace = True)

In [None]:
# Normalise column names in one pass: trim surrounding whitespace, turn hyphens
# and spaces into underscores, then Title-Case the result.
data.columns = [
    col.strip().replace('-', "_").replace(' ', "_").title()
    for col in data.columns
]

# Give the two most verbose columns shorter names.
short_names = {'Consumer_Complaint_Narrative':'Complaint', 'Company_Public_Response':'Response'}
data.rename(columns = short_names, inplace = True)
print(data.columns)


In [None]:
# Count the missing values in each column again, now under the renamed columns.
nan_per_column = data.isna().sum()
print("\n\nCount of NaN values in each column:\n ", nan_per_column)

In [None]:
# Keep only the rows that actually carry a complaint narrative.
has_complaint = data['Complaint'].notna()
data = data.loc[has_complaint]
print(data.shape)

In [None]:
# Distinct values of the Product column and how many there are.
product_column = data['Product']
unique_ele = product_column.unique()
product_count = len(unique_ele)
print("Number of unique elements in Product : ", product_count)
print("\n\nUnique elements in Product : ", unique_ele)

In [None]:
# Complaint count for every product.
temp_series = data.groupby('Product').size()
print(temp_series)

# The same counts, largest first.
temp_series = temp_series.sort_values(ascending = False)
print("\nAfter sorting in Descending Order :\n\n",temp_series)

In [None]:
# Products kept for the analysis (five classes).
# Disabled alternative: list(temp_series[2:7].index) picks five products by rank instead.
product_list = [
    'Student loan',
    'Mortgage',
    'Credit reporting',
    'Bank account or service',
    'Credit card',
]

print("\n The Product list is : ", product_list)



In [None]:
# Keep an independent snapshot of the frame before it is narrowed to the selected products.
data_orig = data.copy(deep = True)

In [None]:
# Keep only the complaints that belong to the selected products.
#
# reset_index(drop=True) renumbers the surviving rows 0..n-1 and returns a new
# frame, so later cells can add columns to it without chained-assignment
# warnings. The previous `data.index = [np.arange(0, length)]` wrapped the
# array in a list, which pandas interprets as a list of index levels and turns
# into a one-level MultiIndex rather than a plain integer index.
selected_rows = data['Product'].isin(product_list)
data = data.loc[selected_rows].reset_index(drop = True)

# Number of complaints left after filtering.
length = len(data)
print(length)

### There are 3 approaches to convert Categorical Values to Numerical Figures
       1. Label-Encoding (we are using this here): intended for encoding the target variable.
       2. One-Hot-Encoding: applied to the input (training) variables.
       3. Find & Replace: find the categorical values and replace each one with a numerical value.

In [None]:
# Encode the Product labels as integer class ids in a new Product_Encoding column.
encoder = LabelEncoder()
encoder.fit(data['Product'])
data['Product_Encoding'] = encoder.transform(data['Product'])

# encoder.inverse_transform(data['Product_Encoding']) maps the ids back to product names;
# encoder.classes_ holds the list of products.
data.head()

In [None]:
# Preparation for stopword removal: make sure every complaint is a Python string.
complaint_as_text = data['Complaint'].astype(str)
data['Complaint'] = complaint_as_text

# Show the resulting column dtypes.
data.dtypes

In [None]:
# Extract the text, the label and the encoded label, and save them to Complaint_data.csv.
final_data = data[['Complaint' , 'Product' , 'Product_Encoding']]

print(final_data.shape)
# `index` must be the boolean False. The string 'False' used before is truthy,
# so the row index was still being written as an extra leading CSV column.
final_data.to_csv('Complaint_data.csv' , index=False)

In [None]:
# English stopwords from NLTK, held in a set for O(1) membership tests.
stop_words = set(stopwords.words('english'))


def _remove_stopwords(text):
    """Return `text` with stopword tokens dropped.

    Tokens are obtained by splitting on whitespace and are re-joined with a
    single space. The original casing of the kept tokens is preserved.
    """
    # Compare in lower case: the NLTK list is lower-case, so a case-sensitive
    # check lets capitalised stopwords ("The", "I", "And", ...) slip through.
    return " ".join(word for word in text.split() if word.lower() not in stop_words)


data['Filtered_Text'] = data['Complaint'].apply(_remove_stopwords)
data.head()

In [None]:
# new_df = data.groupby('Product').head(7000)
# print(new_df['Product'].value_counts())      

# data = new_df

In [None]:
# Separate the encoded target from the inputs, then hold out 30% of the rows for testing.
target = data['Product_Encoding']
X = data.drop('Product_Encoding', axis = 1)
print("\n List of Input : ",X.columns)

X_train, X_test, target_train, target_test = train_test_split(
    X, target, test_size = 0.30, random_state = 25
)

# Row counts: full input set first, then each of the four split parts.
for split_part in (X, X_train, X_test, target_train, target_test):
    print(len(split_part))


In [None]:
# Fit TF-IDF on the training text only; the vocabulary is capped via max_features = 5000.
vectorizer_1 = TfidfVectorizer(max_features = 5000)
Y = vectorizer_1.fit_transform(X_train['Filtered_Text'])

print("\n Shape : ", Y.shape)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases, keeping
# `vocab` a plain list in both cases.
if hasattr(vectorizer_1, 'get_feature_names_out'):
    vocab = list(vectorizer_1.get_feature_names_out())
else:
    vocab = vectorizer_1.get_feature_names()



In [None]:
# Vectorise the test text with the vectorizer already fitted on the training
# text, so both matrices share the same vocabulary and column order.
# (The former `.T` was dropped: transposing a Series is a no-op.)
Z = vectorizer_1.transform(X_test['Filtered_Text'])
print(" Shape : ", Z.shape)

In [None]:
# Decision tree classifier (depth limited to 10, fixed seed) trained on the TF-IDF features.
model_type_1 = DecisionTreeClassifier(max_depth = 10, random_state = 20)
model = model_type_1.fit(Y,target_train)
output = model.predict(Z)

# Print the confusion matrix explicitly: only the last expression of a cell is
# displayed, so the bare call used before computed the matrix and discarded it.
print(confusion_matrix(target_test, output))
accuracy_score(target_test,output)

In [None]:
# Number of test samples in each encoded class.
test_labels = pd.Series(target_test)
test_labels.value_counts()

In [None]:
# Random forest classifier: 10 trees, fixed seed.
model_type_2 = RandomForestClassifier(n_estimators = 10, random_state = 20)
model_type_2.fit(Y, target_train)
model = model_type_2  # fit() hands back the estimator itself

output = model.predict(Z)
print(confusion_matrix(target_test, output))
accuracy_score(y_true = target_test, y_pred = output)

In [None]:
# Gaussian Naive Bayes classifier.
# The sparse TF-IDF matrices are converted to dense arrays before being passed in.
# NOTE(review): densifying can be memory-hungry on a large training set; confirm it fits.
model_type_3 = GaussianNB()
model_type_3.fit(Y.toarray(), target_train)
model = model_type_3  # fit() hands back the estimator itself

output = model.predict(Z.toarray())
print(confusion_matrix(target_test, output))
accuracy_score(y_true = target_test, y_pred = output)

In [None]:
# Passive Aggressive classifier trained on the TF-IDF features.
# A fixed random_state (20, the seed the tree-based models in this notebook use)
# makes the run reproducible; without it the scores can change between runs.
model_type_4= PassiveAggressiveClassifier(random_state = 20)
model = model_type_4.fit(Y,target_train)
output = model.predict(Z)
print(confusion_matrix(target_test,output))
accuracy_score(target_test,output)

In [None]:
# XGBoost classifier: 50 boosted trees, each limited to depth 7.
xgb_settings = {'max_depth': 7, 'n_estimators': 50}
model_type_5 = XGBClassifier(**xgb_settings)
model = model_type_5.fit(Y, target_train)

output = model.predict(Z)
print(confusion_matrix(target_test, output))
accuracy_score(y_true = target_test, y_pred = output)

In [None]:
# Attach the true class and the current `output` predictions to a copy of the test inputs.
New_output = X_test.copy(deep = True).assign(
    Actual_Class = target_test,
    Predicted_Class = output,
)

# Renumber the rows starting from 1.
New_output.index = np.arange(1, len(New_output) + 1)

print(New_output.shape)

print(New_output.tail())



In [None]:
# Rows where the predicted class disagrees with the actual class.
is_misclassified = New_output['Actual_Class'] != New_output['Predicted_Class']
data_2 = New_output.loc[is_misclassified]
print("\nShape of data whose Actual & predicted class differs = " + str(data_2.shape))

# Keep at most the first five misclassified rows of every actual class.
data_2 = data_2.groupby('Actual_Class').head(5)
print("\n\nTaking 5 samples of each wrongly predicted complaint" + str(data_2.shape))

# Add 'Complaint' to the column list below to inspect the text as well.
print(data_2[['Actual_Class', 'Predicted_Class']])