In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import math
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sns

In [None]:
data = pd.read_csv('Consumer_Complaints_train.csv')

In [None]:
data.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2014-05-15,Credit card,,Billing statement,,,,Wells Fargo & Company,MI,48342,Older American,,Web,2014-05-16,Closed with explanation,Yes,No,856103
1,2014-09-18,Bank account or service,(CD) Certificate of deposit,"Making/receiving payments, sending money",,,,Santander Bank US,PA,18042,,,Referral,2014-09-24,Closed,Yes,No,1034666
2,2014-03-13,Credit reporting,,Incorrect information on credit report,Account status,,,Equifax,CA,92427,,,Referral,2014-04-03,Closed with non-monetary relief,Yes,No,756363
3,2015-07-17,Credit card,,Billing statement,,"My credit card statement from US Bank, XXXX. X...",Company chooses not to provide a public response,U.S. Bancorp,GA,305XX,Older American,Consent provided,Web,2015-07-17,Closed with monetary relief,Yes,No,1474177
4,2014-11-20,Credit card,,Transaction issue,,,,Bank of America,MA,02127,,,Web,2014-11-28,Closed with explanation,Yes,No,1132572


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478421 entries, 0 to 478420
Data columns (total 18 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   Date received                 478421 non-null  object
 1   Product                       478421 non-null  object
 2   Sub-product                   339948 non-null  object
 3   Issue                         478421 non-null  object
 4   Sub-issue                     185796 non-null  object
 5   Consumer complaint narrative  75094 non-null   object
 6   Company public response       90392 non-null   object
 7   Company                       478421 non-null  object
 8   State                         474582 non-null  object
 9   ZIP code                      474573 non-null  object
 10  Tags                          67206 non-null   object
 11  Consumer consent provided?    135487 non-null  object
 12  Submitted via                 478421 non-null  object
 13 

In [None]:
for col in ['Sub-product','Sub-issue','Consumer complaint narrative',
            'Company public response','Tags','Consumer consent provided?']:
    varname=col.replace('-','_').replace('?','').replace(" ",'_')+'_isNan'
    data[varname]=np.where(pd.isnull(data[col]),1,0)
    data.drop([col],1,inplace=True)

In [None]:
for col in ['Product','Submitted via','Company response to consumer','Timely response?']:
    
    temp=pd.get_dummies(data[col],prefix=col,drop_first=True)
    data=pd.concat([temp,data],1)
    data.drop([col],1,inplace=True)

In [None]:
for col in ['Date received','Date sent to company']:
    data[col]=pd.to_datetime(data[col],infer_datetime_format=True)
data['day_diff']=pd.to_numeric(data['Date sent to company']-data['Date received'])
for col in ['Date received','Date sent to company']:
    data.drop([col],1,inplace=True)

In [None]:
for col in ['ZIP code','Company']:
    data.drop([col],1,inplace=True)

In [None]:
k=data['State'].value_counts()
for val in k.axes[0][0:15]:
    varname='State_'+val.replace(',','_').replace(' ','_')
    data[varname]=np.where(data['State']==val,1,0)
del data['State']

In [None]:
k=data['Issue'].value_counts()
for val in k.axes[0][0:10]:
    varname='Issue_'+val.replace(',','_').replace(' ','_')
    data[varname]=np.where(data['Issue']==val,1,0)
del data['Issue']

In [None]:
data['Consumer disputed?']=np.where(data['Consumer disputed?']=="Yes",1,0)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478421 entries, 0 to 478420
Data columns (total 57 columns):
 #   Column                                                        Non-Null Count   Dtype
---  ------                                                        --------------   -----
 0   Timely response?_Yes                                          478421 non-null  uint8
 1   Company response to consumer_Closed with explanation          478421 non-null  uint8
 2   Company response to consumer_Closed with monetary relief      478421 non-null  uint8
 3   Company response to consumer_Closed with non-monetary relief  478421 non-null  uint8
 4   Company response to consumer_Closed with relief               478421 non-null  uint8
 5   Company response to consumer_Closed without relief            478421 non-null  uint8
 6   Company response to consumer_Untimely response                478421 non-null  uint8
 7   Submitted via_Fax                                             478421 non-n

In [None]:
cc_train, cc_test = train_test_split(data, test_size = 0.3,random_state=101)
x_train = cc_train.drop(['Consumer disputed?','Complaint ID'],1)
y_train = cc_train['Consumer disputed?']

x_test = cc_test.drop(['Consumer disputed?','Complaint ID'],1)
y_test= cc_test['Consumer disputed?']

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

In [None]:
param_dist = {"n_estimators":[100,200,300,500,700,],
              "max_features": [5,10,20,25,30,35],
              "bootstrap": [True, False],
              'class_weight':[None,'balanced'], 
                'criterion':['entropy','gini'],
                'max_depth':[None,5,10,15,20,30,50],
                'min_samples_leaf':[1,2,5,10,15,20], 
                'min_samples_split':[2,5,10,15,20]
                  }

In [None]:
rfc = RandomForestClassifier()

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,roc_auc_score

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter

In [None]:
sm = SMOTE(random_state=42)

In [None]:
X = data.drop(['Consumer disputed?','Complaint ID'],1)
y = data['Consumer disputed?']

In [None]:
X, y = sm.fit_resample(X, y)

In [None]:
counter = Counter(y)
print(counter)

Counter({0: 376990, 1: 376990})


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [25]:
n_iter_search = 10

random_search = RandomizedSearchCV(rfc, param_distributions=param_dist,
                                   n_iter=n_iter_search,scoring='roc_auc',cv=10, verbose=1)
random_search.fit(X_train, y_train)

In [None]:
random_search.best_estimator_

In [None]:
rf_new = 

In [None]:
rfc.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
predictions = rfc.predict(X_test)

In [None]:
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
roc_auc_score(y_test,predictions)

[[68093 56438]
 [29900 94383]]
              precision    recall  f1-score   support

           0       0.69      0.55      0.61    124531
           1       0.63      0.76      0.69    124283

    accuracy                           0.65    248814
   macro avg       0.66      0.65      0.65    248814
weighted avg       0.66      0.65      0.65    248814



0.6531078051579974