In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Reading the CSV file 
train = pd.read_csv(r'D:\Data Science\test & train\project 1_consumer services\Consumer_Complaints_train.csv')
test = pd.read_csv(r'D:\Data Science\test & train\project 1_consumer services\Consumer_Complaints_test_share.csv')

In [3]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [4]:
# Checking the data type
print(train.info(),test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478421 entries, 0 to 478420
Data columns (total 18 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   Date received                 478421 non-null  object
 1   Product                       478421 non-null  object
 2   Sub-product                   339948 non-null  object
 3   Issue                         478421 non-null  object
 4   Sub-issue                     185796 non-null  object
 5   Consumer complaint narrative  75094 non-null   object
 6   Company public response       90392 non-null   object
 7   Company                       478421 non-null  object
 8   State                         474582 non-null  object
 9   ZIP code                      474573 non-null  object
 10  Tags                          67206 non-null   object
 11  Consumer consent provided?    135487 non-null  object
 12  Submitted via                 478421 non-null  object
 13 

In [5]:
# Adding new columns in both dataframe
train['Data'] = 'train'
test['Data'] = 'test'

In [6]:
# moving No_Show column to end
column_to_move = train.pop("Consumer disputed?")

# insert column with insert(location, column_name, column_value)
train.insert(18, "Consumer disputed?", column_to_move)

In [7]:
train.shape, test.shape

((478421, 19), (119606, 18))

In [8]:
train.columns, test.columns

(Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
        'Consumer complaint narrative', 'Company public response', 'Company',
        'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
        'Submitted via', 'Date sent to company', 'Company response to consumer',
        'Timely response?', 'Complaint ID', 'Data', 'Consumer disputed?'],
       dtype='object'),
 Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
        'Consumer complaint narrative', 'Company public response', 'Company',
        'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
        'Submitted via', 'Date sent to company', 'Company response to consumer',
        'Timely response?', 'Complaint ID', 'Data'],
       dtype='object'))

In [9]:
# Concat train and test data
df = pd.concat([train,test], axis=0)

In [10]:
df.shape

(598027, 19)

In [11]:
df.columns

Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Date sent to company', 'Company response to consumer',
       'Timely response?', 'Complaint ID', 'Data', 'Consumer disputed?'],
      dtype='object')

In [12]:
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Complaint ID,Data,Consumer disputed?
0,2014-05-15,Credit card,,Billing statement,,,,Wells Fargo & Company,MI,48342,Older American,,Web,2014-05-16,Closed with explanation,Yes,856103,train,No
1,2014-09-18,Bank account or service,(CD) Certificate of deposit,"Making/receiving payments, sending money",,,,Santander Bank US,PA,18042,,,Referral,2014-09-24,Closed,Yes,1034666,train,No
2,2014-03-13,Credit reporting,,Incorrect information on credit report,Account status,,,Equifax,CA,92427,,,Referral,2014-04-03,Closed with non-monetary relief,Yes,756363,train,No
3,2015-07-17,Credit card,,Billing statement,,"My credit card statement from US Bank, XXXX. X...",Company chooses not to provide a public response,U.S. Bancorp,GA,305XX,Older American,Consent provided,Web,2015-07-17,Closed with monetary relief,Yes,1474177,train,No
4,2014-11-20,Credit card,,Transaction issue,,,,Bank of America,MA,02127,,,Web,2014-11-28,Closed with explanation,Yes,1132572,train,No


In [13]:
# converting object type to datetime
for col in ['Date received', 'Date sent to company']:
    df[col] = pd.to_datetime(df[col],infer_datetime_format=True)

In [14]:
# Adding a column which gives total diffence between days
df['day_diff']=pd.to_numeric((df['Date sent to company'] - df['Date received']).dt.days)

In [15]:
# day_diff cannot be less than 0
df[(df['day_diff'] < 0)].day_diff.value_counts()

-1    6932
Name: day_diff, dtype: int64

In [16]:
df['day_diff'].describe()

count    598027.000000
mean          4.444398
std          16.605775
min          -1.000000
25%           0.000000
50%           1.000000
75%           4.000000
max         993.000000
Name: day_diff, dtype: float64

In [17]:
df['day_diff'][df['day_diff'] < 0] = df['day_diff'].mean()

In [18]:
df['day_diff'].describe()

count    598027.000000
mean          4.507506
std          16.595307
min           0.000000
25%           0.000000
50%           1.000000
75%           4.000000
max         993.000000
Name: day_diff, dtype: float64

In [19]:
# Drop unwanted columns
for col in ['Date received','Date sent to company']:
    df.drop([col],1,inplace=True)

In [20]:
df.isnull().sum()

Product                              0
Sub-product                     173225
Issue                                0
Sub-issue                       365685
Consumer complaint narrative    504376
Company public response         484859
Company                              0
State                             4764
ZIP code                          4774
Tags                            513950
Consumer consent provided?      428676
Submitted via                        1
Company response to consumer         0
Timely response?                     0
Complaint ID                         0
Data                                 0
Consumer disputed?              119606
day_diff                             0
dtype: int64

In [21]:
df.dtypes

Product                          object
Sub-product                      object
Issue                            object
Sub-issue                        object
Consumer complaint narrative     object
Company public response          object
Company                          object
State                            object
ZIP code                         object
Tags                             object
Consumer consent provided?       object
Submitted via                    object
Company response to consumer     object
Timely response?                 object
Complaint ID                      int64
Data                             object
Consumer disputed?               object
day_diff                        float64
dtype: object

In [22]:
df.columns

Index(['Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Company response to consumer', 'Timely response?',
       'Complaint ID', 'Data', 'Consumer disputed?', 'day_diff'],
      dtype='object')

In [23]:
# Here less significant columns are dropped

df = df.drop(['Company','Sub-product', 'ZIP code', 'Sub-issue','Consumer complaint narrative','Company public response'],axis=1)

In [24]:
df.columns

Index(['Product', 'Issue', 'State', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Company response to consumer', 'Timely response?',
       'Complaint ID', 'Data', 'Consumer disputed?', 'day_diff'],
      dtype='object')

In [25]:
# .fillna(X_train[i].mode()[0]) is used to impute missing values with the mode(most frequent value) in a dataframe.
# col_train represents the list of column names which have null values

col_train= ['State', 'Tags','Consumer consent provided?']

for i in col_train:
    if df[i].isnull().sum()!= 0:
        df[i]=df[i].fillna(df[i].mode()[0])

In [26]:
df.isnull().sum()

Product                              0
Issue                                0
State                                0
Tags                                 0
Consumer consent provided?           0
Submitted via                        1
Company response to consumer         0
Timely response?                     0
Complaint ID                         0
Data                                 0
Consumer disputed?              119606
day_diff                             0
dtype: int64

In [27]:
# The get_dummies() function is used to convert categorical variable into dummy/indicator variables.
# col_train represents list of column names which have categorical values.

col_train= ['Product','Issue', 'State', 'Tags','Consumer consent provided?', 
            'Submitted via', 'Company response to consumer','Timely response?']
for i in col_train:
    dummy= pd.get_dummies(df[i],prefix=i)
    dummy= dummy.iloc[:,:-1]
    df= pd.concat([df,dummy],axis=1)
    df=df.drop([i],1)

In [28]:
df.head()

Unnamed: 0,Complaint ID,Data,Consumer disputed?,day_diff,Product_Bank account or service,Product_Consumer Loan,Product_Credit card,Product_Credit reporting,Product_Debt collection,Product_Money transfers,Product_Mortgage,Product_Other financial service,Product_Payday loan,Product_Prepaid card,Product_Student loan,Issue_APR or interest rate,"Issue_Account opening, closing, or management",Issue_Account terms and changes,Issue_Adding money,Issue_Advertising and marketing,"Issue_Advertising, marketing or disclosures",Issue_Application processing delay,"Issue_Application, originator, mortgage broker",Issue_Applied for loan/did not receive money,Issue_Arbitration,Issue_Balance transfer,Issue_Balance transfer fee,Issue_Bankruptcy,Issue_Billing disputes,Issue_Billing statement,Issue_Can't contact lender,Issue_Can't repay my loan,Issue_Can't stop charges to bank account,Issue_Cash advance,Issue_Cash advance fee,Issue_Charged bank acct wrong day or amt,Issue_Charged fees or interest I didn't expect,Issue_Closing/Cancelling account,Issue_Collection debt dispute,Issue_Collection practices,Issue_Communication tactics,Issue_Cont'd attempts collect debt not owed,Issue_Convenience checks,Issue_Credit card protection / Debt protection,Issue_Credit decision / Underwriting,Issue_Credit determination,Issue_Credit line increase/decrease,Issue_Credit monitoring or identity protection,Issue_Credit reporting,Issue_Credit reporting company's investigation,Issue_Customer service / Customer relations,Issue_Customer service/Customer relations,Issue_Dealing with my lender or servicer,Issue_Delinquent account,Issue_Deposits and withdrawals,Issue_Disclosure verification of debt,Issue_Disclosures,Issue_Excessive fees,Issue_False statements or representation,Issue_Fees,Issue_Forbearance / Workout plans,Issue_Fraud or scam,Issue_Getting a loan,Issue_Identity theft / Fraud / Embezzlement,Issue_Improper contact or sharing of info,Issue_Improper use of my credit report,Issue_Incorrect exchange rate,Issue_Incorrect information on credit report,Issue_Incorrect/missing disclosures or info,Issue_Late fee,Issue_Lender damaged or destroyed property,Issue_Lender damaged or destroyed vehicle,Issue_Lender repossessed or sold the vehicle,Issue_Lender sold the property,"Issue_Loan modification,collection,foreclosure","Issue_Loan servicing, payments, escrow account",Issue_Lost or stolen check,Issue_Lost or stolen money order,"Issue_Making/receiving payments, sending money",Issue_Managing the line of credit,Issue_Managing the loan or lease,"Issue_Managing, opening, or closing account",Issue_Money was not available when promised,Issue_Other,Issue_Other fee,Issue_Other service issues,Issue_Other transaction issues,"Issue_Overdraft, savings or rewards features",Issue_Overlimit fee,Issue_Payment to acct not credited,Issue_Payoff process,Issue_Privacy,Issue_Problems caused by my funds being low,Issue_Problems when you are unable to pay,Issue_Received a loan I didn't apply for,Issue_Repaying your loan,Issue_Rewards,Issue_Sale of account,Issue_Settlement process and costs,Issue_Shopping for a line of credit,Issue_Shopping for a loan or lease,Issue_Taking out the loan or lease,Issue_Taking/threatening an illegal action,Issue_Transaction issue,Issue_Unable to get credit report/credit score,Issue_Unauthorized transactions/trans. issues,Issue_Unexpected/Other fees,Issue_Unsolicited issuance of credit card,Issue_Using a debit or ATM card,State_AA,State_AE,State_AK,State_AL,State_AP,State_AR,State_AS,State_AZ,State_CA,State_CO,State_CT,State_DC,State_DE,State_FL,State_FM,State_GA,State_GU,State_HI,State_IA,State_ID,State_IL,State_IN,State_KS,State_KY,State_LA,State_MA,State_MD,State_ME,State_MH,State_MI,State_MN,State_MO,State_MP,State_MS,State_MT,State_NC,State_ND,State_NE,State_NH,State_NJ,State_NM,State_NV,State_NY,State_OH,State_OK,State_OR,State_PA,State_PR,State_PW,State_RI,State_SC,State_SD,State_TN,State_TX,State_UT,State_VA,State_VI,State_VT,State_WA,State_WI,State_WV,Tags_Older American,"Tags_Older American, Servicemember",Consumer consent provided?_Consent not provided,Consumer consent provided?_Consent provided,Consumer consent provided?_Consent withdrawn,Submitted via_Email,Submitted via_Fax,Submitted via_Phone,Submitted via_Postal mail,Submitted via_Referral,Company response to consumer_Closed,Company response to consumer_Closed with explanation,Company response to consumer_Closed with monetary relief,Company response to consumer_Closed with non-monetary relief,Company response to consumer_Closed with relief,Company response to consumer_Closed without relief,Timely response?_No
0,856103,train,No,1.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
1,1034666,train,No,6.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0
2,756363,train,No,21.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0
3,1474177,train,No,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
4,1132572,train,No,8.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0


In [29]:
df['Consumer disputed?'] = np.where(df['Consumer disputed?'] == "Yes", 1, 0)

In [30]:
# split the data into train and test as previous
train_df =df[df['Data'] =='train']
test_df = df[df['Data'] =='test']

In [31]:
train_df.columns, test_df.columns

(Index(['Complaint ID', 'Data', 'Consumer disputed?', 'day_diff',
        'Product_Bank account or service', 'Product_Consumer Loan',
        'Product_Credit card', 'Product_Credit reporting',
        'Product_Debt collection', 'Product_Money transfers',
        ...
        'Submitted via_Phone', 'Submitted via_Postal mail',
        'Submitted via_Referral', 'Company response to consumer_Closed',
        'Company response to consumer_Closed with explanation',
        'Company response to consumer_Closed with monetary relief',
        'Company response to consumer_Closed with non-monetary relief',
        'Company response to consumer_Closed with relief',
        'Company response to consumer_Closed without relief',
        'Timely response?_No'],
       dtype='object', length=187),
 Index(['Complaint ID', 'Data', 'Consumer disputed?', 'day_diff',
        'Product_Bank account or service', 'Product_Consumer Loan',
        'Product_Credit card', 'Product_Credit reporting',
        'Produ

In [32]:
# Drop unwanted columns
#train_df.drop('Data', axis=1,inplace=True)
#test_df.drop('Data','Consumer disputed?',axis=1,inplace=True)

# Drop unwanted columns
for col in ['Data','Complaint ID']:
    train_df.drop([col],1,inplace=True)

In [33]:
# Drop unwanted columns
for col in ['Data','Consumer disputed?']:
    test_df.drop([col],1,inplace=True)

In [34]:
# check if header's are consistent in both train and test
set(train_df.columns) - set(test_df.columns)

{'Consumer disputed?'}

In [35]:
test_df.dtypes

Complaint ID                                                      int64
day_diff                                                        float64
Product_Bank account or service                                   uint8
Product_Consumer Loan                                             uint8
Product_Credit card                                               uint8
Product_Credit reporting                                          uint8
Product_Debt collection                                           uint8
Product_Money transfers                                           uint8
Product_Mortgage                                                  uint8
Product_Other financial service                                   uint8
Product_Payday loan                                               uint8
Product_Prepaid card                                              uint8
Product_Student loan                                              uint8
Issue_APR or interest rate                                      

In [36]:
# Split the train dataset into features and target variable
X = train_df.drop('Consumer disputed?', axis=1)
y = train_df['Consumer disputed?']

In [37]:
X.head()

Unnamed: 0,day_diff,Product_Bank account or service,Product_Consumer Loan,Product_Credit card,Product_Credit reporting,Product_Debt collection,Product_Money transfers,Product_Mortgage,Product_Other financial service,Product_Payday loan,Product_Prepaid card,Product_Student loan,Issue_APR or interest rate,"Issue_Account opening, closing, or management",Issue_Account terms and changes,Issue_Adding money,Issue_Advertising and marketing,"Issue_Advertising, marketing or disclosures",Issue_Application processing delay,"Issue_Application, originator, mortgage broker",Issue_Applied for loan/did not receive money,Issue_Arbitration,Issue_Balance transfer,Issue_Balance transfer fee,Issue_Bankruptcy,Issue_Billing disputes,Issue_Billing statement,Issue_Can't contact lender,Issue_Can't repay my loan,Issue_Can't stop charges to bank account,Issue_Cash advance,Issue_Cash advance fee,Issue_Charged bank acct wrong day or amt,Issue_Charged fees or interest I didn't expect,Issue_Closing/Cancelling account,Issue_Collection debt dispute,Issue_Collection practices,Issue_Communication tactics,Issue_Cont'd attempts collect debt not owed,Issue_Convenience checks,Issue_Credit card protection / Debt protection,Issue_Credit decision / Underwriting,Issue_Credit determination,Issue_Credit line increase/decrease,Issue_Credit monitoring or identity protection,Issue_Credit reporting,Issue_Credit reporting company's investigation,Issue_Customer service / Customer relations,Issue_Customer service/Customer relations,Issue_Dealing with my lender or servicer,Issue_Delinquent account,Issue_Deposits and withdrawals,Issue_Disclosure verification of debt,Issue_Disclosures,Issue_Excessive fees,Issue_False statements or representation,Issue_Fees,Issue_Forbearance / Workout plans,Issue_Fraud or scam,Issue_Getting a loan,Issue_Identity theft / Fraud / Embezzlement,Issue_Improper contact or sharing of info,Issue_Improper use of my credit report,Issue_Incorrect exchange rate,Issue_Incorrect information on credit report,Issue_Incorrect/missing disclosures or info,Issue_Late fee,Issue_Lender damaged or destroyed property,Issue_Lender damaged or destroyed vehicle,Issue_Lender repossessed or sold the vehicle,Issue_Lender sold the property,"Issue_Loan modification,collection,foreclosure","Issue_Loan servicing, payments, escrow account",Issue_Lost or stolen check,Issue_Lost or stolen money order,"Issue_Making/receiving payments, sending money",Issue_Managing the line of credit,Issue_Managing the loan or lease,"Issue_Managing, opening, or closing account",Issue_Money was not available when promised,Issue_Other,Issue_Other fee,Issue_Other service issues,Issue_Other transaction issues,"Issue_Overdraft, savings or rewards features",Issue_Overlimit fee,Issue_Payment to acct not credited,Issue_Payoff process,Issue_Privacy,Issue_Problems caused by my funds being low,Issue_Problems when you are unable to pay,Issue_Received a loan I didn't apply for,Issue_Repaying your loan,Issue_Rewards,Issue_Sale of account,Issue_Settlement process and costs,Issue_Shopping for a line of credit,Issue_Shopping for a loan or lease,Issue_Taking out the loan or lease,Issue_Taking/threatening an illegal action,Issue_Transaction issue,Issue_Unable to get credit report/credit score,Issue_Unauthorized transactions/trans. issues,Issue_Unexpected/Other fees,Issue_Unsolicited issuance of credit card,Issue_Using a debit or ATM card,State_AA,State_AE,State_AK,State_AL,State_AP,State_AR,State_AS,State_AZ,State_CA,State_CO,State_CT,State_DC,State_DE,State_FL,State_FM,State_GA,State_GU,State_HI,State_IA,State_ID,State_IL,State_IN,State_KS,State_KY,State_LA,State_MA,State_MD,State_ME,State_MH,State_MI,State_MN,State_MO,State_MP,State_MS,State_MT,State_NC,State_ND,State_NE,State_NH,State_NJ,State_NM,State_NV,State_NY,State_OH,State_OK,State_OR,State_PA,State_PR,State_PW,State_RI,State_SC,State_SD,State_TN,State_TX,State_UT,State_VA,State_VI,State_VT,State_WA,State_WI,State_WV,Tags_Older American,"Tags_Older American, Servicemember",Consumer consent provided?_Consent not provided,Consumer consent provided?_Consent provided,Consumer consent provided?_Consent withdrawn,Submitted via_Email,Submitted via_Fax,Submitted via_Phone,Submitted via_Postal mail,Submitted via_Referral,Company response to consumer_Closed,Company response to consumer_Closed with explanation,Company response to consumer_Closed with monetary relief,Company response to consumer_Closed with non-monetary relief,Company response to consumer_Closed with relief,Company response to consumer_Closed without relief,Timely response?_No
0,1.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
1,6.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0
2,21.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0
3,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
4,8.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0


In [38]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Consumer disputed?, dtype: int32

In [39]:
# Standardising the data
scaler = StandardScaler()
train = scaler.fit_transform(X)
# here we just need to standardise the train data, no need to do this on target

In [40]:
# checking event rate
y.value_counts()

0    376990
1    101431
Name: Consumer disputed?, dtype: int64

In [41]:
# Balancing the data
ros = RandomOverSampler(random_state=18)
X_resample,y_resample = ros.fit_resample(X,y)
from collections import Counter
print(sorted(Counter(y_resample).items()),y_resample.shape)

[(0, 376990), (1, 376990)] (753980,)


In [50]:
# Split the train dataset into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(X_resample, y_resample, test_size=0.1, random_state=18)

In [51]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(678582, 184) (75398, 184) (678582,) (75398,)


### Logistic Regression

In [52]:
# Train the model
log_reg = LogisticRegression()

In [53]:
# fitting the model
log_reg_fit = log_reg.fit(X_train,y_train)

In [54]:
# Predict the target variable for the validation set
y_pred_log = log_reg_fit.predict(X_test)

In [55]:
# Calculate the roc_auc score for the validation set
score = roc_auc_score(y_test, y_pred_log)

print("roc_auc score: ", score)

roc_auc score:  0.580152072433223


### Decision Tree

In [57]:
# Train the model
clf_entropy = DecisionTreeClassifier(criterion = "entropy", random_state = 18,
 max_depth=5, min_samples_leaf=5)

In [58]:
# fitting the model
clf_entropy.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=5,
                       random_state=18)

In [59]:
# Predict the target variable for the validation set
y_pred_en = clf_entropy.predict(X_test)

In [60]:
# Calculate the roc_auc score for the validation set
score_en = roc_auc_score(y_test, y_pred_en)

print("roc_auc score: ", score_en*100)

roc_auc score:  57.503480118132764


### XGBoost

In [61]:
# Train the model
xg_boost = XGBClassifier()

In [62]:
# fitting the model
xg_boost.fit(X_train,y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [63]:
# Predict the target variable for the validation set
y_pred_xgb = xg_boost.predict(X_test)

In [64]:
# Calculate the roc_auc score for the validation set
score = roc_auc_score(y_test, y_pred_xgb)

print("roc_auc score: ", score*100)

roc_auc score:  59.54947263908801


### Random Forest


In [65]:
# Train the model
rd = RandomForestClassifier()

In [69]:
# fitting the model
rdf = rd.fit(X_train,y_train)

In [70]:
# Predict the target variable for the validation set
yprd = rdf.predict(X_test)

In [71]:
# Predict the target variable for the test set
test_pred_rd = rd.predict(test_df.drop('Complaint ID',axis=1))

In [72]:
# Calculate the roc_auc score for the validation set
score = roc_auc_score(y_test, yprd)
print("roc_auc score: ", score*100)

roc_auc score:  68.98035319438168


In [None]:
submission_df = pd.DataFrame({'Complaint ID': test['Complaint ID'], 'Consumer disputed?': test_pred_rd})
submission_df.to_csv('Consumer_Complaints_Resolution_rdf_08.csv', index=False)