In [1]:
import numpy as np
import pandas as pd

In [2]:
# Loaded the train and test data
file=r'/Users/subhadeep/Downloads/P1 Data/Consumer_Complaints_train.csv'
file1=r'/Users/subhadeep/Downloads/P1 Data/Consumer_Complaints_test_share.csv'
bd_train=pd.read_csv(file)
bd_test=pd.read_csv(file1)

## looking at the shape of the data
print(bd_train.shape)
print(bd_test.shape)
(478421, 18)
(119606, 17)

In [3]:
bd_train.dtypes

Date received                   object
Product                         object
Sub-product                     object
Issue                           object
Sub-issue                       object
Consumer complaint narrative    object
Company public response         object
Company                         object
State                           object
ZIP code                        object
Tags                            object
Consumer consent provided?      object
Submitted via                   object
Date sent to company            object
Company response to consumer    object
Timely response?                object
Consumer disputed?              object
Complaint ID                     int64
dtype: object

In [4]:
# Checking Missing values in the training file.
bd_train.isnull().sum()

Date received                        0
Product                              0
Sub-product                     138473
Issue                                0
Sub-issue                       292625
Consumer complaint narrative    403327
Company public response         388029
Company                              0
State                             3839
ZIP code                          3848
Tags                            411215
Consumer consent provided?      342934
Submitted via                        0
Date sent to company                 0
Company response to consumer         0
Timely response?                     0
Consumer disputed?                   0
Complaint ID                         0
dtype: int64

In [5]:
# Looking at the ISSUE colum and found the most used lines in the documents.We will be doing text analysis in this column
bd_train['Issue'].value_counts()

Loan modification,collection,foreclosure    80302
Incorrect information on credit report      58527
Loan servicing, payments, escrow account    51403
Cont'd attempts collect debt not owed       36367
Account opening, closing, or management     23568
Disclosure verification of debt             16235
Communication tactics                       15312
Deposits and withdrawals                    14721
Application, originator, mortgage broker    11201
Billing disputes                             9600
Credit reporting company's investigation     9492
Other                                        9442
Managing the loan or lease                   8905
Problems caused by my funds being low        7758
False statements or representation           7074
Unable to get credit report/credit score     7060
Dealing with my lender or servicer           6460
Improper contact or sharing of info          6182
Problems when you are unable to pay          5921
Settlement process and costs                 5834


In [6]:
from nltk.corpus import stopwords
import re
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# Creating a function name cleandata where text will be be splitted and will remove stopwords. poerter stemmer will stem
# all the same kid of words to their originam words.
stops = set(stopwords.words("english"))
def cleanData(text, lowercase = False, remove_stops = False, stemming = False):
    txt = str(text)
    txt = re.sub(r'[^A-Za-z0-9\s]',r'',txt)
    txt = re.sub(r'\n',r' ',txt)
   
    if lowercase:
        txt = " ".join([w.lower() for w in txt.split()])
       
    if remove_stops:
        txt = " ".join([w for w in txt.split() if w not in stops])
   
    if stemming:
        st = PorterStemmer()
        txt = " ".join([st.stem(w) for w in txt.split()])

    return txt

In [8]:
bd_train['Issue']= bd_train['Issue'].map(lambda x: cleanData(x, lowercase=True, remove_stops=True, 
                                                                        stemming=True))
bd_test['Issue']= bd_test['Issue'].map(lambda x: cleanData(x, lowercase=True, remove_stops=True, 
                                                                        stemming=True))

In [9]:
tfidfvec = TfidfVectorizer(analyzer='word', ngram_range = (1,1), min_df = 150, max_features=None)

In [10]:
tfidfdata = tfidfvec.fit_transform(bd_train['Issue'])

In [11]:
df1=pd.DataFrame(tfidfdata.toarray(),columns=tfidfvec.get_feature_names())

In [12]:
# After dropping issue column , we are joining 2 dataframes
bd_train.drop('Issue',axis=1, inplace=True)
bd_train_new=pd.concat([bd_train,df1],axis=1)

In [13]:
tfidfdata_test=tfidfvec.transform(bd_test['Issue'])
df2=pd.DataFrame(tfidfdata_test.toarray(),columns=tfidfvec.get_feature_names())

In [14]:
bd_test.drop('Issue',axis=1, inplace=True)
bd_test_new=pd.concat([bd_test,df2],axis=1)

In [15]:
# the new dataframe has 147 variables
bd_train_new

Unnamed: 0,Date received,Product,Sub-product,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,...,transfer,unabl,unauthor,underwrit,unsolicit,use,verif,withdraw,workout,wrong
0,2014-05-15,Credit card,,,,,Wells Fargo & Company,MI,48342,Older American,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
1,2014-09-18,Bank account or service,(CD) Certificate of deposit,,,,Santander Bank US,PA,18042,,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
2,2014-03-13,Credit reporting,,Account status,,,Equifax,CA,92427,,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
3,2015-07-17,Credit card,,,"My credit card statement from US Bank, XXXX. X...",Company chooses not to provide a public response,U.S. Bancorp,GA,305XX,Older American,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
4,2014-11-20,Credit card,,,,,Bank of America,MA,02127,,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
5,2014-06-26,Mortgage,Other mortgage,,,,Bank of America,WI,54313,,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
6,2012-09-28,Mortgage,Other mortgage,,,,Ocwen,FL,33168,,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
7,2015-05-06,Mortgage,FHA mortgage,,I have a mortgage with Quicken Loans. From XXX...,,Quicken Loans,GA,300XX,,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
8,2013-02-25,Credit card,,,,,Discover,OH,45640,,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
9,2016-03-30,Bank account or service,Checking account,,I put my settlement check in to bank of americ...,Company has responded to the consumer and the ...,Bank of America,TX,787XX,,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.707107,0.0,0.0


In [16]:
bd_test_new

Unnamed: 0,Date received,Product,Sub-product,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,...,transfer,unabl,unauthor,underwrit,unsolicit,use,verif,withdraw,workout,wrong
0,2014-01-18,Bank account or service,Cashing a check without an account,,,,Bank of America,CA,95691,,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.707107,0.0,0.0
1,2016-03-31,Debt collection,Credit card,Debt was paid,,,"National Credit Adjusters, LLC",FL,32086,,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
2,2012-03-08,Mortgage,Conventional adjustable mortgage (ARM),,,,Wells Fargo & Company,CA,94618,,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
3,2016-01-07,Credit reporting,,Problem getting report or credit score,,Company chooses not to provide a public response,"TransUnion Intermediate Holdings, Inc.",FL,33584,Older American,...,0.0,0.440951,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
4,2013-08-23,Mortgage,FHA mortgage,,,,Bank of America,FL,33543,,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
5,2013-12-12,Credit reporting,,Account status,,,Equifax,DC,20018,,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
6,2015-03-16,Debt collection,"Other (i.e. phone, health club, etc.)",Debt resulted from identity theft,,,Real Time Resolutions,TX,75249,,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
7,2016-04-29,Credit reporting,,Problem with fraud alerts,,Company has responded to the consumer and the ...,Experian,CA,91791,,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
8,2014-12-17,Debt collection,Non-federal student loan,Frequent or repeated calls,,,"MRS BPO, L.L.C.",WA,99403,,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
9,2013-03-19,Credit card,,,,,Citibank,IL,60640,,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0


In [17]:
# we are changing both the below colums to a datetime format
for col in ['Date received','Date sent to company']:
    bd_train_new[col]=pd.to_datetime(bd_train_new[col],infer_datetime_format=True)
    bd_test_new[col]=pd.to_datetime(bd_test_new[col],infer_datetime_format=True)

In [18]:
# creating a new column day_diff, which is difference between the 2 columns
bd_train_new['day_diff']=pd.to_numeric(bd_train_new['Date sent to company']-bd_train_new['Date received'])
bd_test_new['day_diff']=pd.to_numeric(bd_test_new['Date sent to company']-bd_test_new['Date received'])

In [19]:
#Keeping day_diff  column and dropping both the columns
for col in ['Date received','Date sent to company']:
    bd_train_new.drop([col],1,inplace=True)
    bd_test_new.drop([col],1,inplace=True)

In [20]:
bd_train_new['Consumer disputed?']=np.where(bd_train_new['Consumer disputed?']=="Yes",1,0)

In [21]:
for col in bd_train_new.select_dtypes(['object']).columns:
    print(col,':',bd_train_new[col].nunique())

Product : 12
Sub-product : 47
Sub-issue : 68
Consumer complaint narrative : 74019
Company public response : 10
Company : 3276
State : 62
ZIP code : 25962
Tags : 3
Consumer consent provided? : 4
Submitted via : 6
Company response to consumer : 7
Timely response? : 2


In [22]:
for col in ['Sub-product','Sub-issue','Consumer complaint narrative',
            'Company public response','Tags','Consumer consent provided?']:
    varname=col.replace('-','_').replace('?','').replace(" ",'_')+'_isNan'
    bd_train_new[varname]=np.where(pd.isnull(bd_train_new[col]),1,0)
    bd_train_new.drop([col],1,inplace=True)
    bd_test_new[varname]=np.where(pd.isnull(bd_test_new[col]),1,0)
    bd_test_new.drop([col],1,inplace=True)

In [23]:
# dropping zip code and company columns
for col in ['ZIP code','Company']:
    bd_train_new.drop([col],1,inplace=True)
    bd_test_new.drop([col],1,inplace=True)

In [24]:
for col in bd_train_new.select_dtypes(['object']).columns:
    print(col,':',bd_train_new[col].nunique())

Product : 12
State : 62
Submitted via : 6
Company response to consumer : 7
Timely response? : 2


In [25]:
# creating dummy variables
or col in ['Product','Submitted via','Company response to consumer','Timely response?']:
    
    dum=pd.get_dummies(bd_train_new[col],prefix=col,drop_first=True)
    bd_train_new=pd.concat([dum,bd_train_new],1)
    bd_train_new.drop([col],1,inplace=True)
    
    dum1=pd.get_dummies(bd_test_new[col],prefix=col,drop_first=True)
    bd_test_new=pd.concat([dum1,bd_test_new],1)
    bd_test_new.drop([col],1,inplace=True)

In [26]:
for col in ['State']:
    freqs=bd_train_new[col].value_counts()
    k=freqs.index[freqs>=10000][:-1]
    for cat in k:
        name=col+'_'+cat
        bd_train_new[name]=(bd_train_new[col]==cat).astype(int)
        bd_test_new[name]=(bd_test_new[col]==cat).astype(int)
    del bd_train_new[col]
    del bd_test_new[col]

In [27]:
bd_train_new.dtypes

Timely response?_Yes                                              uint8
Company response to consumer_Closed with explanation              uint8
Company response to consumer_Closed with monetary relief          uint8
Company response to consumer_Closed with non-monetary relief      uint8
Company response to consumer_Closed with relief                   uint8
Company response to consumer_Closed without relief                uint8
Company response to consumer_Untimely response                    uint8
Submitted via_Fax                                                 uint8
Submitted via_Phone                                               uint8
Submitted via_Postal mail                                         uint8
Submitted via_Referral                                            uint8
Submitted via_Web                                                 uint8
Product_Consumer Loan                                             uint8
Product_Credit card                                             

In [28]:
bd_test_new.dtypes

Timely response?_Yes                                              uint8
Company response to consumer_Closed with explanation              uint8
Company response to consumer_Closed with monetary relief          uint8
Company response to consumer_Closed with non-monetary relief      uint8
Company response to consumer_Closed with relief                   uint8
Company response to consumer_Closed without relief                uint8
Company response to consumer_Untimely response                    uint8
Submitted via_Fax                                                 uint8
Submitted via_Phone                                               uint8
Submitted via_Postal mail                                         uint8
Submitted via_Referral                                            uint8
Submitted via_Web                                                 uint8
Product_Consumer Loan                                             uint8
Product_Credit card                                             

In [29]:
from sklearn.model_selection import KFold,train_test_split

In [30]:
# splitting 80:20 fro training and testing dataset.
bd1_train,bd1_test=train_test_split(bd_train_new,test_size=0.2,random_state=2)

In [31]:
x_train=bd1_train.drop(['Consumer disputed?','Complaint ID'],axis=1)
y_train=bd1_train['Consumer disputed?']
x_test=bd1_test.drop(['Consumer disputed?','Complaint ID'],axis=1)
y_test=bd1_test['Consumer disputed?']

In [32]:
x_train.index

Int64Index([226990, 316207, 142207,   6823, 299194, 370150, 238649, 416826,
             39806,  99119,
            ...
            437511, 342376,  33867,  84434, 424235, 437782,  95816, 203245,
            100879, 351400],
           dtype='int64', length=382736)

In [33]:
x_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)

In [34]:
x_train.index

RangeIndex(start=0, stop=382736, step=1)

In [35]:
from sklearn.linear_model import LogisticRegression

In [36]:
lr=LogisticRegression(class_weight='balanced')

In [38]:
lr.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [42]:
predicted_lr=lr.predict(x_test)

In [43]:
from sklearn.metrics import roc_auc_score

In [44]:
roc_auc_score(y_test,predicted_lr)

0.5308276531968509

In [45]:
from sklearn.ensemble import RandomForestClassifier

  from numpy.core.umath_tests import inner1d


In [42]:
import numpy as np

from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint

from sklearn.grid_search import RandomizedSearchCV



In [43]:
clf = RandomForestClassifier(verbose=1,n_jobs=-1)

In [44]:
# hyperparameter tuning
param_dist = {"n_estimators":[10,100,300,500],
              "max_depth": [3,5,10,20, None],
              "max_features": [5,10,20,40],
              "min_samples_split": [2,5,10],
              "min_samples_leaf": [1,5,10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

n_iter_search = 40

In [45]:
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search,scoring='roc_auc',cv=10,n_jobs=-1)
random_search.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   35.1s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   35.2s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   35.3s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   36.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.5min finished
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.5min finished
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.5min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.5s finished
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.6min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jo

[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    0.8s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   23.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   21.9s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   52.2s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   51.0s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   58.9s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    0.9s finished
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   57.4s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 442 tasks     

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    1.1s finished
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.8min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    1.5s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  6.0min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    3.4s finished
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  6.0min finished
[Parallel(n_jobs=4)]: Do

[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    0.8s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   53.0s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   57.7s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   58.7s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   59.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.2min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    1.7s finished
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.2min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.2min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    1.5s finished
[Parallel(n_job

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.8min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.8min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.8min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   19.1s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parall

[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 16.2min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 16.5min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 16.4min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 16.7min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 37.3min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 37.9min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 37.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 37.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 43.5min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   13.8s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   41.0s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:   46.7s finished
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 44.5min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.9s
[Parallel(n_jobs=4)]: Done 192 tasks      | elap

[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    1.1s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    1.3s finished
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 15.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.8min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.4min finished
[Parallel(n_jobs=4)]: Done  42 tasks    

[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   46.6s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 41.5min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   25.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   51.4s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   28.8s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:   31.8s finished
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   51.9s finished
[Parall

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.9min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    1.6s finished
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.9min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    1.7s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.9min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    1.6s finished
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.9min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.6s
[Parallel(n_jo

[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 12.6min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    7.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    2.9s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    8.2s finished
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    6.0s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    6.7s finished
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 12.4min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    3.3s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    7.5s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    8.6s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   19.3s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done  42 task

[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  4.4min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   34.1s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   36.6s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   37.4s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   38.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  4.3min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elap

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.7min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    2.3s finished
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.7min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    2.1s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   39.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   36.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.6min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    2.2s finished
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]:

[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  7.3min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    1.3s finished
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  7.3min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    1.1s finished
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  7.2min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   57.2s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=

[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    3.5s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.6s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 17.8min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 17.8min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 17.8min
[Parallel(n_jobs=-1)]: Done 442 task

[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    2.8s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    4.3s finished
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  5.9min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    2.9s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    4.5s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   31.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   32.6s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.4min finished
[Parallel(n_jobs=4)]: Done  42 tasks   

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   23.5s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   24.3s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.5min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    1.1s finished
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.5min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    1.1s finished
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  2.7min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 300 

RandomizedSearchCV(cv=10, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False),
          fit_params={}, iid=True, n_iter=40, n_jobs=-1,
          param_distributions={'n_estimators': [10, 100, 300, 500], 'max_depth': [3, 5, 10, 20, None], 'max_features': [5, 10, 20, 40], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 5, 10], 'bootstrap': [True, False], 'criterion': ['gini', 'entropy']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring='roc_auc', verbose=0)

In [46]:
 def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

In [47]:
report(random_search.grid_scores_)

Model with rank: 1
Mean validation score: 0.624 (std: 0.004)
Parameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 20, 'max_depth': 20, 'criterion': 'gini', 'bootstrap': True}

Model with rank: 2
Mean validation score: 0.624 (std: 0.004)
Parameters: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 20, 'max_depth': 20, 'criterion': 'gini', 'bootstrap': False}

Model with rank: 3
Mean validation score: 0.622 (std: 0.004)
Parameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 10, 'max_depth': None, 'criterion': 'gini', 'bootstrap': True}



In [48]:
rf=RandomForestClassifier(n_estimators=100,verbose=1,criterion='gini',min_samples_split=10,
                         bootstrap=True,max_depth=20,max_features=20,min_samples_leaf=5,
                          class_weight="balanced")

In [49]:
rf.fit(x_train,y_train)

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  1.6min finished


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=20, max_features=20,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=5,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=1, warm_start=False)

In [50]:
predicted_rf=rf.predict(x_test)

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.9s finished


In [51]:
roc_auc_score(y_test,predicted_rf)

0.5874471825807449

In [52]:
prediction=np.where(rf.predict(bd_test_new.drop(['Complaint ID'],1))==1,"Yes","No")
submission=pd.DataFrame(list(zip(bd_test_new['Complaint ID'],list(prediction))),
                       columns=['Complaint ID','Consumer disputed?'])

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    2.3s finished


In [53]:
prediction=np.where(rf.predict(bd_test_new.drop(['Complaint ID'],1))==1,"Yes","No")
submission=pd.DataFrame(list(zip(bd_test_new['Complaint ID'],list(prediction))),
                       columns=['Complaint ID','Consumer disputed?'])

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    2.3s finished


In [54]:
submission.to_csv('sample_submission_final.csv',index=False)

In [60]:
submission.head()

Unnamed: 0,Complaint ID,Consumer disputed?
0,675956,Yes
1,1858795,No
2,32637,Yes
3,1731374,No
4,501487,Yes
