In [1]:
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd


from mypipes_consumer import *

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"


# Silence all Python warnings for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings raised by
# pandas or by the pipeline code are hidden as well.
warnings.filterwarnings('ignore')

In [2]:
# Directory holding the training CSV. It defaults to the original local
# directory, but can be redirected by setting the CONSUMER_DATA_DIR environment
# variable, so the notebook also runs on machines where that path does not exist.
DATA_DIR = Path(os.environ.get(
    'CONSUMER_DATA_DIR',
    r'/home/siddarth.jha@npci.org.in/Documents/Training/Data',
))

# Load the raw consumer-complaints training set and preview the first five rows.
df = pd.read_csv(DATA_DIR / 'Consumer_Complaints_train.csv')
df.head(5)

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2014-05-15,Credit card,,Billing statement,,,,Wells Fargo & Company,MI,48342,Older American,,Web,2014-05-16,Closed with explanation,Yes,No,856103
1,2014-09-18,Bank account or service,(CD) Certificate of deposit,"Making/receiving payments, sending money",,,,Santander Bank US,PA,18042,,,Referral,2014-09-24,Closed,Yes,No,1034666
2,2014-03-13,Credit reporting,,Incorrect information on credit report,Account status,,,Equifax,CA,92427,,,Referral,2014-04-03,Closed with non-monetary relief,Yes,No,756363
3,2015-07-17,Credit card,,Billing statement,,"My credit card statement from US Bank, XXXX. X...",Company chooses not to provide a public response,U.S. Bancorp,GA,305XX,Older American,Consent provided,Web,2015-07-17,Closed with monetary relief,Yes,No,1474177
4,2014-11-20,Credit card,,Transaction issue,,,,Bank of America,MA,02127,,,Web,2014-11-28,Closed with explanation,Yes,No,1132572


In [3]:
# Column names, non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478421 entries, 0 to 478420
Data columns (total 18 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   Date received                 478421 non-null  object
 1   Product                       478421 non-null  object
 2   Sub-product                   339948 non-null  object
 3   Issue                         478421 non-null  object
 4   Sub-issue                     185796 non-null  object
 5   Consumer complaint narrative  75094 non-null   object
 6   Company public response       90392 non-null   object
 7   Company                       478421 non-null  object
 8   State                         474582 non-null  object
 9   ZIP code                      474573 non-null  object
 10  Tags                          67206 non-null   object
 11  Consumer consent provided?    135487 non-null  object
 12  Submitted via                 478421 non-null  object
 13 

In [4]:
# Frequency of each distinct value in the 'Issue' column, most common first.
df['Issue'].value_counts()

Loan modification,collection,foreclosure    80302
Incorrect information on credit report      58527
Loan servicing, payments, escrow account    51403
Cont'd attempts collect debt not owed       36367
Account opening, closing, or management     23568
                                            ...  
Lost or stolen check                           20
Incorrect exchange rate                        13
Lender damaged or destroyed vehicle             5
Lender sold the property                        2
Lender damaged or destroyed property            1
Name: Issue, Length: 95, dtype: int64

In [5]:
# Preprocessing plan, kept as a bare string literal (so the cell echoes its
# repr as output; a markdown cell would avoid that).
# NOTE(review): the P-numbers in this plan do not line up with the p1..p5
# pipelines defined in the next cell: P4 here lists columns to drop, while
# p4 there wraps cyclic_features(); P5 here is empty, while p5 there wraps
# dt_difference(). Confirm which numbering is intended.
'''
P1:
Date received, Date sent to company: Extract weekday , month , day of month & create cyclic features 


P2:
Product, Issue, Company public response, Company, State, Tags,Consumer consent provided?, 
Submitted via, Company response to consumer
        : Convert to dummy variables, impute missing values, convert to numeric


P3:
Timely response?, Consumer disputed?: Convert to 1 & 0 values, impute missing values, convert to numeric


P4:
Consumer complaint narrative, Complaint ID, ZIP code,Sub-issue   : DROP

P5:

'''

'\nP1:\nDate received, Date sent to company: Extract weekday , month , day of month & create cyclic features \n\n\nP2:\nProduct, Issue, Company public response, Company, State, Tags,Consumer consent provided?, \nSubmitted via, Company response to consumer\n        : Convert to dummy variables, impute missing values, convert to numeric\n\n\nP3:\nTimely response?, Consumer disputed?: Convert to 1 & 0 values, impute missing values, convert to numeric\n\n\nP4:\nConsumer complaint narrative, Complaint ID, ZIP code,Sub-issue   : DROP\n\nP5:\n\n'

In [6]:
# Column groups consumed by the sub-pipelines below. Each VarSelector receives
# its own copy of the list so that no two selectors share a list object.
date_cols = ['Date received', 'Date sent to company']
categorical_cols = [
    'Product', 'Issue', 'Company public response', 'Company', 'State', 'Tags',
    'Consumer consent provided?', 'Submitted via', 'Company response to consumer',
]
flag_cols = ['Timely response?', 'Consumer disputed?']

# p1: date columns -> datetime_conversion().
# It is built here but not included in the union (its entry is commented out below).
p1 = pdPipeline([
    ('var_selector', VarSelector(list(date_cols))),
    ('dt_conversion', datetime_conversion()),
])

# p2: categorical columns -> MissingValues() -> convert_to_dummy(15000).
# NOTE(review): 15000 is passed positionally; presumably a frequency cutoff for
# creating a dummy column -- confirm against mypipes_consumer.
p2 = pdPipeline([
    ('var_selector', VarSelector(list(categorical_cols))),
    ('missing_trt', MissingValues()),
    ('create_dummies', convert_to_dummy(15000)),
])

# p3: the two '...?' flag columns -> bool_to_int().
p3 = pdPipeline([
    ('var_selector', VarSelector(list(flag_cols))),
    ('bool_conversion', bool_to_int()),
])

# p4: date columns -> cyclic_features().
p4 = pdPipeline([
    ('var_selector', VarSelector(list(date_cols))),
    ('cyclic_conversion', cyclic_features()),
])

# p5: date columns -> dt_difference().
p5 = pdPipeline([
    ('var_selector', VarSelector(list(date_cols))),
    ('dt_diff', dt_difference()),
])

# Named sub-pipelines whose outputs the union combines, in this order.
union_parts = [
    # ('dt', p1),
    ('obj_to_dum', p2),
    ('bool_to_num', p3),
    ('dt_to_cyclic', p4),
    ('dt_to_diff', p5),
]
data_pipe = FeatureUnion(union_parts)


In [7]:
# Fit every sub-pipeline in the union on the raw frame.
data_pipe.fit(df)

FeatureUnion(transformer_list=[('obj_to_dum',
                                pdPipeline(steps=[('var_selector',
                                                   VarSelector(feature_names=['Product',
                                                                              'Issue',
                                                                              'Company '
                                                                              'public '
                                                                              'response',
                                                                              'Company',
                                                                              'State',
                                                                              'Tags',
                                                                              'Consumer '
                                                                              'consent '
   

In [8]:
# Number of feature names reported by the fitted union.
len(data_pipe.get_feature_names())

63

In [9]:
# Run the fitted union over the raw frame and label the result with the
# feature names the union reports.
# NOTE(review): if FeatureUnion here is scikit-learn's, `get_feature_names`
# was removed in scikit-learn 1.2 in favour of `get_feature_names_out`;
# confirm the installed version before upgrading.
transformed = data_pipe.transform(df)
feature_names = data_pipe.get_feature_names()
df_train = pd.DataFrame(data=transformed, columns=feature_names)

In [10]:
# Fit the dt_difference pipeline on its own so its output can be inspected below.
p5.fit(df)

pdPipeline(steps=[('var_selector',
                   VarSelector(feature_names=['Date received',
                                              'Date sent to company'])),
                  ('dt_diff', dt_difference())])

In [11]:
# Output of the stand-alone p5 pipeline; later cells read its 'date_difference' column.
x = p5.transform(df)

In [12]:
# Double-bracket selection yields a one-column DataFrame rather than a Series.
type(df[['Date received']])

pandas.core.frame.DataFrame

In [13]:
# Distribution of the computed date differences, most common value first.
x['date_difference'].value_counts()

0      209750
1       58939
2       38005
3       30711
4       28639
        ...  
993         1
346         1
382         1
328         1
626         1
Name: date_difference, Length: 398, dtype: int64

In [14]:
# Display the transformed frame.
# NOTE(review): `x.head()` would give a bounded preview and a more descriptive
# variable name would help readers; left unchanged because other cells use `x`.
x

Unnamed: 0,date_difference
0,1
1,6
2,21
3,0
4,8
...,...
478416,1
478417,1
478418,0
478419,3
