In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from classification_model.config import numerical_features, categorical_features,\
                                        date_features, features, new_numerical_features
from classification_model.evaluation import generate_report, confusion_matrix,\
                                            calculate_metrics, metrics_summary
from classification_model.custom_pipeline import CalculateAntiquity, NumberChannels, ConvertDtypes,\
                                                 ColumnSelector, GetDummies, GetDataFrame, Proportion

In [2]:
# Read the train/test data files and their label files; each one is a
# semicolon-delimited CSV under data/.  Files are read in the order listed and
# unpacked positionally into the four names on the left.
train_data, y_train, test_data, y_test = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        'data/train_data.csv',
        'data/train_label.csv',
        'data/test_data.csv',
        'data/test_label.csv',
    )
)

In [3]:
# Preview the first two rows of the training data.
# NOTE(review): the rendered output includes an 'Unnamed: 0' column, which
# suggests the CSV was written with its index -- confirm whether that column
# is intended to be kept.
train_data.head(2)

Unnamed: 0,person_id,time,offer_id,amount,gender,age,became_member_on,income,difficulty,duration,offer_type,web,mobile,email,social,reward
0,374007b9d7d547f0ba956cf84039ca8f,408,f19421c1d4aa40978ebb69ca19b0e20d,0.0,M,59.0,20161015.0,83000.0,5.0,5.0,bogo,1.0,1.0,1.0,1.0,0
1,f395ba6ec7d64a5880ceef5aefb7a23a,12,ae264e3637204a6fb9bb56bc8210ddfd,0.0,F,55.0,20180122.0,75000.0,10.0,7.0,bogo,0.0,1.0,1.0,1.0,0


In [4]:
# Steps applied to the whole frame before any per-type column selection.
# The transformers are project classes from classification_model.custom_pipeline;
# the per-step notes below are inferred from their names and arguments only --
# TODO confirm against that module.
general_transformations = Pipeline(steps=[
    # Presumably casts each configured column group to its intended dtype.
    ('dtypes', ConvertDtypes(
        numerical=numerical_features,
        categorical=categorical_features,
        date=date_features,
    )),
    # Presumably derives a channel count from the four channel flag columns.
    ('number_channels', NumberChannels(
        columns=['mobile', 'web', 'social', 'email'],
    )),
    # Operates on the first configured date feature only.
    ('antiquity', CalculateAntiquity(column=date_features[0])),
    # Presumably adds a reward / difficulty ratio feature.
    ('proportion', Proportion(
        numerator='reward',
        denominator='difficulty',
    )),
])

# Numerical branch: pick the numerical columns, standardise them, then hand the
# result to GetDataFrame with the same column list (presumably to restore a
# labelled DataFrame after scaling -- TODO confirm in custom_pipeline).
numerical_transformations = Pipeline(steps=[
    ('selector', ColumnSelector(columns=new_numerical_features)),
    ('scaler', StandardScaler()),
    ('df', GetDataFrame(columns=new_numerical_features)),
])

# Categorical branch: pick the categorical columns, then pass them through
# GetDummies (presumably one-hot encoding, per the 'ohe' step name -- TODO
# confirm in custom_pipeline).
categorical_transformations = Pipeline(steps=[
    ('selector', ColumnSelector(columns=categorical_features)),
    ('ohe', GetDummies(columns=categorical_features)),
])

# Full preprocessing pipeline:
#   1. run the frame-level transformations,
#   2. feed the result to the numerical and categorical branches and
#      concatenate their outputs with FeatureUnion,
#   3. pass the combined output to GetDataFrame with the `features` column list
#      (presumably to label the concatenated columns -- TODO confirm that
#      `features` matches the branch output order).
preprocessor = Pipeline(steps=[
    ('transformations', general_transformations),
    ('features', FeatureUnion(transformer_list=[
        ('numerical', numerical_transformations),
        ('categorical', categorical_transformations),
    ])),
    ('df', GetDataFrame(columns=features)),
])
