In [41]:
import numpy as np
import pandas as pd
from sklearn.externals import joblib

from sklearn.model_selection import train_test_split

# http://contrib.scikit-learn.org/categorical-encoding/
# pip install category_encoders
from category_encoders import OneHotEncoder

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.preprocessing import Imputer

from sklearn.pipeline import make_union, make_pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.base import BaseEstimator, TransformerMixin

In [4]:
df = pd.read_pickle("postulaciones_visitas_train.pkl")
avisos = pd.read_pickle("avisos.pkl")
postulantes = pd.read_pickle("postulantes.pkl")
visitas_train = pd.read_pickle("visitas_train.pkl")
visitas_test = pd.read_pickle("visitas_test.pkl")

In [5]:
df.sample(5)

Unnamed: 0,idaviso,idpostulante,target,visita_cantidad
1137844,1112257621,5YzOM,False,0.0
1475604,1112349430,wVkrbZe,False,0.0
805525,1112427118,NwzLoV,True,1.0
2452063,1112438787,eBGqj8,True,1.0
1706601,1112296054,8Z1MML,False,0.0


In [6]:
class ItemSelector(BaseEstimator, TransformerMixin):
    # See http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html#example-hetero-feature-union-py

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

class FillNaN(BaseEstimator, TransformerMixin):

    def __init__(self, replace=0):
        self.replace = replace

    def fit(self, x, y=None):
        return self

    def transform(self, serie):
        return serie.fillna(self.replace)

class Pass(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X


In [8]:
df = pd.merge(df, avisos, how='left', on='idaviso')
df = pd.merge(df, postulantes, how='left', on='idpostulante')

In [9]:
df.sample(5)

Unnamed: 0,idaviso,idpostulante,target,visita_cantidad,titulo,descripcion,nombre_zona,tipo_de_trabajo,nivel_laboral,nombre_area,denominacion_empresa,sexo,edad,educacion_nivel,educacion_estado,educacion
1414262,1112451640,PmGQd2v,True,1.0,Diseñadora/r Gráfica/o para Importante Estudio...,<p>La búsqueda se orienta a candidatos recibid...,Gran Buenos Aires,Full-time,Senior / Semi-Senior,Diseño,Manpower,False,29.0,Universitario,Graduado,11.0
1548694,1112303484,96MEpKa,False,0.0,Responsable del Departamento de Sistemas Infor...,"<p style=""""><span style="""">Esta persona será r...",Gran Buenos Aires,Full-time,Senior / Semi-Senior,Tecnologia / Sistemas,FUNDACION INSTITUTO DE TECNOLOGIAS NUCLEARES P...,False,26.0,Otro,En Curso,4.0
969151,1112438897,MVrMkZp,False,0.0,Administrador de Red y Comunicaciones,"<p style=""""><span style="""">En Iké Asistencia A...",Gran Buenos Aires,Full-time,Senior / Semi-Senior,Redes,Iké Asistencia Argentina,True,59.0,Secundario,Graduado,2.0
1266474,1112423910,bOjQRo0,False,0.0,Coordinador de Servicios de Limpieza,"<p>Importante empresa de servicios, ubicada en...",Gran Buenos Aires,Full-time,Jefe / Supervisor / Responsable,Planeamiento comercial,DECIDE SRL,True,27.0,Secundario,Graduado,2.0
1264196,1111806483,bO435R0,False,0.0,MUSIMUNDO sucursal Escobar Jumbo busca VENDEDORES,<p>Para nuestra sucursal de Escobar Jumbo sele...,Gran Buenos Aires,Full-time,Senior / Semi-Senior,Ventas,Musimundo SA,True,33.0,Otro,Graduado,5.0


In [10]:
pipeline = make_pipeline(
    make_union(
        ItemSelector(['sexo']),
        ItemSelector(['visita_cantidad']),
        make_pipeline(
            #ItemSelector(['edad', 'educacion']),
            ItemSelector(['edad']),
            Imputer()
        ),
        make_pipeline(
            ItemSelector(['educacion_nivel', 'educacion_estado', 'nombre_zona', 'tipo_de_trabajo', 'nivel_laboral', 'nombre_area']),#, 'denominacion_empresa']),
            FillNaN('Other'),
            OneHotEncoder()
        )
    )
)

In [11]:
X = pipeline.fit_transform(df)
y = df['target']

In [12]:
X.shape

(2797443, 239)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=2018)

In [14]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [15]:
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.99986130196
             precision    recall  f1-score   support

      False       1.00      1.00      1.00    349680
       True       1.00      1.00      1.00    349681

avg / total       1.00      1.00      1.00    699361

[[349583     97]
 [     0 349681]]


In [43]:
# Save pipeline and model
joblib.dump(pipeline, "pipeline.gz")
joblib.dump(model, "model.gz")

['model.gz']