# Recommendation System

#Importing the Libraries

In [48]:
import os
os.chdir('../src/')

import pandas
import numpy
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score


import utilities
from importlib import reload
import utilities
utilities = reload(utilities)

#Loading Data

In [20]:
MAX_ROWS = 10000
df_raw_train = pandas.read_csv("../data/raw/train_ver2.csv", nrows=MAX_ROWS)
df_raw_test = pandas.read_csv("../data/raw/test_ver2.csv", nrows=MAX_ROWS )

In [3]:
df_raw_train.shape

(10000, 48)

In [4]:
df_raw_train.iloc[0]

fecha_dato                      2015-01-28
ncodpers                           1375586
ind_empleado                             N
pais_residencia                         ES
sexo                                     H
age                                     35
fecha_alta                      2015-01-12
ind_nuevo                              0.0
antiguedad                               6
indrel                                 1.0
ult_fec_cli_1t                         NaN
indrel_1mes                            1.0
tiprel_1mes                              A
indresi                                  S
indext                                   N
conyuemp                               NaN
canal_entrada                          KHL
indfall                                  N
tipodom                                1.0
cod_prov                              29.0
nomprov                             MALAGA
ind_actividad_cliente                  1.0
renta                              87218.1
segmento   

In [5]:
df_raw_train.isnull().sum()

fecha_dato                   0
ncodpers                     0
ind_empleado                50
pais_residencia             50
sexo                        50
age                          0
fecha_alta                  50
ind_nuevo                   50
antiguedad                   0
indrel                      50
ult_fec_cli_1t            9988
indrel_1mes                 50
tiprel_1mes                 50
indresi                     50
indext                      50
conyuemp                 10000
canal_entrada               50
indfall                     50
tipodom                     50
cod_prov                    54
nomprov                     54
ind_actividad_cliente       50
renta                     1899
segmento                    50
ind_ahor_fin_ult1            0
ind_aval_fin_ult1            0
ind_cco_fin_ult1             0
ind_cder_fin_ult1            0
ind_cno_fin_ult1             0
ind_ctju_fin_ult1            0
ind_ctma_fin_ult1            0
ind_ctop_fin_ult1            0
ind_ctpp

In [23]:
columns = ['fecha_dato', 'ncodpers', 'sexo', 'antiguedad', 'age', 'renta'] + list(df_raw_train.columns[24:-4])
df_train = df_raw_train[columns].copy()

# Cleaning antiguedad
df_train = utilities.impute_median(df_train, 'antiguedad')
# Cleaning age
df_train = utilities.impute_median(df_train, 'age')
# cleaning sex
df_train['sexo'] = df_train['sexo'].fillna('H')
# clean income imputing median
df_train['renta'] = df_train['renta'].fillna(df_train['renta'].dropna().median())

# make dummies for sexo
df_train['sexo'] = pandas.get_dummies(df_train['sexo'], drop_first=True)

In [24]:
df_train.isnull().sum()

fecha_dato           0
ncodpers             0
sexo                 0
antiguedad           0
age                  0
renta                0
ind_ahor_fin_ult1    0
ind_aval_fin_ult1    0
ind_cco_fin_ult1     0
ind_cder_fin_ult1    0
ind_cno_fin_ult1     0
ind_ctju_fin_ult1    0
ind_ctma_fin_ult1    0
ind_ctop_fin_ult1    0
ind_ctpp_fin_ult1    0
ind_deco_fin_ult1    0
ind_deme_fin_ult1    0
ind_dela_fin_ult1    0
ind_ecue_fin_ult1    0
ind_fond_fin_ult1    0
ind_hip_fin_ult1     0
ind_plan_fin_ult1    0
ind_pres_fin_ult1    0
ind_reca_fin_ult1    0
ind_tjcr_fin_ult1    0
ind_valo_fin_ult1    0
dtype: int64

In [25]:
df_train.columns

Index(['fecha_dato', 'ncodpers', 'sexo', 'antiguedad', 'age', 'renta',
       'ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
       'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
       'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
       'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
       'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
       'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
       'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1'],
      dtype='object')

In [26]:
X_train = df_train.loc[:, ['sexo', 'antiguedad', 'age', 'renta']]
y_train = df_train['ind_ahor_fin_ult1']

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)

In [45]:
pred = clf.predict(X_train)

In [46]:
pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [28]:
product_columns = [c for c in df_train.columns if c.startswith('ind') and c.endswith('ult1')]

models = []

known_variable = []
for p in product_columns:
    X_train = df_train.loc[:, ['sexo', 'antiguedad', 'age', 'renta']]
    y_train = df_train[p]

    clf = RandomForestClassifier(max_depth=2, random_state=0)
    clf.fit(X_train, y_train)
    models.append(clf)
    known_variable = X_train.columns

In [29]:
df_test = df_raw_test[known_variable].copy()
df_test.isnull().sum()

# Cleaning antiguedad
# df_test = impute_median(df_test, 'antiguedad')
# Cleaning age
# df_test = impute_median(df_test, 'age')
# cleaning sex
df_test['sexo'] = df_test['sexo'].fillna('H')
# clean income imputing median
df_test = utilities.impute_median(df_test, 'renta')

# make dummies for sexo
df_test['sexo'] = pandas.get_dummies(df_test['sexo'], drop_first=True)

In [30]:
models[3].predict(df_test[known_variable])

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [31]:
collect_predictions = []
for m in models:
    collect_predictions.append(m.predict(df_test[known_variable]))

In [32]:
collect_predictions

[array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64)]

In [50]:


#Cross Validation for Randam forest classifier

#clf = RandomForestClassifier(max_depth=2, random_state=0)
cv_results = cross_validate(clf, X_train, y_train, cv=10, scoring=['r2'])
scores = cv_results['test_r2']
print(f"Scores = {scores}, {scores.mean()}, {scores.std()}")


Scores = [-0.00300903 -0.00300903 -0.00300903 -0.00300903 -0.00300903 -0.00300903
 -0.00300903 -0.00300903 -0.00401606 -0.00401606], -0.0032104345164003954, 0.00040281487031377237


In [52]:
#model Evaluation
rforest_acc = accuracy_score(pred, df_test[known_variable])
print("Test Accuracy score is:", rforest_acc*100)


ValueError: Classification metrics can't handle a mix of binary and multiclass-multioutput targets

In [53]:
df_predictions = pandas.DataFrame(numpy.array(collect_predictions).T, columns=product_columns)
df_predictions['ncodpers'] = df_raw_test['ncodpers']
result = []

for df_p in df_predictions.iterrows():
    cur_row = [df_p[1]['ncodpers'], ' '.join(list(df_p[1][df_p[1] == 1].keys()))]
    result.append(cur_row)

In [35]:
pandas.DataFrame(result, columns=['ncodpers', 'added_products']).to_csv("../data/final/submission.csv", index=False)

Logistic Regression Model Tranning:

In [36]:
logreg = LogisticRegression(random_state=0)
logreg.fit(X_train, y_train)

In [54]:
pred1 = logreg.predict(X_train)

In [55]:
pred1

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [38]:
product_columns = [c for c in df_train.columns if c.startswith('ind') and c.endswith('ult1')]

models = []

known_variable = []
for p in product_columns:
    X_train = df_train.loc[:, ['sexo', 'antiguedad', 'age', 'renta']]
    y_train = df_train[p]

    clf = RandomForestClassifier(max_depth=2, random_state=0)
    clf.fit(X_train, y_train)
    models.append(clf)
    known_variable = X_train.columns

In [39]:
df_test = df_raw_test[known_variable].copy()
df_test.isnull().sum()

# Cleaning antiguedad
# df_test = impute_median(df_test, 'antiguedad')
# Cleaning age
# df_test = impute_median(df_test, 'age')
# cleaning sex
df_test['sexo'] = df_test['sexo'].fillna('H')
# clean income imputing median
df_test = utilities.impute_median(df_test, 'renta')

# make dummies for sexo
df_test['sexo'] = pandas.get_dummies(df_test['sexo'], drop_first=True)

In [40]:
models[3].predict(df_test[known_variable])

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [41]:
collect_predictions = []
for m in models:
    collect_predictions.append(m.predict(df_test[known_variable]))

In [42]:
collect_predictions

[array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64)]

In [43]:
#Cross Validation for Logistic Regression Model


cv_results = cross_validate(logreg, X_train, y_train, cv=10, scoring=['r2'])
scores = cv_results['test_r2']
print(f"Scores = {scores}, {scores.mean()}, {scores.std()}")


Scores = [-0.00300903 -0.00300903 -0.00300903 -0.33734537 -0.33734537 -0.00300903
 -0.33734537 -0.00300903 -0.00401606 -0.00401606], -0.10351133722452473, 0.1530807807254734


In [56]:
#model Evaluation
logreg_acc = accuracy_score(pred1, df_test[known_variable])
print("Test Accuracy score is:", logreg_acc*100)

ValueError: Classification metrics can't handle a mix of binary and multiclass-multioutput targets

In [44]:
df_predictions = pandas.DataFrame(numpy.array(collect_predictions).T, columns=product_columns)
df_predictions['ncodpers'] = df_raw_test['ncodpers']
result = []

for df_p in df_predictions.iterrows():
    cur_row = [df_p[1]['ncodpers'], ' '.join(list(df_p[1][df_p[1] == 1].keys()))]
    result.append(cur_row)

In [57]:
pandas.DataFrame(result, columns=['ncodpers', 'added_products']).to_csv("../data/final/submission1.csv", index=False)