# Summary
* Load corpus.
* Load the corpus features, the list of cluster ids, and the label of each cluster.
* Prepare the data to train the classifier.

In [None]:
from ipynb.fs.full.logistic_regression_functions import *
from ipynb.fs.full.pre_processing_functions import process_sentence
from ipynb.fs.full import paths

### Load Corpus 

In [None]:
# Load the corpus. `data` is later iterated as a collection of token sequences
# (each one is joined with spaces to rebuild the sentence); `original_data` is
# not used in the cells visible in this notebook.
data, original_data = load_corpus()

### Get features of corpus and labels from kmeans

In [None]:
# Features to train the classifier (looked up per sentence with `.loc[ind]`
# when the training frame is built).
data_feature = pd.read_pickle(paths.DF_PATH)
# Ids of the cluster for each sentence (read from column 0 when the training
# frame is built).
cluster_ids = pd.read_pickle(paths.IDS_CLUSTER_PATH)
# Classes for the classifier: one label per line of the labels file.
# NOTE(review): the file is opened with the platform default encoding —
# confirm it matches how the labels file was written.
with open(paths.CLUSTER_LABELS_PATH, 'r') as file:
    labels = file.read().splitlines()

### Prepare data to train the classifier

In [None]:
# Pair each sentence's feature vector with the cluster id of that sentence.
# Rows are looked up by label, for every label in 0..len(data_feature)-1.
text_label = [
    [data_feature.loc[row].values, cluster_ids.loc[row, 0]]
    for row in range(len(data_feature))
]

# Training frame whose 'text' column holds the feature vectors.
df = pd.DataFrame(text_label, columns=['text', 'label'])

# Rebuild every sentence as one space-separated string.
data_as_str = list(map(' '.join, data))

# Same pairing as above, but with the sentence string in place of the vector.
text_as_str_label = [
    [data_as_str[row], cluster_ids.loc[row, 0]]
    for row in range(len(data_feature))
]
df_with_str = pd.DataFrame(text_as_str_label, columns=['text', 'label'])

### Train model with function 1

In [None]:
# NOTE(review): `field` and `top_k` are set here but are not passed to
# train_model below — confirm whether they are still needed.
field='text'
top_k=3

# Train on the vector-feature frame `df`; the call returns the model together
# with the accuracy and MRR values that are printed afterwards.
model,accuracy,mrr_at_k=train_model(df)
print("\nAccuracy={0}; MRR={1}".format(accuracy,mrr_at_k))

### Train model with function 2

In [None]:
# NOTE(review): `field` and `top_k` are set here but are not passed to
# train_model_with_transformer below — confirm whether they are still needed.
field='text'
top_k=3
# Feature representation handed to the transformer: 'binary', 'counts' or 'tf-idf'.
feature_rep = 'binary'

# Train on the sentence-string frame; the call returns the model, the fitted
# feature transformer, and the accuracy and MRR of this run.
model, feature_transformer, accuracy, mrr = train_model_with_transformer(df_with_str, feature_rep=feature_rep)
# Report the MRR returned by this run (`mrr`), not the `mrr_at_k` value left
# over from the previous training cell.
print("\nAccuracy={0}; MRR={1}".format(accuracy, mrr))

## Some examples

In [None]:
# Hand-written Spanish sentences used to eyeball the classifier's predictions.
samples = [
    'Hola vamos a comprar dolares pesos bonos lo que dinero',
    'Che a la tarde voy a tu casa a tomar unos mates y estudiamos para el parcial',
    'Feliz cumpleaños numero 100 Martin , espero que pases un lindo dia ahi en el PAMI',
    'Hola tio , estamos pensando para este fin de semana comer un asado en la casa de los abuelos , nos vemos alla',
    'Antes de irme , tengo que presentar todos los documentos para sacar la visa de trabajo , me piden un monton de certificados',
]

for sample in samples:
    # Preprocess the sentence and encode it with the transformer returned by
    # train_model_with_transformer.
    # NOTE(review): transform() receives the raw output of process_sentence —
    # if that is a token list rather than a single string, confirm the
    # transformer is getting the input shape it expects.
    test_features = feature_transformer.transform(process_sentence(sample))
    # Ask for the top 2 predictions (hard-coded; `top_k` is not used here).
    labels_ind = get_top_k_predictions(model, test_features, 2)
    print('Sample: ', sample)
    # Translate the predicted indices of the first row into cluster labels.
    print('Los labels para este ejemplo son:', [labels[x] for x in labels_ind[0]])
    print('')

### Save model and transformer

In [None]:
# Both artifacts are needed at prediction time:
# - the transformer encodes/vectorizes new text with the settings used here,
# - the model produces the predictions.
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the pickle has been written.
with open(paths.CLASSIFIER_MODEL_PATH, 'wb') as model_file:
    pickle.dump(model, model_file)
with open(paths.TRANSFORMER_PATH, 'wb') as transformer_file:
    pickle.dump(feature_transformer, transformer_file)