In [None]:
#!pip install torch

In [1]:
import json
import pandas as pd
import numpy as np
import joblib 
import re
import xgboost as xgb
import matplotlib.pyplot as plt
import io, os
from os import path
import datetime as dt
import pickle as pkl
import random
from tqdm import tqdm
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from datetime import datetime
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, confusion_matrix, precision_recall_curve, matthews_corrcoef
from sklearn.utils import resample
from sklearn.model_selection import cross_val_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

from transformers import AutoTokenizer, AutoModel
import torch

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows',200)

In [2]:
def build_dataset():
    data = [json.loads(x) for x in open("MLA_100k.jsonlines")]
    target = lambda x: x.get("condition")
    N = -10000
    X_train = data[:N]
    X_test = data[N:]
    y_train = [target(x) for x in X_train]
    y_test = [target(x) for x in X_test]
    for x in X_test:
        del x["condition"]
    return X_train, y_train, X_test, y_test

def normalize_data(X, y):    
    y = pd.Series(y).replace({'new':1, 'used':0}).astype(int).copy()
    print('Normalizando json...')
    X = pd.json_normalize(X)
    return X, y

def formatear_utf_8(s):    
    '''Remueve caracteres y encodea a utf-8'''    
    import unicodedata
    try:
        text = unicode(text, 'utf-8')
    except NameError: 
        pass    
    
    s = s.lower()
    # Replace ips
    s = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' _ip_ ', s)
    # Isolate punctuation
    s = re.sub(r'([\'\"\.\(\)\!\?\-\\\/\,])', r' \1 ', s)
    # Remove some special characters
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    # Replace numbers and symbols with language
    s = s.replace('&', ' y ')
    s = s.replace('@', ' arroba ')
    s = s.replace('0', ' cero ')
    s = s.replace('1', ' uno')
    s = s.replace('2', ' dos ')
    s = s.replace('3', ' tres ')
    s = s.replace('4', ' cuatro ')
    s = s.replace('5', ' cinco ')
    s = s.replace('6', ' seis ')
    s = s.replace('7', ' siete ')
    s = s.replace('8', ' ocho ')
    s = s.replace('9', ' nueve ')
    s = re.sub('[^A-Za-z0-9]+', ' ', unicodedata.normalize('NFD', s).encode('ascii', 'ignore').decode("utf-8"))    
    return str( s)


def warranty(df_flatted):
    '''
    Quick feature processing for warranty.
    - Fills missing values with 'sin_datos'
    - Standardizes values and categorizes them
    
    Parameters:
    df_flatted (DataFrame): The flattened DataFrame with a 'warranty' column.
    
    Returns:
    DataFrame: The DataFrame with the processed 'warranty' column.
    '''
    print('Procesando warranty...')
    
    # List of known warranty categories
    lista_warranty = ['sin_datos', 'si', 'sin garantia']
    
    # Fill NaN values, convert to lowercase, and apply utf-8 formatting
    df_flatted['warranty'] = df_flatted['warranty'].fillna('sin_datos').str.lower().apply(formatear_utf_8)
    
    # Define mapping rules
    mapping_rules = [
        ('fabr', 'si'),
        ('meses', 'si'),
        ('reputacion', 'sin garantia'),
        ('con garantia', 'si'),
        ('garantia total', 'si'),
        ('ano', 'si')
    ]
    
   
    def map_warranty(warranty):
        for substring, replacement in mapping_rules:
            if substring in warranty:
                return replacement
        return warranty
    
    df_flatted['warranty'] = df_flatted['warranty'].apply(map_warranty)    
    df_flatted['warranty'] = df_flatted['warranty'].apply(lambda x: x if x in lista_warranty else 'otros')
    
    return df_flatted


def tags(df_flatted):
    '''
    exploto tags
    dropeo duplicados de los casos con doble tag y me quedo con el de la clase minoritaria (para mas info)
    lleno nan con no_info
    
    '''
    print('Procesando tags...')
    list_tags = ['dragged_bids_and_visits', 'no_info']
    
    tags_df = df_flatted['tags'].explode()
    tags_df.fillna('no_info', inplace=True)
    tags_df = tags_df.apply(lambda x: x if x in list_tags else 'otros')
    df_flatted['tags'] = tags_df[~tags_df.index.duplicated(keep='first')]
    
    return df_flatted

def process_title(df):
    
    print('Formateando features a utf-8...')    
    df_nlp = df.copy()         
    shape_init = len(df_nlp)
    
    #renombro nan a sin datos
    df_nlp['title'] = df_nlp['title'].fillna(value='nan')
    
    #paso todo a lower case
    df_nlp['title'] = df_nlp['title'].str.lower()    
    
    #formateo todo a utf-8
    df_nlp['title'] = df_nlp['title'].apply(lambda x: formatear_utf_8(x))    
    
    if shape_init - len(df_nlp) != 0:
        print('se perdieron registros, chequear')  
    return df_nlp


def listings_type_id(df_flatted):
    
    print('Procesando listings...')
    
    df_flatted['listing_type_id'] = df_flatted['listing_type_id'].apply(lambda x: 'gold' if 'gold' in x else x).copy()

    df_flatted['listing_type_id'].replace({'free':1,
                                           'bronze':2,
                                           'silver':3,
                                           'gold':4}, inplace=True)   
    return df_flatted


# Cargar la data desde json

In [25]:
print("Loading dataset...")

X_train, y_train, X_test, y_test = build_dataset()

X_train, y_train = normalize_data(X_train, y_train)
X_test, y_test = normalize_data(X_test, y_test) 

Loading dataset...
Normalizando json...
Normalizando json...


In [26]:
X_train['condition'] = X_train['condition'].replace({'new':1, 'used':0})
X_train = process_title(X_train)
X_train = tags(X_train)
X_train = warranty(X_train)
X_train = listings_type_id(X_train)

Formateando features a utf-8...
Procesando tags...
Procesando warranty...
Procesando listings...


In [27]:
useful_cols = ['warranty', 'listing_type_id', 'price', 'buying_mode','category_id','title','condition','tags','initial_quantity']
X_train = X_train[useful_cols]

## probamos creando embeddings para title

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

In [None]:
def get_embeddings(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

In [None]:
if torch.cuda.is_available():
    print("GPU is available and being used by PyTorch.")
    print(f"Number of GPUs available: {torch.cuda.device_count()}")
    print(f"Current GPU device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name(torch.cuda.current_device())}")
else:
    print("No GPU available, PyTorch is using the CPU.")

In [None]:
embeddings = []
for title in tqdm(X_train['title']):
    embedding = get_embeddings(title, tokenizer, model)
    embeddings.append(embedding.squeeze().numpy())

# Convert embeddings list to a DataFrame
embeddings_df = pd.DataFrame(embeddings)

# Concatenate the embeddings DataFrame with the original DataFrame
df_with_embeddings = pd.concat([X_train, embeddings_df], axis=1)

In [None]:
df_with_embeddings.to_csv('dataset_embeddings.csv')

In [None]:
df_with_embeddings

## Probamos random forest simple

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [33]:
# Separate features and target
X = X_train.copy().drop('condition', axis=1)
y = X_train['condition']

X.drop(['category_id', 'title'], axis=1, inplace=True)

categorical_features = X.select_dtypes(include=['object']).columns.tolist()

encoder = OneHotEncoder(sparse=False, drop='first')
X_encoded = encoder.fit_transform(X[categorical_features])

X_encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(categorical_features))

X_encoded_df.columns = X_encoded_df.columns.astype(str)

X = X.drop(categorical_features, axis=1)
X = pd.concat([X, X_encoded_df], axis=1)

X.columns = X.columns.astype(str)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
# Initialize and train the RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict on the test set
y_pred = rf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.82


In [36]:
import shap

In [None]:
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)

In [None]:
shap.summary_plot(shap_values, X_test, plot_type="bar")


## Usando PyCaret

In [24]:
from pycaret.classification import *
clf1 = setup(X_train, target = 'condition', session_id=123, log_experiment=True, experiment_name='pycaret_primer_run')

ValueError: Invalid value for the target parameter. Column condition not found in the data.

In [38]:
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dummy,Dummy Classifier,0.5372,0.5,1.0,0.5372,0.699,0.0,0.0,0.203
ridge,Ridge Classifier,0.5366,0.0,0.9936,0.5372,0.6973,-0.0003,-0.0019,0.166
lda,Linear Discriminant Analysis,0.5366,0.5017,0.9936,0.5372,0.6973,-0.0003,-0.0019,0.216
qda,Quadratic Discriminant Analysis,0.5273,0.4986,0.8825,0.536,0.655,-0.0028,-0.0011,0.171
nb,Naive Bayes,0.5222,0.5029,0.7987,0.5048,0.559,-0.0002,-0.0008,0.18
knn,K Neighbors Classifier,0.5188,0.5003,0.7655,0.5366,0.6309,-0.0021,-0.0023,0.444
gbc,Gradient Boosting Classifier,0.5144,0.4975,0.7065,0.5366,0.5969,-0.0023,-0.0028,0.673
rf,Random Forest Classifier,0.5092,0.4987,0.6391,0.5362,0.5819,-0.0025,-0.0025,0.374
lightgbm,Light Gradient Boosting Machine,0.5089,0.5009,0.6038,0.5381,0.5639,0.0027,0.0028,0.24
ada,Ada Boost Classifier,0.508,0.4973,0.6337,0.5351,0.5705,-0.0042,-0.0042,0.359


In [39]:
rf = create_model('rf')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5146,0.5068,0.685,0.5378,0.6025,0.0019,0.002
1,0.514,0.5027,0.6696,0.5382,0.5968,0.003,0.0031
2,0.5097,0.5024,0.6448,0.5362,0.5855,-0.0024,-0.0024
3,0.4984,0.4916,0.5173,0.5343,0.5257,-0.0062,-0.0062
4,0.5075,0.4973,0.6428,0.5346,0.5838,-0.007,-0.0072
5,0.5029,0.4957,0.6242,0.5318,0.5743,-0.0141,-0.0143
6,0.5049,0.4948,0.5737,0.5368,0.5546,-0.0013,-0.0013
7,0.5097,0.4957,0.6363,0.5369,0.5824,-0.0011,-0.0011
8,0.5192,0.5012,0.7477,0.5378,0.6256,0.0016,0.0018
9,0.5112,0.4984,0.6498,0.5373,0.5882,0.0001,0.0001


In [None]:
interpret_model(rf, plot='summary')

In [None]:
import mlflow
mlflow.set_tracking_uri('http://127.0.0.1:5000')

In [None]:
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
profile = ProfileReport(data, title="Profiling Report")

In [None]:
profile.to_file("problem_report.html")