In [1]:
import sys
sys.path.append('..')

import data
import re
import random

In [2]:
# Connect to the database and load its contents into `df`.
# NOTE(review): `data` is a project-local module made importable by the
# sys.path.append('..') above; presumably `dataframe()` returns a pandas
# DataFrame (the cells below index it by column name) — confirm.
db = data.Database()
df = db.dataframe()
# Preview the first rows as a quick sanity check of the loaded columns.
df.head()

Unnamed: 0,Name,Type,Level,Rarity,Damage,Health,Energy,Sanity,Timestamp
0,Wyvern,Dragon,3,Rank 1,3d4,10.85,11.21,12.16,2023-11-20 10:16:06
1,Dust Mephit,Elemental,10,Rank 0,10d2+4,20.9,20.7,20.44,2023-11-20 10:16:06
2,Pseudodragon,Dragon,11,Rank 3,11d8+2,90.31,85.06,86.75,2023-11-20 10:16:06
3,Efreeti,Elemental,12,Rank 3,12d8,98.21,98.76,97.51,2023-11-20 10:16:06
4,Copper Drake,Dragon,3,Rank 4,3d10+3,27.67,33.02,31.43,2023-11-20 10:16:06


In [3]:
# Function to turn a dice-notation damage string into a numeric value.
def parse_damage(damage_string, rng=None):
    """Roll a dice-notation damage string and return the numeric total.

    The string must start with ``<N>d<S>`` optionally followed by a signed
    modifier, e.g. ``"3d4"``, ``"10d2+4"`` or ``"2d6-1"``: ``N`` dice with
    ``S`` sides are rolled, summed, and the modifier is added. Matching is
    anchored at the start of the string only, so trailing text is ignored.

    Parameters
    ----------
    damage_string : str
        Dice expression. Non-string values (``None``, ``NaN`` from a missing
        cell, plain numbers) are treated as an invalid format.
    rng : random.Random, optional
        Source of randomness. Defaults to the module-level ``random``
        generator, so ``random.seed(...)`` still controls the rolls; pass a
        seeded ``random.Random`` for an isolated, reproducible stream.

    Returns
    -------
    int
        The rolled total, or ``0`` when the input cannot be rolled.
    """
    # Missing/NaN cells reach this function through ``Series.apply``;
    # ``re.match`` would raise TypeError on them.
    if not isinstance(damage_string, str):
        return 0

    match = re.match(r'(\d+)d(\d+)([+-]\d+)?', damage_string)
    if match is None:
        # Return a default value (0) for invalid formats
        return 0

    num_dice, die_type, modifier = match.groups()
    num_dice, die_type = int(num_dice), int(die_type)
    modifier = int(modifier) if modifier else 0  # modifier group is optional

    # A zero-sided die cannot be rolled: randint(1, 0) raises ValueError.
    if die_type < 1:
        return 0

    roller = random if rng is None else rng
    return sum(roller.randint(1, die_type) for _ in range(num_dice)) + modifier


# parse_damage rolls dice with the module-level `random` generator, so seed it
# first: otherwise N_Damage (and every model score that depends on it) changes
# on each run even though the models themselves use random_state=42.
random.seed(42)

# Roll every "Damage" dice expression into a numeric feature column
df['N_Damage'] = df['Damage'].apply(parse_damage)

# Display the columns the models will work with
df[['Level', 'N_Damage', 'Health', 'Energy', 'Sanity', "Rarity"]]

Unnamed: 0,Level,N_Damage,Health,Energy,Sanity,Rarity
0,3,5,10.85,11.21,12.16,Rank 1
1,10,20,20.90,20.70,20.44,Rank 0
2,11,62,90.31,85.06,86.75,Rank 3
3,12,41,98.21,98.76,97.51,Rank 3
4,3,17,27.67,33.02,31.43,Rank 4
...,...,...,...,...,...,...
995,8,30,48.51,49.77,47.92,Rank 2
996,2,4,3.85,4.19,4.03,Rank 0
997,2,5,4.68,4.80,3.45,Rank 0
998,5,23,39.51,42.54,37.32,Rank 3


## Testing model 

In [4]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [5]:
# Encode the target y (Rarity) as integer class codes.
# The column is overwritten in place, so `df['Rarity']` holds the codes
# (not the original "Rank N" strings) from this cell onwards.
enc = OrdinalEncoder()
rarity_codes = enc.fit_transform(df[['Rarity']])
df['Rarity'] = rarity_codes.astype(int)

## Split Data 

In [6]:
# Feature matrix and target vector
feature_columns = ['Level', 'N_Damage', 'Health', 'Energy', 'Sanity']
X, y = df[feature_columns], df['Rarity']

# Hold out 20% of the rows for testing; stratify on y so both splits keep
# the same Rarity class proportions
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the shapes of the resulting train and test sets
print(f'This is the shape of X_train: {X_train.shape}')
print(f'This is the shape of X_test: {X_test.shape}')
print('*' * 40)
print(f'This is the shape of y_train: {y_train.shape}')
print(f'This is the shape of y_test: {y_test.shape}')

This is the shape of X_train: (800, 5)
This is the shape of X_test: (200, 5)
****************************************
This is the shape of y_train: (800,)
This is the shape of y_test: (200,)


## Establish Baseline

In [7]:
# Majority-class baseline: the share of the most frequent Rarity class in the
# training set, i.e. the accuracy of always predicting that class.
print(f'Baseline: {y_train.value_counts(normalize=True).max():.2f}')

Baseline: 0.32


## Model 1 - Logistic Regression

In [8]:
# One-vs-rest logistic regression with its own internal 5-fold CV.
# NOTE(review): the features are fed in unscaled here, whereas the Random
# Forest and XGBoost pipelines below start with a StandardScaler — confirm
# this difference is intended.
lr_model = LogisticRegressionCV(cv=5, multi_class="ovr", random_state=42)

In [9]:
# Search space for the grid search: values for LogisticRegressionCV's
# `Cs` and `cv` arguments.
# NOTE(review): LogisticRegressionCV already cross-validates internally, so
# wrapping it in GridSearchCV nests two CV loops, and `cv` is a resampling
# setting rather than a model hyperparameter — confirm this is intended.
grid_params = dict(
    Cs=[6, 8, 10, 12, 14],
    cv=[2, 3, 5, 7, 10],
)

# Build the grid search around the logistic model, then fit it on the
# training split
lr_grid = GridSearchCV(estimator=lr_model, param_grid=grid_params)

lr_grid.fit(X_train, y_train)

In [10]:
# Score the best estimator found by the grid search on both splits and
# print the accuracy report
best_lr = lr_grid.best_estimator_
lr_train_accuracy = best_lr.score(X_train, y_train)
lr_test_accuracy = best_lr.score(X_test, y_test)

print(f'Baseline: {y_train.value_counts(normalize=True).max():.2f}')
print(f'Training Accuracy: {lr_train_accuracy:.2f}')
print(f'Test Accuracy: {lr_test_accuracy:.2f}')

Baseline: 0.32
Training Accuracy: 0.73
Test Accuracy: 0.74


## Model 2 - Random Forest

In [11]:
# Pipeline: standardise the features, then fit a Random Forest.
# NOTE(review): tree ensembles are generally insensitive to feature scaling,
# so the scaler is presumably kept only for symmetry with the XGBoost
# pipeline — confirm.
rf_steps = [
    ("scale", StandardScaler()),
    ("clf", RandomForestClassifier(random_state=42)),
]
rf_pipe = Pipeline(rf_steps)

# Grid of Random Forest hyperparameters; the `clf__` prefix routes each
# entry to the "clf" pipeline step
rf_param_dict = dict(
    clf__n_estimators=[100, 300],
    clf__max_depth=[None, 10, 20],
    clf__min_samples_split=[2, 3],
    clf__min_samples_leaf=[1, 2],
    clf__criterion=["gini", "entropy"],
)

# 5-fold grid search over the pipeline, fitted on the training split
rf_grid = GridSearchCV(estimator=rf_pipe, param_grid=rf_param_dict, cv=5)
rf_grid.fit(X_train, y_train)

In [12]:
# Score the best Random Forest pipeline on both splits and print the
# accuracy report
best_rf = rf_grid.best_estimator_
rf_train_accuracy = best_rf.score(X_train, y_train)
rf_test_accuracy = best_rf.score(X_test, y_test)

print(f'Baseline: {y_train.value_counts(normalize=True).max():.2f}')
print(f'Training Accuracy: {rf_train_accuracy:.2f}')
print(f'Test Accuracy: {rf_test_accuracy:.2f}')

Baseline: 0.32
Training Accuracy: 1.00
Test Accuracy: 0.98


In [85]:
# Hyperparameter combination selected by the Random Forest grid search
rf_grid.best_params_

{'clf__criterion': 'gini',
 'clf__max_depth': None,
 'clf__min_samples_leaf': 1,
 'clf__min_samples_split': 2,
 'clf__n_estimators': 300}

## Model 3 - XGBoost

In [48]:
# imblearn's Pipeline is needed because it accepts samplers such as SMOTE as
# steps and applies them only while fitting.
from imblearn.pipeline import Pipeline as ImbPipeline

# XGBoost pipeline.
# SMOTE is the first step so that, inside cross-validation, synthetic samples
# are generated from each training fold only. Oversampling the whole training
# set before GridSearchCV lets points interpolated from validation-fold rows
# leak into the training folds and inflates the CV scores.
xgb_pipe = ImbPipeline([
    ("smote", SMOTE(random_state=42)),
    ("scale", StandardScaler()),
    ("clf", xgb.XGBClassifier(objective="multi:softmax",
                              random_state=42))
])

# Param dict for XGBoost grid search (a single candidate: nothing is tuned,
# the search only provides the cross-validated fit/refit machinery)
xgb_param_dict = {
    "clf__n_estimators": [100],
    "clf__learning_rate": [0.3],
    "clf__subsample": [0.8],
    "clf__colsample_bytree": [1.0],
    "clf__reg_alpha": [0],
    "clf__reg_lambda": [0]
}

# Resampled copy of the training set. It is no longer used for fitting; it is
# kept because the accuracy report in the next cell scores the model against
# X_train_resampled / y_train_resampled.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Instantiate and fit the XGBoost grid search on the ORIGINAL training data;
# the pipeline oversamples internally, per fold, at fit time.
xgb_grid = GridSearchCV(xgb_pipe, xgb_param_dict, cv=5)
xgb_grid.fit(X_train, y_train)

In [50]:
# Score the best XGBoost pipeline and print the accuracy report.
# Training accuracy is measured on the SMOTE-resampled training set, test
# accuracy on the untouched test split.
best_xgb = xgb_grid.best_estimator_
xgb_train_accuracy = best_xgb.score(X_train_resampled, y_train_resampled)
xgb_test_accuracy = best_xgb.score(X_test, y_test)

print(f'Baseline: {y_train.value_counts(normalize=True).max():.2f}')
print(f'Training Accuracy: {xgb_train_accuracy:.2f}')
print(f'Test Accuracy: {xgb_test_accuracy:.2f}')

Baseline: 0.32
Training Accuracy: 1.00
Test Accuracy: 0.98


### Final observations: selecting a model based on accuracy

In [78]:
# Side-by-side accuracy summary for the three models.
# The baseline is the same for every model, so compute it once.
baseline_accuracy = y_train.value_counts(normalize=True).max()

model_scores = [
    ("Logistic Regresion", lr_train_accuracy, lr_test_accuracy),
    ("Random Forest", rf_train_accuracy, rf_test_accuracy),
    ("XGBoost", xgb_train_accuracy, xgb_test_accuracy),
]

for position, (model_name, train_accuracy, test_accuracy) in enumerate(model_scores):
    # Dashed separator between reports, but not before the first one
    if position > 0:
        print("-"*25)
    print(model_name)
    print("*"*20)
    print(f'Baseline: {baseline_accuracy:.2f}')
    print(f'Training Accuracy: {train_accuracy:.2f}')
    print(f'Test Accuracy: {test_accuracy:.2f}')

Logistic Regresion
********************
Baseline: 0.32
Training Accuracy: 0.73
Test Accuracy: 0.74
-------------------------
Random Forest
********************
Baseline: 0.32
Training Accuracy: 1.00
Test Accuracy: 0.98
-------------------------
XGBoost
********************
Baseline: 0.32
Training Accuracy: 1.00
Test Accuracy: 0.98


## Conclusion for:
- Logistic Regression
- Random Forest
- XGBoost.

##### Logistic Regression exhibited modest training and test accuracies of 0.73 and 0.74, respectively. While it displayed fair generalization, its performance fell short of the remarkable results achieved by the ensemble models.

##### The decision to choose the Random Forest model was underpinned by its exceptional test accuracy of 0.98, surpassing both the baseline and Logistic Regression. Despite a perfect training accuracy, indicating potential overfitting concerns, the model demonstrated robust generalization to new data, making it a compelling choice. Notably, Random Forest's computational efficiency, evident in its faster runtime compared to XGBoost on the identical test set, further validated its suitability for the classification task.

##### Similarly, XGBoost mirrored Random Forest's prowess with a test accuracy of 0.98. While both models performed exceptionally well on the test set, Random Forest's efficiency in terms of computational resources solidified its position as the final and optimal choice for this specific dataset and problem context.