# Loading packages

In [None]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

# Loading train and test sets

In [None]:
# Locations of the pre-computed feature tables for the training and test images.
TRAIN_CSV_PATH = '../input/glcm-training/glcm_training.csv'
TEST_CSV_PATH = '../input/siimglcmtest/glcm_test.csv'

# Read both tables into DataFrames.
train = pd.read_csv(TRAIN_CSV_PATH)
test = pd.read_csv(TEST_CSV_PATH)

### Previewing training and testing sets

In [None]:
# Preview the first rows of the training table (pandas shows five by default).
train.head()

In [None]:
# Preview the first rows of the test table (pandas shows five by default).
test.head()

### Determining distribution of missing values in train set

In [None]:
# Per-column share of missing entries in the training table, in percent.
row_count = len(train)
null_counts = train.isnull().sum()
percent_missing = null_counts * 100 / row_count

# One-column table so the percentages print with a readable header.
missing_vals_df = percent_missing.to_frame(name='Percent Missing')
print(missing_vals_df)

# Preprocessing

Encoding categorical variables and normalizing numeric features. Only min-max normalization of the numeric features is implemented below.

### Sectioning into categorical and numerical features

In [None]:
# Print column names, non-null counts and dtypes of the training table.
train.info()

In [None]:
# Columns treated as numerical: the single-letter feature columns 'a'..'o'
# followed by the 'target' column.
num_features = list('abcdefghijklmno') + ['target']
print("Numerical features:", num_features)

### Defining preprocessor

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler

class PreprocessTransformer(BaseEstimator, TransformerMixin):
    """Min-max scale a fixed set of numerical columns of a DataFrame.

    Parameters
    ----------
    num_features : list of str
        Names of the columns to scale. All other columns are passed through
        unchanged.

    Notes
    -----
    If ``fit`` has been called, ``transform`` reuses the min/max statistics
    learned there, so several frames (e.g. train and test) can be put on the
    same scale. If ``transform`` is called on an unfitted instance, the
    scaler is fitted on the frame being transformed, which is what callers
    that skip ``fit`` have always received.
    """

    def __init__(self, num_features):

        self.num_features = num_features

    def fit(self, X, y=None):
        """Learn per-column min/max from ``X[self.num_features]``.

        Returns ``self`` so calls can be chained.
        """
        columns = list(self.num_features)
        # Trailing underscore: scikit-learn convention for fitted state.
        self.scaler_ = MinMaxScaler().fit(X[columns])
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the numerical columns min-max scaled.

        ``X`` itself is not modified.
        """
        dataframe = X.copy()
        # Use the instance's own column list, not a module-level variable of
        # the same name, so the result does not depend on notebook state.
        columns = list(self.num_features)

        fitted_scaler = getattr(self, "scaler_", None)
        if fitted_scaler is None:
            # No prior fit(): scale using this frame's own min/max.
            scaled = MinMaxScaler().fit_transform(dataframe[columns])
        else:
            scaled = fitted_scaler.transform(dataframe[columns])

        dataframe[columns] = scaled
        return dataframe

### Preprocessing train set

In [None]:
# Min-max scale the columns listed in num_features (the list defined for the
# training data also contains 'target'). transform() is called directly,
# without a preceding fit() call.
train_processed = PreprocessTransformer(num_features).transform(X = train)

# Imputation of missing values is currently disabled; the original code is
# kept below for reference.
#train_noNan = pd.DataFrame(SimpleImputer().fit_transform(train_processed))
#train_noNan.columns = train_processed.columns
#
#train_noNan.head()

In [None]:
#percent_missing = train_noNan.isnull().sum() * 100 / len(train_noNan)
#missing_vals_df = pd.DataFrame({'Percent Missing': percent_missing})
#print(missing_vals_df)

# Random Forest Classifier

## Splitting into X (input) and y (output)

In [None]:
# Model inputs: every column except the label and the image identifier.
# drop() already returns a new frame, so no explicit copy is needed.
X_train = train_processed.drop(columns=['target', 'image_name'])

# Model output: the label column only. (A previous intermediate assignment
# of the whole frame to y_train was dead code and has been removed.)
y_train = train_processed['target'].copy()

X_train.head()

## Hyperparameter adjustment

In [None]:
# Candidate hyperparameter values sampled by the randomized search.
search_space = [
  {
     'max_depth': [10, 20, 30, 40, 50, 60, None],
     # 'auto' was removed from this list: for classifiers it was an alias of
     # 'sqrt', it was deprecated in scikit-learn 1.1 and is rejected from 1.3
     # on. Add 'log2' or a float fraction here to widen the search.
     'max_features': ['sqrt'],
     'min_samples_leaf': [1, 2, 4],
     'min_samples_split': [2, 5, 10],
     'n_estimators': [200, 400, 600, 800, 1000]
  }
]

# Stratified folds keep the class ratio of the target in every split.
cv_method = StratifiedKFold(n_splits=4, shuffle = True, random_state=0)

# Use the built-in 'roc_auc' scorer: it ranks samples by predicted
# probability. make_scorer(roc_auc_score) with default arguments would score
# the hard 0/1 output of predict() instead, which is not a true ROC AUC.
scoring = {'AUC': 'roc_auc'}

## Fitting random forest classifier to train set

In [None]:
# Randomized search over search_space, scored by cross-validated AUC.
# Both the forest and the candidate sampling are seeded so that reruns give
# the same result, matching the seeded CV splitter.
optimizer = RandomizedSearchCV(
  estimator = RandomForestClassifier(random_state=0),
  param_distributions=search_space,
  cv=cv_method,
  scoring=scoring,
  refit='AUC',            # refit the best-AUC candidate on the full train set
  return_train_score = True,
  verbose=1,
  n_iter = 50,
  n_jobs=-1,              # evaluate candidates on all available CPU cores
  random_state=0,
)

# Long-running step. scikit-learn's random forest runs on the CPU only, so a
# GPU does not speed this up.
rf_model = optimizer.fit(X_train, y_train)

In [None]:
# Average, over all sampled hyperparameter candidates, of each candidate's
# mean cross-validated test AUC.
candidate_mean_auc = optimizer.cv_results_['mean_test_AUC']
candidate_mean_auc.mean()

In [None]:
# Display the hyperparameter combination selected by the search
# (the candidate that was refit, i.e. the best one for the 'AUC' metric).
optimizer.best_params_

## Displaying feature importance

In [None]:
# Column names the model was trained on (reused later for prediction).
features = X_train.columns

# Pair every feature with its importance in the refit best forest and sort
# ascending by importance. The result is a list of (name, importance) tuples.
importances = optimizer.best_estimator_.feature_importances_
imp_dict = sorted(zip(features, importances), key=lambda pair: pair[1])
print(imp_dict)

# Bar chart of the sorted importances, one bar per feature.
feature_names, feature_scores = zip(*imp_dict)
plt.bar(feature_names, feature_scores)
plt.xticks(rotation="vertical")
plt.show()

## Predicting on test set

In [None]:
# Preview the first rows of the raw test table before preprocessing.
test.head()

In [None]:
# The image identifier is not a model input.
test_copy = test.drop(columns=['image_name'])

# Feature columns only; the 'target' entry (if present in num_features) is
# excluded because it is not selected from the test table.
feature_cols = [col for col in num_features if col != 'target']

# Scale the test features with the min/max of the TRAINING features. Fitting
# a fresh scaler on the test set would place it on a different scale from
# the one the forest's split thresholds were learned on.
feature_scaler = MinMaxScaler().fit(train[feature_cols])
test_processed = test_copy.copy()
test_processed[feature_cols] = feature_scaler.transform(test_copy[feature_cols])

# Fill any remaining missing values (SimpleImputer defaults to column means).
test_noNan = pd.DataFrame(SimpleImputer().fit_transform(test_processed))
test_noNan.columns = test_processed.columns

test_processed.head()

In [None]:
# Predict class probabilities on the imputed test features. The imputed frame
# (test_noNan) was previously built but never used; predicting on it avoids
# failures or undefined behaviour when the test set contains missing values.
# Columns are selected in the same order the model was trained on.
y_pred = rf_model.predict_proba(test_noNan[features])
pd.DataFrame(y_pred).head()

In [None]:
# Second probability column of every prediction row.
# NOTE(review): presumably the positive ("malignant") class — confirm
# against rf_model.classes_.
y_pred_malignant = list(y_pred[:, 1])

In [None]:
# Assemble the submission table: one row per test image with its predicted
# probability, then write it to disk without the index column.
submission = pd.DataFrame({
    'image_name': test.image_name.values,
    'target': y_pred_malignant,
})

submission.to_csv('submission.csv', index=False)
submission.head()