In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)    
    
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
#!pip install sweetviz
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition files. Each frame gets a `train_test` flag (1 = train, 0 = test)
# so the two sets can be told apart, and split again, after being stacked into `all_data`.
training_og = pd.read_csv('/kaggle/input/ghouls-goblins-and-ghosts-boo/train.csv.zip').assign(train_test=1)
test_og = pd.read_csv('/kaggle/input/ghouls-goblins-and-ghosts-boo/test.csv.zip').assign(train_test=0)
sample = pd.read_csv('/kaggle/input/ghouls-goblins-and-ghosts-boo/sample_submission.csv.zip')

all_data = pd.concat((training_og, test_og))

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Display the column index of the stacked train + test frame.
all_data.columns

# Light Data Exploration
 
1) For numeric data

    Make histograms to understand distributions
    Corrplot
    Pivot table comparing the target (creature type) across numeric variables

2) For Categorical Data

    Make bar charts to understand balance of classes
    Make pivot tables to understand relationship with target

In [None]:
#import sweetviz as sv
#data_report = sv.analyze(training)
#data_report.show_html('test.html')
#from IPython.display import IFrame
#IFrame(src = 'test.html',width=1000,height=600)

In [None]:
# Numeric feature columns (these are plotted, log-transformed and scaled below).
num_cols = [
    'bone_length',
    'rotting_flesh',
    'hair_length',
    'has_soul',
]
# String-valued columns; note that 'type' is the prediction target, not a feature.
cat_cols = [
    'color',
    'type',
]

In [None]:
# One histogram per numeric feature of the training set, each shown as its own figure.
for feature, values in training_og[num_cols].items():
    plt.hist(values)
    plt.title(feature)
    plt.show()

In [None]:
# Pairwise correlations between the numeric features: printed table first, heatmap second.
numeric_corr = training_og[num_cols].corr()
print(numeric_corr)
sns.heatmap(numeric_corr)

In [None]:
# Compare the target ('type') with the categorical feature ('color').
# A cross-tabulation yields a single frequency table. The previous
# pivot_table(..., aggfunc='count') call had no `values` argument, so it repeated
# the same counts once for every remaining column of the frame.
pd.crosstab(training_og['type'], training_og['color'])

# Feature Engineering

In [None]:
# The 'id' column is only a row identifier; keeping it as a feature misguides the
# prediction model, so it is removed from both working frames. The *_og frames
# keep it untouched.
training = training_og.drop(columns=['id'])
test = test_og.drop(columns=['id'])

In [None]:
# Natural-log transform of the numeric columns, computed separately for train and test.
import numpy as np

log_training_num_cols = training[num_cols].apply(np.log)
print(log_training_num_cols.hist())

log_test_num_cols = test[num_cols].apply(np.log)
# The test-set histograms are not drawn: the attempted call
#   print(log_test_num_cols.hist().hist())
# raised an error. NOTE(review): presumably because .hist() was chained twice
# and/or because the logged test data contains -inf values (handled further
# down) - confirm.

In [None]:
# Build log-scaled twins of the training and test frames: start from a copy of each
# frame, then overwrite only the numeric columns with their log-transformed values.
log_training = training.copy()
log_test = test.copy()

for frame, logged in ((log_training, log_training_num_cols), (log_test, log_test_num_cols)):
    for col in num_cols:
        frame[col] = logged[col].to_numpy()

In [None]:
# TRAINING DF
# Encode the target labels as integers. `label_enc` is kept so that predictions can
# be mapped back to the original class names later on.
label_enc = LabelEncoder()
y = label_enc.fit_transform(log_training['type'])

# Remove the target and the train/test marker from both feature frames, in place,
# leaving only predictor columns.
for frame in (log_training, training):
    frame.drop(columns=['train_test', 'type'], inplace=True)

# Aliases used by the modelling cells: log-transformed and untouched feature sets.
X_log = log_training
X = training

In [None]:
# TEST DF
# Remove the train/test marker from both test frames (log-transformed and untouched).
# drop(..., errors='ignore') makes the cell safe to re-run: pop() raised a KeyError
# the second time, and as the last expression it also echoed the removed column.
for frame in (log_test, test):
    frame.drop(columns=['train_test'], inplace=True, errors='ignore')


# DATASETS ARE READY, we can start modeling

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import xgboost as xgb

In [None]:
# Column-wise preprocessing shared by every model:
#   * 'color' is one-hot encoded (categories not seen during fit are ignored),
#   * the four numeric columns are standardised,
#   * any other column is passed through unchanged.
oh = OneHotEncoder(handle_unknown='ignore')
sc = StandardScaler()

numeric_features = ['bone_length', 'rotting_flesh', 'hair_length', 'has_soul']
preprocessor = ColumnTransformer(
    [('oh', oh, ['color']), ('scal', sc, numeric_features)],
    remainder='passthrough',
)

In [None]:
# Candidate classifiers for the cross-validation comparison, all with
# library-default hyperparameters.
svc = SVC()
rf = RandomForestClassifier()
knn = KNeighborsClassifier()
nb = GaussianNB()
# Keep the classifier under its own name: rebinding `xgb` would shadow the imported
# xgboost module, so running this cell a second time would fail on
# `xgb.XGBClassifier`.
xgb_clf = xgb.XGBClassifier()

models = [svc, rf, knn, nb, xgb_clf]
# Filled by the next cell: estimator -> mean cross-validation score.
scores = {}

In [None]:
# Mean 10-fold cross-validation score of every candidate model on the non-logged
# features. Each model is wrapped in a pipeline together with the preprocessor.
for estimator in models:
    candidate = Pipeline([('preprocessor', preprocessor), ('model', estimator)])
    scores[estimator] = cross_val_score(candidate, X, y, cv=10).mean()

scores

In [None]:
# Same comparison as for the non-logged data, but on the log-transformed features:
# mean 10-fold cross-validation score per candidate model.
scores_log = {}

for estimator in models:
    candidate = Pipeline([('preprocessor', preprocessor), ('model', estimator)])
    scores_log[estimator] = cross_val_score(candidate, X_log, y, cv=10).mean()

scores_log

* SVC (non-logged) and RandomForestClassifier (logged) are performing best.
* Approx scores: 0.71. 

Transforming columns (e.g. scaler, imputer, etc) outputs a *numpy array*, rather than a dataframe, where the column names have been removed and the columns have changed place!

To get a 'normal' tabular view again, wrap the transformed array in a `pd.DataFrame`.

In [None]:
# Preview the first rows of the untouched feature frame (still a DataFrame at this point).
X.head()

In [None]:
# Preview the first rows of the log-transformed feature frame (still a DataFrame at this point).
X_log.head()

In [None]:
# Fit the shared preprocessor and materialise the transformed training matrices.
# The DataFrames `training` / `log_training` are used as the source (they are the
# objects X / X_log were bound to), so this cell can be re-run. Feeding the
# already-transformed arrays back in would fail because the column names are gone.
# After this cell the preprocessor is left fitted on the log-transformed frame.
X = preprocessor.fit_transform(training)
X_log = preprocessor.fit_transform(log_training)

In [None]:
# Parameter tuning for the SVC: randomized search over C / kernel / gamma on the
# preprocessed, log-transformed training matrix.

svc = SVC()
parameter_grid = {'C': [0.001, 0.01, 0.1, 1, 10],
    'kernel' :  ['linear', 'rbf', 'sigmoid'],
    'gamma' : [0.001, 0.01, 0.1, 1]}

folds = 3        # number of stratified CV folds
param_comb = 5   # number of parameter combinations sampled by the search

skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001)

# The target has three classes. The plain 'roc_auc' scorer only supports binary
# targets, so with it every candidate ends up with a NaN score or the search errors
# out (depending on the scikit-learn version), and the reported "best" estimator is
# arbitrary. Accuracy is well defined for multi-class targets and matches the
# metric used by cross_val_score above.
# The splitter object is passed directly (instead of a one-shot generator from
# skf.split) so the search object stays reusable.
random_search = RandomizedSearchCV(
    svc,
    param_distributions=parameter_grid,
    n_iter=param_comb,
    scoring='accuracy',
    n_jobs=-1,
    cv=skf,
    verbose=3,
    random_state=1001,
)

# Here we go
grid_result = random_search.fit(X_log, y)
print(grid_result.best_estimator_)
print(grid_result.best_score_)



Output of transformer is numpy.ndarray, not DataFrame! 

best estimator for both log and non-log datasets: 


In [None]:
# Parameter tuning for the random forest: randomized search on the preprocessed,
# non-logged training matrix. `rf`, `folds` and `param_comb` come from earlier cells.
# NOTE(review): integer max_features values must fit the number of columns in X -
# verify that the larger values (10, 12) are valid for the installed scikit-learn.
params = {
        'n_estimators': [10, 100, 300, 700, 2000],
        'max_depth':[None, 5, 10, 20],
        'min_samples_leaf': [1, 2, 3, 4, 5, 6],
        'max_features': [5, 8, 10, 12, None]
        }


skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001)

# The target has three classes; the plain 'roc_auc' scorer only supports binary
# targets, so it cannot rank the candidates here. Accuracy is used instead.
# The splitter object is passed directly rather than a one-shot generator.
random_search = RandomizedSearchCV(
    rf,
    param_distributions=params,
    n_iter=param_comb,
    scoring='accuracy',
    n_jobs=-1,
    cv=skf,
    verbose=3,
    random_state=1001,
)

# Here we go
grid_result = random_search.fit(X, y)
print(grid_result.best_estimator_)
print(grid_result.best_score_)


best estimator for both log and non-log datasets: 

RandomForestClassifier(max_depth=5, max_features=8, min_samples_leaf=2,
                       n_estimators=2000)

In [None]:
# Evaluate random forest on non-logged training df (single 67/33 hold-out split).

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=42)

# NOTE: y holds label-encoded class ids, so only accuracy_score is directly
# interpretable; the three regression metrics treat the ids as ordered numbers.
metrics = [mean_absolute_error, mean_squared_error, r2_score, accuracy_score]

rf = RandomForestClassifier(max_depth=5, max_features=8, min_samples_leaf=2, n_estimators=2000)
rf.fit(X_train, y_train)
preds = rf.predict(X_valid)
for metric in metrics:
    # Print the metric's name; formatting the function object itself would show
    # "<function ... at 0x...>".
    print(f'random forest score (non-log) {metric.__name__}: {metric(y_valid, preds)}')

In [None]:
# Evaluate random forest on logged training df (single 67/33 hold-out split).

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_train_log, X_valid_log, y_train_log, y_valid_log = train_test_split(X_log, y, test_size=0.33, random_state=42)

# NOTE: y holds label-encoded class ids, so only accuracy_score is directly
# interpretable; the three regression metrics treat the ids as ordered numbers.
metrics = [mean_absolute_error, mean_squared_error, r2_score, accuracy_score]

rf = RandomForestClassifier(max_depth=5, max_features=8, min_samples_leaf=2, n_estimators=2000)
rf.fit(X_train_log, y_train_log)
preds = rf.predict(X_valid_log)
for metric in metrics:
    # Print the metric's name; formatting the function object itself would show
    # "<function ... at 0x...>".
    print(f'random forest score (log) {metric.__name__}: {metric(y_valid_log, preds)}')

In [None]:
# Evaluate svc on logged training df (single 67/33 hold-out split).

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_train_log, X_valid_log, y_train_log, y_valid_log = train_test_split(X_log, y, test_size=0.33, random_state=42)

# NOTE: y holds label-encoded class ids, so only accuracy_score is directly
# interpretable; the three regression metrics treat the ids as ordered numbers.
metrics = [mean_absolute_error, mean_squared_error, r2_score, accuracy_score]

svc = SVC(C=0.01, gamma=1)
svc.fit(X_train_log, y_train_log)
preds = svc.predict(X_valid_log)
for metric in metrics:
    # Print the metric's name; formatting the function object itself would show
    # "<function ... at 0x...>".
    print(f'svc score (log) {metric.__name__}: {metric(y_valid_log, preds)}')


In [None]:
# Evaluate svc on non-logged training df (single 67/33 hold-out split).

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=42)

# NOTE: y holds label-encoded class ids, so only accuracy_score is directly
# interpretable; the three regression metrics treat the ids as ordered numbers.
metrics = [mean_absolute_error, mean_squared_error, r2_score, accuracy_score]

svc = SVC(C=0.01, gamma=1)
svc.fit(X_train, y_train)
preds = svc.predict(X_valid)
for metric in metrics:
    # Print the metric's name; formatting the function object itself would show
    # "<function ... at 0x...>".
    print(f'svc score (non-log) {metric.__name__}: {metric(y_valid, preds)}')

In [None]:
# From an error encountered later, we find out there are -inf values in the log_test
# dataset, because of the log transform.
# Instantiate negative infinity value
negative_infinity = float('-inf')
# Localize these values: show every test row with at least one -inf in a numeric
# column. The mask is built per row on the numeric columns only; the previous
# version handed a 2-D mask over all columns (including the string 'color' column)
# to .iloc, which expects a row indexer.
log_test[log_test[num_cols].eq(negative_infinity).any(axis=1)]

In [None]:
# Impute every -inf in the numeric test columns with that column's median.
# The median is taken over the column as it stands, i.e. before the replacement.
columns = ['bone_length', 'rotting_flesh', 'hair_length', 'has_soul']
for col_name in columns:
    col_median = log_test[col_name].median()
    log_test[col_name] = log_test[col_name].replace([negative_infinity], col_median)

In [None]:
# Re-check after the replacement: this should now list no rows.
# Row-wise mask on the numeric columns instead of a 2-D mask passed to .iloc.
log_test[log_test[num_cols].eq(negative_infinity).any(axis=1)]

In [None]:
# Transform the test frames with the preprocessor fitted on the corresponding
# TRAINING frame. Calling fit_transform on the test data would re-estimate the
# scaler statistics and one-hot categories from the test set, so the features fed
# to the models would not match the ones they were trained on.
# `training` / `log_training` are the feature DataFrames behind X / X_log.
test = preprocessor.fit(training).transform(test)
log_test = preprocessor.fit(log_training).transform(log_test)

In [None]:
# Train the SVC on the full log-transformed training matrix and predict the test set.
# NOTE(review): C=0.01 is a very strong regularisation setting; if the predictions
# collapse to a single class, these hyperparameters are the first thing to revisit.
svc = SVC(C=0.01, gamma=1).fit(X_log, y)
svc_preds = svc.predict(log_test)

In [None]:
# The submission needs the class names as strings ('ghoul', 'ghost', 'goblin').
# The target y was label-encoded earlier, so the same fitted encoder is used to map
# the predicted integer codes back to the original labels.

new_svc_preds = label_enc.inverse_transform(svc_preds)

In [None]:
# Train the tuned random forest on the full log-transformed training matrix, predict
# the test set, and map the integer predictions back to the class names.
rf = RandomForestClassifier(
    max_depth=5,
    max_features=8,
    min_samples_leaf=2,
    n_estimators=2000,
).fit(X_log, y)

rf_preds = rf.predict(log_test)
new_rf_preds = label_enc.inverse_transform(rf_preds)

In [None]:
# Random-forest submission: ids taken from the sample submission, paired with the
# decoded predictions, then written to the working directory without the index.
submission_rf_log = sample[['id']].assign(type=new_rf_preds)
submission_rf_log.to_csv("submission_rf_log.csv", index=False)

submission_rf_log.head()



In [None]:
# SVC submission: ids taken from the sample submission, paired with the decoded SVC
# predictions, then written to the working directory without the index.
submission_csv_log = sample[['id']].assign(type=new_svc_preds)
submission_csv_log.to_csv("submission_csv_log.csv", index=False)

# Quick check of class diversity, if needed: submission_csv_log['type'].nunique()
submission_csv_log.head()


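Why is the SVC producing only 'ghoul' predictions? Likely cause (to be confirmed): with `C=0.01` and `gamma=1` the RBF-kernel SVC is regularised so heavily that it collapses to predicting a single class. Try the default `SVC()`, which scored about 0.71 in the cross-validation above, or re-tune the hyperparameters.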