In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Data fields

id - id of the creature

bone_length - average length of bone in the creature, normalized between 0 and 1

rotting_flesh - percentage of rotting flesh in the creature

hair_length - average hair length, normalized between 0 and 1

has_soul - percentage of soul in the creature

color - dominant color of the creature: 'white','black','clear','blue','green','blood'

type - target variable: 'Ghost', 'Goblin', and 'Ghoul'

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

import missingno as ms

In [None]:
# Load the labelled training set from the competition's zipped CSV and preview the first rows.
train = pd.read_csv('/kaggle/input/ghouls-goblins-and-ghosts-boo/train.csv.zip')
train.head()

In [None]:
# Load the test set (the rows predicted for the submission) and preview the first rows.
test = pd.read_csv('/kaggle/input/ghouls-goblins-and-ghosts-boo/test.csv.zip')
test.head()

In [None]:
# Load the sample submission to inspect the expected output format.
# NOTE: the name `sub` is rebound later in the notebook when the real submission is built.
sub = pd.read_csv('/kaggle/input/ghouls-goblins-and-ghosts-boo/sample_submission.csv.zip')
sub.head()

All the datasets have been loaded into their respective dataframes

Now let us get started with the EDA

In [None]:
import pandas_profiling as pp

In [None]:
train.shape # 371 rows of training data

In [None]:
test.shape

In [None]:
train.describe()

The table above gives us a first idea of what our training data looks like.

In [None]:
# Build an automated profiling report for the training frame and save it as an HTML file
# in the current working directory.
report = pp.ProfileReport(train) # best way in my opinion to get started with analyzing the dataset
report.to_file('EDA_ghoul.html')

In [None]:
train.columns

In [None]:
train.isnull().sum() # Number of null values in each column

In [None]:
ms.matrix(train) # Good way to quickly check which columns have null values

The above analysis suggests that there are no missing values in the training dataset.

In [None]:
train.info() # gives the number of non null values and the data types of column

In [None]:
# Function to find the number of missing values in each column
def missing(df):
    """Count the missing values in every column of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    dict
        Maps each column label to the number of null (NaN/None/NaT) entries
        in that column, as plain Python ints.
    """
    # isna().sum() counts nulls directly, instead of inferring them as
    # "row count minus number of counted values" column by column.
    null_counts = df.isna().sum()
    return {col: int(count) for col, count in null_counts.items()}
missing(train)

In [None]:
# The id column is only a row identifier, so the analysis works on a copy without it.
df = train.drop(columns=['id'])
df.head()

## 1) bone_length

In [None]:
missing(train)['bone_length']

In [None]:
sns.set(style='darkgrid')
# sns.distplot is deprecated (and scheduled for removal); histplot with a KDE overlay on a
# density scale is its replacement and draws the same kind of figure.
sns.histplot(x=train.bone_length, bins=10, kde=True, stat='density')

We can see that bone_length is approximately normally distributed.

Most of the creatures have a normalized bone length between 0.4 and 0.6, with fewer creatures having very long or very short bones.

In [None]:
# Three-sigma band for bone_length: values outside [ll, ul] are treated as outliers.
bl_mean, bl_std = train['bone_length'].mean(), train['bone_length'].std()
ll, ul = bl_mean - 3 * bl_std, bl_mean + 3 * bl_std

In [None]:
# Number of rows whose bone_length lies above ul or below ll.
int((train['bone_length'].gt(ul) | train['bone_length'].lt(ll)).sum())

Looks like the 'bone_length' column has no outliers

In [None]:
# Pass x/y by keyword: recent seaborn releases no longer accept them positionally.
sns.boxplot(data=train, x='type', y='bone_length')

The above box plot shows that the bone length is an important feature to determine the type of creature we are dealing with as:

Ghouls tend to have longer bones than goblins which have longer bones than ghosts

## 2) rotting_flesh

In [None]:
missing(train)['rotting_flesh']

In [None]:
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale replaces it.
sns.histplot(x=train.rotting_flesh, kde=True, stat='density')

In [None]:
# Pass x/y by keyword: recent seaborn releases no longer accept them positionally.
sns.boxplot(data=train, x='type', y='rotting_flesh')

Ghosts tend to have a higher percentage of rotting flesh compared to other creatures

In [None]:
# Three-sigma band for rotting_flesh: values outside [ll, ul] are treated as outliers.
rf_mean, rf_std = train['rotting_flesh'].mean(), train['rotting_flesh'].std()
ll, ul = rf_mean - 3 * rf_std, rf_mean + 3 * rf_std

In [None]:
# Number of rows whose rotting_flesh lies above ul or below ll.
int((train['rotting_flesh'].gt(ul) | train['rotting_flesh'].lt(ll)).sum())

No outliers in the 'rotting_flesh' column too

## 3) Hair_length

In [None]:
missing(train)['hair_length']

In [None]:
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale replaces it.
sns.histplot(x=train.hair_length, kde=True, stat='density')

In [None]:
# Three-sigma band for hair_length: values outside [ll, ul] are treated as outliers.
hl_mean, hl_std = train['hair_length'].mean(), train['hair_length'].std()
ll, ul = hl_mean - 3 * hl_std, hl_mean + 3 * hl_std

In [None]:
# Number of rows whose hair_length lies above ul or below ll.
int((train['hair_length'].gt(ul) | train['hair_length'].lt(ll)).sum())

In [None]:
# Pass x/y by keyword: recent seaborn releases no longer accept them positionally.
sns.boxplot(data=train, x='type', y='hair_length')

The boxplot depicts clearly that the ghouls have longer hair than goblins which have longer hair than Ghosts

## 4) Has_Soul

In [None]:
missing(train)['has_soul']

In [None]:
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale replaces it.
sns.histplot(x=train.has_soul, kde=True, stat='density')

Turns out, these beasts aren't soulless after all

In [None]:
# Three-sigma band for has_soul: values outside [ll, ul] are treated as outliers.
hs_mean, hs_std = train['has_soul'].mean(), train['has_soul'].std()
ll, ul = hs_mean - 3 * hs_std, hs_mean + 3 * hs_std

In [None]:
# Number of rows whose has_soul lies above ul or below ll.
int((train['has_soul'].gt(ul) | train['has_soul'].lt(ll)).sum())

In [None]:
# Pass x/y by keyword: recent seaborn releases no longer accept them positionally.
sns.boxplot(data=train, x='type', y='has_soul')

We see that ghosts have the least amount of soul in them, while ghouls have quite a high percentage of soul.

## 5) Color (Categorical)

In [None]:
missing(train)['color']

In [None]:
train.color.value_counts()

### Creature wise distribution of color

In [None]:
# Share of each color among the creatures labelled 'Ghoul'.
train.loc[train['type'] == 'Ghoul', 'color'].value_counts(normalize=True)

In [None]:
# Share of each color among the creatures labelled 'Goblin'.
train.loc[train['type'] == 'Goblin', 'color'].value_counts(normalize=True)

In [None]:
# Share of each color among the creatures labelled 'Ghost'.
train.loc[train['type'] == 'Ghost', 'color'].value_counts(normalize=True)

In [None]:
# Name the axis variable explicitly: in recent seaborn the first positional argument is
# `data`, so a bare Series is no longer interpreted as the x variable.
sns.countplot(data=train, x='color')

A large proportion of creatures are either white or clear (transparent).

In [None]:
# Name the axis variable explicitly: in recent seaborn the first positional argument is
# `data`, so a bare Series is no longer interpreted as the x variable.
sns.countplot(data=train, x='color', hue='type')

The above plot gives us a clear visualization of how the different creatures are distributed across the various color categories.

## 6) Type

In [None]:
train.type.value_counts()

In [None]:
# Correlate only the numeric columns. Recent pandas versions no longer drop the string
# columns ('color', 'type') silently, so calling corr() on the full frame raises.
c = train.select_dtypes(include='number').corr()
sns.heatmap(c, annot=True)

In [None]:
# Keep only coefficients >= 0.5 or <= -0.4; every other cell becomes NaN before plotting.
sns.heatmap(c.where((c >= 0.5) | (c <= -0.4)), annot=True)

The correlation matrix (shown as a heatmap) gives us some idea about how the different numeric features are correlated with each other.

Here we do not see much correlation between features.

In [None]:
df.head()

We have a nominal categorical variable in our dataset (color), which we need to encode before training models for prediction.

As this variable is not ordinal, we will use one-hot encoding.

In [None]:
# One-hot encode the nominal `color` feature. Read it from `train` (same rows and index as
# `df`) so this cell still works after `color` has been dropped from `df` further down.
color_ohe = pd.get_dummies(train['color'])
color_ohe.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the creature type as integers. fit_transform already fits the encoder, so a separate
# fit() call is unnecessary. `le` is kept so predictions can be decoded back to names later.
le = LabelEncoder()
tg = pd.DataFrame({'target': le.fit_transform(train['type'])})
tg.head()

In [None]:
# Drop the raw `color` column now that it has been one-hot encoded. Rebinding (instead of
# inplace=True) together with errors='ignore' keeps this cell safe to re-run.
df = df.drop(columns='color', errors='ignore')
df.head()

In [None]:
# Feature matrix: numeric features plus the one-hot color columns, without the target.
tr = pd.concat([df, color_ohe], axis=1).drop(columns=['type'])
tr.head()

We are done with the basic EDA and the insights we obtained are:

1) Ghouls tend to have longer bones than goblins which have longer bones than ghosts

2) Ghosts tend to have a higher percentage of rotting flesh compared to other creatures

3) Ghouls have longer hair than goblins which in turn have longer hair than Ghosts

4) Ghosts have the least amount of soul in them, while ghouls have quite a high percentage of soul

5) A large proportion of creatures are either white or clear (transparent)

The data was very clean in this case, with no null values and no strong correlation between columns, so the analysis was pretty simple.

But that is not the case all the time.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows. A fixed seed makes the split reproducible across runs, and
# stratifying on the target keeps the class proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    tr, tg, test_size=0.2, random_state=42, stratify=tg['target']
)

X_train.shape, X_test.shape

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Hyper-parameter grid for logistic regression. The grid is split per solver because
# 'lbfgs' does not support the 'l1' penalty: a single combined grid would schedule
# invalid (solver, penalty) pairs whose fits fail and only produce NaN scores.
params = [
    {
        'C': [0.25, 0.5, 0.75, 1, 2],
        'solver': ['liblinear'],
        'penalty': ['l1', 'l2']
    },
    {
        'C': [0.25, 0.5, 0.75, 1, 2],
        'solver': ['lbfgs'],
        'penalty': ['l2']
    },
]

lr = LogisticRegression(max_iter = 5000)
clf = GridSearchCV(lr, params, cv = 5)
# Pass the target as a 1-D Series; scikit-learn expects y of shape (n_samples,).
clf.fit(tr, tg['target'])

In [None]:
# Report the winning parameter combination and its cross-validated score.
# `clf` is the logistic-regression search fitted in the previous cell; the name is reused
# by the later searches, so run this right after that fit.
print('The best parameters for Logistic Regression are:',clf.best_params_)
print('The score is:', clf.best_score_)


In [None]:
# Hyper-parameter grid for the support vector classifier.
params = {
    'C': [0.25, 0.5, 0.75, 1, 2],
    'kernel': ['rbf', 'poly', 'linear']
}

svm = SVC()
clf = GridSearchCV(svm, params, cv = 5)
# Pass the target as a 1-D Series; scikit-learn expects y of shape (n_samples,).
clf.fit(tr, tg['target'])

In [None]:
# Report the winning parameter combination and its cross-validated score.
# `clf` is the SVC search fitted in the previous cell; the name is reused by the other
# searches, so run this right after that fit.
print('The best parameters for svm are:',clf.best_params_)
print('The score is:', clf.best_score_)

In [None]:
# Hyper-parameter grid for the random forest.
params = {
    'n_estimators': [10,20,30,50],
    'criterion': ['gini', 'entropy']
}

# A fixed random_state makes the forest, and therefore the selected parameters and score,
# reproducible between runs.
rf = RandomForestClassifier(random_state = 42)
clf = GridSearchCV(rf, params, cv = 5)
# Pass the target as a 1-D Series; scikit-learn expects y of shape (n_samples,).
clf.fit(tr, tg['target'])

In [None]:
# Report the winning parameter combination and its cross-validated score.
# `clf` is the random-forest search fitted in the previous cell; the name is shared with
# the earlier searches, so run this right after that fit.
print('The best parameters for RandomForestClassifier are:',clf.best_params_)
print('The score is:', clf.best_score_)

In [None]:
test.head()

In [None]:
# Test features without the id column (the ids are kept in `test` for the submission).
t = test.drop(columns=['id'])

In [None]:
# One-hot encode the test colors. Reading from `test` and rebinding `t` (instead of an
# in-place drop) keeps this cell safe to re-run.
col_ohe = pd.get_dummies(test['color'])
t = t.drop(columns='color', errors='ignore')
te = pd.concat([t, col_ohe], axis = 1)
# Align the test matrix to the training matrix: same columns in the same order, with any
# color absent from the test set added as an all-zero column. Encoding train and test
# separately otherwise risks a column mismatch at predict time.
te = te.reindex(columns = tr.columns, fill_value = 0)
te.head()

In [None]:
tr.head()

In [None]:
# Final model: logistic regression with fixed hyper-parameters, trained on all training rows.
lr = LogisticRegression(C = 2, solver = 'liblinear', penalty = 'l1')
# Pass the target as a 1-D Series; scikit-learn expects y of shape (n_samples,).
lr.fit(tr, tg['target'])
te_pred = lr.predict(te)

In [None]:
# Submission frame: the test ids alongside the (still integer-encoded) predictions.
sub = test[['id']].assign(type=te_pred)

In [None]:
# Decode the integer predictions back to creature names with the fitted LabelEncoder rather
# than a hand-written mapping that must be kept in sync with the encoder's class order.
# Assigning the column, instead of calling replace(..., inplace=True) on `sub['type']`,
# also guarantees that `sub` itself is updated under pandas Copy-on-Write.
sub['type'] = le.inverse_transform(te_pred)
sub.head()

In [None]:
# Write the submission to the working directory; index=False leaves out the DataFrame index.
sub.to_csv('submission.csv', index = False)