# Introduction
The HCC dataset was obtained at a University Hospital in Portugal and contains several demographic, risk-factor, laboratory and overall-survival features of 165 real patients diagnosed with HCC. The dataset contains 49 features selected according to the EASL-EORTC (European Association for the Study of the Liver - European Organisation for Research and Treatment of Cancer) Clinical Practice Guidelines, which are the current state of the art in the management of HCC.

# Content:
[1. Load and Check Data](#1)
[1. Variable Description](#2)
    [1. Univariate Variable Analysis](#3)
        [1. Categorical Variable](#4)
        [1. Numerical Variable](#5)
[1. Basic Data Analysis](#6)
[1. Outlier Detection](#7)
[1. Missing Value](#8)
    [1. Find Missing Value](#9)
    [1. Fill Missing Value](#10)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names, so apply whichever name this
# installation provides (the default style stays active if neither exists).
for _style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid"):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
# plt.style.available lists every style name that can be passed to plt.style.use

import seaborn as sns
from collections import Counter

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of each file.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

 <a id='1'></a>
# Load and Check Data

In [None]:
# Read the HCC survival CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/hcc-survival-data-set/hcc.csv')

In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Summary statistics of the frame. The method must be called: a bare
# `df.describe` only displays the bound method object, not the statistics.
df.describe()

 <a id='2'></a>
# Variable Description

Gender: nominal
Symptoms: nominal
Alcohol: nominal
Hepatitis B Surface Antigen: nominal
Hepatitis B e Antigen: nominal
Hepatitis B Core Antibody: nominal
Hepatitis C Virus Antibody: nominal
Cirrhosis : nominal
Endemic Countries: nominal
Smoking: nominal
Diabetes: nominal
Obesity: nominal
Hemochromatosis: nominal
Arterial Hypertension: nominal
Chronic Renal Insufficiency: nominal
Human Immunodeficiency Virus: nominal
Nonalcoholic Steatohepatitis: nominal
Esophageal Varices: nominal
Splenomegaly: nominal
Portal Hypertension: nominal
Portal Vein Thrombosis: nominal
Liver Metastasis: nominal
Radiological Hallmark: nominal
Age at diagnosis: integer
Grams of Alcohol per day: continuous
Packs of cigarettes per year: continuous
Performance Status: ordinal
Encephalopathy degree: ordinal
Ascites degree: ordinal
International Normalised Ratio: continuous
Alpha-Fetoprotein (ng/mL): continuous
Haemoglobin (g/dL): continuous
Mean Corpuscular Volume (fl): continuous
Leukocytes(G/L): continuous
Platelets (G/L): continuous
Albumin (mg/dL): continuous
Total Bilirubin(mg/dL): continuous
Alanine transaminase (U/L): continuous
Aspartate transaminase (U/L): continuous
Gamma glutamyl transferase (U/L): continuous
Alkaline phosphatase (U/L): continuous
Total Proteins (g/dL): continuous
Creatinine (mg/dL): continuous
Number of Nodules: integer
Major dimension of nodule (cm): continuous
Direct Bilirubin (mg/dL): continuous
Iron (mcg/dL): continuous
Oxygen Saturation (%): continuous
Ferritin (ng/mL): continuous
Class: nominal (1 if patient survives, 0 if patient died)

In [None]:
# Print the per-column dtype and non-null count summary.
df.info()

 <a id='3'></a>
# Univariate Variable Analysis
* Categorical Variable: 1.Gen,2.Sym, 3.Alc, 4.HepB, 5.HepB, 6.HepB, 7.HepC, 8.Cir, 9.End, 10.Smo, 11.Dia, 12.Obe, 13.Hem, 14.Art, 15.CRen, 16.HIV, 17.Non, 18.EVar, 19.Spl, 20.PHyp, 21.Thr, 22.LMet, 23.Rad,  Class
* Numerical Variable: 24.Agedia, 25.Alcpd, 26.cigpy, 27.Sta, 28.Encdeg, 29.Ascdeg, 30.IntNorRat, 31.Alp, 32.Hae, 33.MCorVol, 34.Leu, 35.Plat, 36.Alb, 37.Bil, 38.Ala, 39.Aspa, 40.Gam, 41.Alk, 42.Prot, 43.Crea, 44.NNod, 45.dnod, 46.Bil, 47.Iro, 48.Oxy, 49.Fer

<a id='4'></a>
# Categorical Variable:

In [None]:
def bar_plot(variable):
    """
        Draw a frequency bar chart for one column of the global ``df``
        and print the count of each distinct value.

        input: variable -- column name, ex: "1.Gen"
        output: bar plot & value count
    """
    # occurrences of each distinct value in the column
    counts = df[variable].value_counts()
    labels = counts.index

    # one bar per distinct value, ticks labelled with the values themselves
    plt.figure(figsize=(9, 3))
    plt.bar(labels, counts)
    plt.xticks(labels, labels.values)
    plt.ylabel("Frequency")
    plt.title(variable)
    plt.show()

    print("{}: \n {}".format(variable, counts))

In [None]:
# First batch of categorical columns: one bar chart (plus printed counts) each.
category1 = [
    "1.Gen", "2.Sym", "3.Alc", "4.HepB", "5.HepB", "6.HepB",
    "7.HepC", "8.Cir", "9.End", "10.Smo", "11.Dia", "12.Obe",
]
for c in category1:
    bar_plot(c)

In [None]:
# Second batch of categorical columns: only the value counts are printed.
category2 = [
    "13.Hem", "14.Art", "15.CRen", "16.HIV", "17.Non", "18.EVar",
    "19.Spl", "20.PHyp", "21.Thr", "22.LMet", "23.Rad", "Class",
]
for c in category2:
    print("{}: \n".format(df[c].value_counts()))

<a id='5'></a>
# Numerical Variable:

In [None]:
def plot_hist(variable, bins=10):
    """
        Plot a histogram of one column of the global ``df``.

        input: variable -- column name, ex: "24.Agedia"
               bins     -- number of histogram bins (default 10); raise it
                           to examine the distribution in more detail
        output: histogram figure
    """
    # The notebook replaces the '?' placeholders with NaN only in a later
    # cell, so a column may still hold text here. Coerce to numbers and drop
    # the missing entries so the histogram is computed on real values only.
    values = pd.to_numeric(df[variable], errors="coerce").dropna()

    plt.figure(figsize=(9,3))
    plt.hist(values, bins=bins)
    plt.xlabel(variable)
    plt.ylabel("Frequency")
    plt.title("{} distribution with hist".format(variable))
    plt.show()

In [None]:
# First batch of numerical columns: one histogram each.
numericVar = [
    "24.Agedia", "25.Alcpd", "26.cigpy",
    "27.Sta", "28.Encdeg", "29.Ascdeg",
]
for n in numericVar:
    plot_hist(n)

In [None]:
# Second batch of numerical columns: one histogram each.
numericVar2 = [
    "30.IntNorRat", "31.Alp", "32.Hae", "33.MCorVol", "34.Leu",
    "35.Plat", "36.Alb", "37.Bil", "38.Ala", "39.Aspa",
    "40.Gam", "41.Alk", "42.Prot", "43.Crea", "44.NNod",
    "45.dnod", "46.Bil", "47.Iro", "48.Oxy", "49.Fer",
]
# Iterate numericVar2 itself; looping over numericVar here would only
# redraw the first batch and never plot these columns.
for n in numericVar2:
    plot_hist(n)

 <a id='6'></a>
# Basic Data Analysis
* Gender - Class
* Sym - Class

In [None]:
# Class - Gender: mean of Class for each 1.Gen value, highest mean first
(
    df[["1.Gen", "Class"]]
    .groupby(["1.Gen"], as_index=False)
    .mean()
    .sort_values(by="Class", ascending=False)
)

In [None]:
# Class - Symptoms: mean of Class for each 2.Sym value, highest mean first
(
    df[["2.Sym", "Class"]]
    .groupby(["2.Sym"], as_index=False)
    .mean()
    .sort_values(by="Class", ascending=False)
)

<a id='7'></a>
# Outlier Detection

In [None]:
def detect_outliers(dfrm, features, threshold=10):
    """
        Find the rows that are IQR outliers in many features at once.

        For every column in `features`, a value counts as an outlier when it
        lies more than 1.5 * IQR below the first quartile or above the third
        quartile. Missing values are ignored when the quartiles are computed
        and are never flagged themselves.

        input: dfrm      -- DataFrame whose `features` columns are numeric
               features  -- iterable of column names to inspect
               threshold -- a row is reported when it is an outlier in MORE
                            than this many features (default 10)
        output: list of index labels of the reported rows
    """
    outlier_counts = Counter()

    for c in features:
        column = dfrm[c]
        # nanpercentile skips NaN; np.percentile would return NaN for a
        # column with missing values, making both comparisons below False
        # for every row so that no outlier could ever be found.
        Q1, Q3 = np.nanpercentile(column, [25, 75])
        # Outlier step
        outlier_step = (Q3 - Q1) * 1.5
        is_outlier = (column < Q1 - outlier_step) | (column > Q3 + outlier_step)
        # one hit per flagged row for this feature
        outlier_counts.update(dfrm[is_outlier].index)

    return [i for i, v in outlier_counts.items() if v > threshold]

In [None]:
# Turn every '?' cell into NaN so those entries are treated as nulls.
df[df.columns] = df[df.columns].replace({'?':np.nan})
# df2 holds the result of applying pd.to_numeric to each column of df.
df2 = df.apply(pd.to_numeric) # convert all columns of DataFrame

In [None]:
# Show the column labels of the numeric frame.
df2.columns

In [None]:
# Columns screened for outliers. Names are plain-quoted and spelled exactly
# as in the category/numeric lists earlier in this notebook (no typographic
# quotes, no stray leading spaces).
outlier_features = [
    "1.Gen", "2.Sym", "3.Alc", "4.HepB", "5.HepB", "6.HepB", "7.HepC",
    "8.Cir", "9.End", "10.Smo", "11.Dia", "12.Obe", "13.Hem", "14.Art",
    "15.CRen", "16.HIV", "17.Non", "18.EVar", "19.Spl", "20.PHyp", "21.Thr",
    "22.LMet", "23.Rad", "24.Agedia", "25.Alcpd", "26.cigpy", "27.Sta",
    "28.Encdeg", "29.Ascdeg", "30.IntNorRat", "31.Alp", "32.Hae",
    "33.MCorVol", "34.Leu", "35.Plat", "36.Alb", "37.Bil", "38.Ala",
    "39.Aspa", "40.Gam", "41.Alk", "42.Prot", "43.Crea", "44.NNod",
    "45.dnod", "46.Bil", "47.Iro", "48.Oxy", "49.Fer", "Class",
]
# Display the rows whose index labels detect_outliers returns.
df2.loc[detect_outliers(df2, outlier_features)]

 <a id='8'></a>
# Missing Value
* Find Missing Value
* Fill Missing Value

 <a id='9'></a>
# Find Missing Value

In [None]:
# Preview the first rows before looking for missing values.
df.head()

In [None]:
# Labels of the columns that contain at least one null value.
df.columns[df.isnull().any()]

In [None]:
# Turn every '?' cell into NaN so those entries are counted as nulls.
df[df.columns] = df[df.columns].replace({'?':np.nan})

In [None]:
# Replace df with the result of applying pd.to_numeric to each column.
df=df.apply(pd.to_numeric)

In [None]:
# Preview the first rows again after the numeric conversion.
df.head()

In [None]:
# Labels of the columns that contain at least one null value after the '?' replacement.
df.columns[df.isnull().any()]

In [None]:
# Number of null values in each column
df.isnull().sum()

 <a id='10'></a>
# Fill Missing Value
* 29.Ascdeg has 2 missing values
* 28.Encdeg has 1 missing value

In [None]:
# Box plot of 29.Ascdeg grouped by the values of 28.Encdeg.
df.boxplot(column="29.Ascdeg",by = "28.Encdeg")
plt.show()