# Introduction
This is a brain tumor feature dataset that includes five first-order features and eight texture features, with the target label in the column Class.

# Content:
[1. Load and Check Data](#1)
[1. Variable Description](#2)
    [1. Univariate Variable Analysis](#3)
        [1. Categorical Variable](#4)
        [1. Numerical Variable](#5)
[1. Basic Data Analysis](#6)
[1. Outlier Detection](#7)
[1. Missing Value](#8)
    [1. Find Missing Value](#9)
    [1. Fill Missing Value](#10)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# The bundled seaborn styles were renamed with a "seaborn-v0_8-" prefix in
# matplotlib 3.6 and the old names were removed in 3.8. Prefer the new name and
# fall back to the legacy one on older installations; if neither is available
# the default style is kept instead of raising.
for _style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid"):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
#plt.style.available

import seaborn as sns
from collections import Counter

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

 <a id='1'></a>
# Load and Check Data

In [None]:
# Read the brain tumor feature table into the working DataFrame.
data_path = '/kaggle/input/brain-tumor/Brain Tumor.csv'
df = pd.read_csv(data_path)

In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Summary statistics of the numerical columns. The call parentheses are
# required: a bare `df.describe` only displays the bound method object.
df.describe()

 <a id='2'></a>
# Variable Description

* First Order Features
    * Mean
    * Variance
    * Standard Deviation
    * Skewness
    * Kurtosis

* Second Order Features
    * Contrast
    * Energy
    * ASM (Angular second moment)
    * Entropy
    * Homogeneity
    * Dissimilarity
    * Correlation
    * Coarseness

The Image column holds the image name, and the Class column indicates whether the image has a tumor or not (1 = Tumor, 0 = Non-Tumor).

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
df.info()

     <a id='3'></a>
    # Univariate Variable Analysis
    * Categorical Variable: Class
    * Numerical Variable: Mean, Variance, Standard Deviation, Entropy, Skewness, Kurtosis, Contrast, Energy, ASM, Homogeneity, Dissimilarity, Correlation, Coarseness

 <a id='4'></a>
# Categorical Variable:

In [None]:
def bar_plot(variable):
    """
    Draw a bar chart of how often each value of a column occurs.

    input: variable -- name of a column of the global ``df``, ex: "Class"
    output: shows the bar plot and prints the value counts
    """
    # Frequency of every distinct value in the chosen column.
    counts = df[variable].value_counts()
    labels = counts.index

    # One bar per distinct value, with the values themselves as tick labels.
    plt.figure(figsize=(9, 3))
    plt.bar(labels, counts)
    plt.xticks(labels, labels.values)
    plt.ylabel("Frequency")
    plt.title(variable)
    plt.show()

    print(f"{variable}: \n {counts}")

In [None]:
# Categorical columns to summarise, one bar plot each.
category1 = ["Class"]
for column in category1:
    bar_plot(column)

 <a id='5'></a>
# Numerical Variable:

In [None]:
def plot_hist(variable, bins=10):
    """
    Show a histogram of one numerical column of the global ``df``.

    input: variable -- name of the column to plot, ex: "Mean"
           bins -- passed through to ``plt.hist``; defaults to 10. Use a
                   larger value (for example 890) to examine the
                   distribution in a little more detail.
    output: shows the histogram
    """
    plt.figure(figsize=(9, 3))
    plt.hist(df[variable], bins=bins)
    plt.xlabel(variable)
    plt.ylabel("Frequency")
    plt.title("{} distribution with hist".format(variable))
    plt.show()

In [None]:
# Numerical feature columns to plot, one histogram each.
numericVar1 = [
    "Mean",
    "Variance",
    "Standard Deviation",
    "Entropy",
    "Skewness",
    "Kurtosis",
    "Contrast",
    "Energy",
    "ASM",
    "Homogeneity",
    "Dissimilarity",
    "Correlation",
    "Coarseness",
]
for feature in numericVar1:
    plot_hist(feature)

 <a id='6'></a>
# Basic Data Analysis
The relationship between individual features and the diagnosis (the Class column).

In [None]:
# Mean vs diagnosis: average Class for each distinct Mean value, highest first.
# NOTE(review): Mean looks like a continuous feature, so this may produce
# roughly one group per row -- confirm this is the intended view.
(
    df[["Mean", "Class"]]
    .groupby(["Mean"], as_index=False)
    .mean()
    .sort_values(by="Class", ascending=False)
)

In [None]:
# Variance vs diagnosis: average Class for each distinct Variance value,
# highest first.
# NOTE(review): Variance looks like a continuous feature, so this may produce
# roughly one group per row -- confirm this is the intended view.
(
    df[["Variance", "Class"]]
    .groupby(["Variance"], as_index=False)
    .mean()
    .sort_values(by="Class", ascending=False)
)

In [None]:
# Standard Deviation vs diagnosis: average Class for each distinct
# Standard Deviation value, highest first.
# NOTE(review): Standard Deviation looks like a continuous feature, so this
# may produce roughly one group per row -- confirm this is the intended view.
(
    df[["Standard Deviation", "Class"]]
    .groupby(["Standard Deviation"], as_index=False)
    .mean()
    .sort_values(by="Class", ascending=False)
)

<a id='7'></a>
# Outlier Detection

In [None]:
def detect_outliers(dfl, features, n_tolerated=2, iqr_factor=1.5):
    """
    Return the index labels of rows that are outliers in several features.

    A value is an outlier for a feature when it lies more than
    ``iqr_factor`` times the interquartile range below the 1st quartile or
    above the 3rd quartile of that feature.

    input: dfl -- DataFrame to inspect
           features -- iterable of column names of ``dfl`` to check
           n_tolerated -- a row is returned only when it is an outlier in
                          MORE than this many features (default 2)
           iqr_factor -- multiplier applied to the IQR to get the outlier
                         step (default 1.5)
    output: list of index labels, in order of first detection
    """
    flag_counts = Counter()

    for c in features:
        column = dfl[c]
        # nanpercentile ignores missing values; plain np.percentile would
        # return NaN quartiles and the column would silently flag nothing.
        q1, q3 = np.nanpercentile(column, [25, 75])
        outlier_step = (q3 - q1) * iqr_factor
        # Comparisons against NaN are False, so rows with a missing value
        # in this column are never flagged for it.
        is_outlier = (column < q1 - outlier_step) | (column > q3 + outlier_step)
        # Count one hit per row for each feature it is an outlier in.
        flag_counts.update(dfl[is_outlier].index)

    return [idx for idx, hits in flag_counts.items() if hits > n_tolerated]

In [None]:
# Show the rows that detect_outliers flags across the numerical features.
outlier_rows = detect_outliers(
    df,
    [
        "Mean",
        "Variance",
        "Standard Deviation",
        "Entropy",
        "Skewness",
        "Kurtosis",
        "Contrast",
        "Energy",
        "ASM",
        "Homogeneity",
        "Dissimilarity",
        "Correlation",
        "Coarseness",
    ],
)
df.loc[outlier_rows]

In [None]:
# Drop the rows flagged by detect_outliers and renumber the index from 0.
rows_to_drop = detect_outliers(
    df,
    [
        "Mean",
        "Variance",
        "Standard Deviation",
        "Entropy",
        "Skewness",
        "Kurtosis",
        "Contrast",
        "Energy",
        "ASM",
        "Homogeneity",
        "Dissimilarity",
        "Correlation",
        "Coarseness",
    ],
)
df = df.drop(rows_to_drop, axis=0).reset_index(drop=True)

 <a id='8'></a>
# Missing Value
* Find Missing Value
* Fill Missing Value

In [None]:
# Preview the first rows of the frame again before checking for missing values.
df.head()

 <a id='9'></a>
# Find Missing Value

In [None]:
# Labels of the columns that contain at least one null value.
df.columns[df.isnull().any()]

In [None]:
# Number of null values in each column.
df.isnull().sum()

There are no missing values.