### Breast Cancer Diagnosis Classification Model
#### By : Shivam Singh

In [2]:
# Import Data Manipulation Libraries
import pandas as pd
import numpy as np

# Import data visualization libaries
import matplotlib.pyplot as plt
import seaborn as sns

# Import Neccessory Libraries
# 1 . Importing filter warning libraries
import warnings
warnings.filterwarnings(action= 'ignore')

# 2. Importing Data logging Libraries
import logging
logging.basicConfig(level=logging.INFO,
                    filename= 'model.log',
                    filemode='w',
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    force = True)
#3. Import OrderDict()
from collections import OrderedDict

In [3]:
# Data ingestion 
df = pd.read_csv(r'C:\15Days15Project\Breast-Cancer-Diagnosis-Classification-Model\data\raw\Breast-cancer-data.csv')

df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [4]:
# Descriptive stats

def descriptive_stats():
    numerical_col = df.select_dtypes(exclude = 'object').columns
    categorica_col = df.select_dtypes(include = 'object').columns
    num_stats = []
    cat_stats = []
    data_info = []

    for i in numerical_col:
        Q1 = df[i].quantile(0.25)
        Q3 = df[i].quantile(0.75)
        IQR = Q3 - Q1 
        LF = Q1 - 1.5*IQR
        UF = Q3 + 1.5*IQR

        outlier_count = len(df[(df[i] < LF) | (df[i] > UF)])
        outlier_percentage = outlier_count / len(df[i]) * 100

        numerical_stats = OrderedDict({
            "Feature " : i ,
            "Q1" : Q1,
            "Q3" : Q3,
            "IQR" : IQR,
            "LF" : LF,
            "UF" : UF,
            "Mean" : df[i].mean(),
            "Median" : df[i].median(),
            "Min" : df[i].min(),
            "Max" : df[i].max(),
            "Outlier count" : outlier_count,
            "outlier percentage" : outlier_percentage,
            "standard derivation": df[i].std(),
            "variance" : df[i].var(),
            "skewness" : df[i].skew(),
            "kurtosis" : df[i].kurtosis()
        })
        num_stats.append(numerical_stats)
    numerical_stats_report = pd.DataFrame(num_stats)

    for i in categorica_col:
        categorical_stats = OrderedDict({
            "Feature" : i , 
            "Unquie count" : df[i].nunique(),
            "Value count" : df[i].value_counts(),
            "mode" : df[i].mode()
        })
        cat_stats.append(categorical_stats)
    categorical_stats_report = pd.DataFrame(cat_stats)


    for i in df.columns : 
        data1 = OrderedDict({
            "Feature" : i ,
            "Missing value" : df[i].isnull().sum(),
            "Unqiue value" : df[i].nunique(),
            "value count " : df[i].value_counts().to_dict()
        })
        data_info.append(data1)
    data_info_report = pd.DataFrame(data_info)

    return categorical_stats_report,numerical_stats_report,data_info_report

categorical_stats_report,numerical_stats_report,data_info_report = descriptive_stats()

In [5]:
#Numerical Stats
numerical_stats_report

Unnamed: 0,Feature,Q1,Q3,IQR,LF,UF,Mean,Median,Min,Max,Outlier count,outlier percentage,standard derivation,variance,skewness,kurtosis
0,id,869218.0,8813129.0,7943911.0,-11046650.0,20729000.0,30371830.0,906024.0,8670.0,911320500.0,81,14.235501,125020600.0,1.563015e+16,6.473752,42.193194
1,radius_mean,11.7,15.78,4.08,5.58,21.9,14.12729,13.37,6.981,28.11,14,2.460457,3.524049,12.41892,0.94238,0.845522
2,texture_mean,16.17,21.8,5.63,7.725,30.245,19.28965,18.84,9.71,39.28,7,1.230228,4.301036,18.49891,0.65045,0.758319
3,perimeter_mean,75.17,104.1,28.93,31.775,147.495,91.96903,86.24,43.79,188.5,13,2.28471,24.29898,590.4405,0.99065,0.972214
4,area_mean,420.3,782.7,362.4,-123.3,1326.3,654.8891,551.1,143.5,2501.0,25,4.393673,351.9141,123843.6,1.645732,3.652303
5,smoothness_mean,0.08637,0.1053,0.01893,0.057975,0.133695,0.09636028,0.09587,0.05263,0.1634,6,1.054482,0.01406413,0.0001977997,0.456324,0.855975
6,compactness_mean,0.06492,0.1304,0.06548,-0.0333,0.22862,0.104341,0.09263,0.01938,0.3454,16,2.811951,0.05281276,0.002789187,1.190123,1.65013
7,concavity_mean,0.02956,0.1307,0.10114,-0.12215,0.28241,0.08879932,0.06154,0.0,0.4268,18,3.163445,0.07971981,0.006355248,1.40118,1.998638
8,concave points_mean,0.02031,0.074,0.05369,-0.060225,0.154535,0.04891915,0.0335,0.0,0.2012,10,1.757469,0.03880284,0.001505661,1.17118,1.066556
9,symmetry_mean,0.1619,0.1957,0.0338,0.1112,0.2464,0.1811619,0.1792,0.106,0.304,15,2.636204,0.02741428,0.0007515428,0.725609,1.287933


In [6]:
# Categorical stats
categorical_stats_report

Unnamed: 0,Feature,Unquie count,Value count,mode
0,diagnosis,2,"diagnosis B 357 M 212 Name: count, dtype...","0 B Name: diagnosis, dtype: object"


In [7]:
# Data Info 
data_info_report

Unnamed: 0,Feature,Missing value,Unqiue value,value count
0,id,0,569,"{842302: 1, 842517: 1, 84300903: 1, 84348301: ..."
1,diagnosis,0,2,"{'B': 357, 'M': 212}"
2,radius_mean,0,456,"{12.34: 4, 11.06: 3, 10.26: 3, 12.77: 3, 13.05..."
3,texture_mean,0,479,"{16.84: 3, 19.83: 3, 15.7: 3, 20.52: 3, 18.22:..."
4,perimeter_mean,0,522,"{82.61: 3, 134.7: 3, 87.76: 3, 129.1: 2, 82.69..."
5,area_mean,0,539,"{512.2: 3, 394.1: 2, 399.8: 2, 1076.0: 2, 582...."
6,smoothness_mean,0,474,"{0.1007: 5, 0.1054: 4, 0.115: 4, 0.1075: 4, 0...."
7,compactness_mean,0,537,"{0.1147: 3, 0.1206: 3, 0.1047: 2, 0.1141: 2, 0..."
8,concavity_mean,0,537,"{0.0: 13, 0.1204: 3, 0.2448: 2, 0.08422: 2, 0...."
9,concave points_mean,0,542,"{0.0: 13, 0.02864: 3, 0.02272: 2, 0.1471: 2, 0..."
