In [None]:
# importing required libraries 
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from collections import defaultdict
import scipy.stats as st

In [None]:
# Location of the raw dataset, relative to the notebook's working directory.
DATA_PATH = 'Cancer.csv'
# Load the dataset into df, using the 'id' column as the row index.
df = pd.read_csv(DATA_PATH, index_col='id')

In [None]:
# Preview the first five rows of df.
df.head(n=5)

In [None]:
# Print a summary of df (column dtypes and non-null counts) to spot
# unexpected types or missing values.
df.info()

In [None]:
# The dataset's metadata indicates that only the first 10 columns hold the
# values of interest; the remaining columns are statistics derived from them.
# Keep just those first 10 columns.
# NOTE(review): later cells treat column 0 as 'diagnosis', so this keeps
# 'diagnosis' plus 9 measurement columns -- confirm that matches the metadata.
# .copy() makes df an independent frame, so the later column assignment
# (category conversion) does not act on a slice of the original data.
df = df.iloc[:, 0:10].copy()

In [None]:
# Each patient id must appear exactly once in the index; duplicated ids would
# mean the same patient's data is counted more than once.
assert df.index.is_unique, 'duplicate patient ids found in the index'

In [None]:
# Store the diagnosis column with the 'category' dtype.
df['diagnosis'] = df['diagnosis'].astype('category')

In [None]:
# Reshape df to long format: 'diagnosis' is kept as the identifier, every
# other column name goes into 'Parameters' and its value into 'Values'.
df_melt = df.melt(
    id_vars='diagnosis',
    var_name='Parameters',
    value_name='Values',
)

In [None]:
# Preview the first five rows of the long-format frame.
df_melt.head(n=5)

In [None]:
# Global plot appearance: dark grid background, fonts scaled up by 2x.
sns.set(
    style='darkgrid',
    font_scale=2,
)

In [None]:
# Figure 1: distribution of every parameter, split by diagnosis.
# The whole figure is built in a single cell on an explicit Axes: with the
# inline backend a figure is rendered at the end of the cell that created it,
# so pyplot calls spread over several cells would not reach this figure.
fig1, ax_all = plt.subplots(figsize=(15, 15))

# Box plot of the melted frame, one horizontal box per parameter and diagnosis.
sns.boxplot(x='Values',
            y='Parameters',
            hue='diagnosis',
            data=df_melt,
            orient='h',
            dodge=True,
            ax=ax_all)

# Overlay the individual observations as a jittered strip plot.
sns.stripplot(x='Values',
              y='Parameters',
              hue='diagnosis',
              data=df_melt,
              size=3,
              dodge=True,
              jitter=True,
              ax=ax_all)

# Remove both axis labels.
ax_all.set_xlabel('')
ax_all.set_ylabel('')

fig1.tight_layout()
# plt.show() takes no figure argument; it displays the open figure(s).
plt.show()

In [None]:
# Long-format frame without the 'area_mean' rows.
is_area = df_melt['Parameters'] == 'area_mean'
df_melt_no_area = df_melt.loc[~is_area]

In [None]:
# Figure 2: same plot as before, but without 'area_mean'.
# The whole figure is built in a single cell on an explicit Axes: with the
# inline backend a figure is rendered at the end of the cell that created it,
# so pyplot calls spread over several cells would not reach this figure.
fig2, ax_no_area = plt.subplots(figsize=(15, 15))

# Box plot for df_melt_no_area, one horizontal box per parameter and diagnosis.
sns.boxplot(x='Values',
            y='Parameters',
            hue='diagnosis',
            data=df_melt_no_area,
            orient='h',
            dodge=True,
            ax=ax_no_area)

# Overlay the individual observations as a jittered strip plot.
sns.stripplot(x='Values',
              y='Parameters',
              hue='diagnosis',
              data=df_melt_no_area,
              size=3,
              dodge=True,
              jitter=True,
              ax=ax_no_area)

# Remove both axis labels.
ax_no_area.set_xlabel('')
ax_no_area.set_ylabel('')

fig2.tight_layout()
# plt.show() takes no figure argument; it displays the open figure(s).
plt.show()

In [None]:
# Rows of the long-format frame for radius, texture and perimeter only.
df_melt_radius_texture_perimeter = df_melt[
    df_melt['Parameters'].isin(['radius_mean', 'texture_mean', 'perimeter_mean'])
]

In [None]:
# Rows of the long-format frame for area only.
df_melt_area = df_melt.loc[df_melt['Parameters'] == 'area_mean']

In [None]:
# Rows for every parameter other than radius, texture, perimeter and area.
already_plotted = ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean']
df_melt_rest = df_melt[~df_melt['Parameters'].isin(already_plotted)]

In [None]:
# Figure 3: three stacked panels, each with a box plot and an overlaid strip
# plot split by diagnosis. The figure is built, laid out and shown in a single
# cell so that tight_layout() and show() act on this figure rather than on a
# new empty one created in a later cell.
fig3, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(15, 45))

# One (axes, data) pair per panel: area, then radius/texture/perimeter,
# then the remaining parameters.
panels = [
    (ax1, df_melt_area),
    (ax2, df_melt_radius_texture_perimeter),
    (ax3, df_melt_rest),
]

for ax, panel_data in panels:
    # Box plot of the values per parameter and diagnosis.
    sns.boxplot(y='Values',
                x='Parameters',
                hue='diagnosis',
                data=panel_data,
                dodge=True,
                ax=ax)
    # Individual observations on top of the boxes.
    sns.stripplot(y='Values',
                  x='Parameters',
                  hue='diagnosis',
                  data=panel_data,
                  size=3,
                  dodge=True,
                  jitter=True,
                  ax=ax)
    # Remove both axis labels.
    ax.set_xlabel('')
    ax.set_ylabel('')

fig3.tight_layout()
# plt.show() takes no figure argument; it displays the open figure(s).
plt.show()

In [None]:
# Collects the test statistic of the normality test for each column.
norm_stats = list()

In [None]:
# Collects the p-value of the normality test for each column.
norm_p = list()

In [None]:
# Run the normality test on every column of df except the first one
# (diagnosis) and record the statistic and p-value of each test.
# The result lists are emptied first so that re-running this cell does not
# append a second copy of the results.
norm_stats.clear()
norm_p.clear()
for k in df.columns[1:]:
    # applying the normality test to one column
    stat_val, p_val = st.normaltest(df[k])
    # storing the statistic and the p-value, in column order
    norm_stats.append(stat_val)
    norm_p.append(p_val)

In [None]:
# Fail if any column has a normality-test p-value above 0.05, i.e. require
# that normality is rejected at the 5% level for every tested column.
assert not (np.asarray(norm_p) > 0.05).any()

In [None]:
# Maps each column name to [U statistic, p-value] of its Mann-Whitney test.
# A plain dict is used: a defaultdict without a default factory behaves
# exactly like a dict and only obscures that fact.
mw_test = {}

In [None]:
# For every column of df except the first one (diagnosis), run a one-sided
# Mann-Whitney U test of malignant ('M') against benign ('B') values with
# alternative='greater'.
# The two diagnosis groups are selected once, outside the loop, instead of
# re-filtering df for every column.
malignant = df[df['diagnosis'] == 'M']
benign = df[df['diagnosis'] == 'B']
for k in df.columns[1:]:
    # applying the Mann-Whitney test to one column
    statmw_val, pmw_val = st.mannwhitneyu(x=malignant[k],
                                          y=benign[k],
                                          alternative='greater')
    # storing statistic and p-value under the column name
    mw_test[k] = [statmw_val, pmw_val]

In [None]:
# Build a data frame from mw_test: one column per tested parameter, with the
# two rows labelled at construction time so the frame never has to be
# modified after it has been displayed.
stats_df = pd.DataFrame(data=mw_test, index=['U_statistic', 'P_value'])

In [None]:
# Display the Mann-Whitney results table.
stats_df

In [None]:
# Label the two rows of stats_df: first the U statistic, then the p-value.
row_labels = ['U_statistic', 'P_value']
stats_df.index = row_labels

In [None]:
# Display the results table with its labelled rows.
stats_df

In [None]:
# Maps each column name to the percentage increase of its median in malignant
# relative to benign samples.
# A plain dict is used: a defaultdict without a default factory behaves
# exactly like a dict and only obscures that fact.
Perc_increase = {}

In [None]:
# For every column of df except the first one (diagnosis), compute how much
# larger the malignant ('M') median is than the benign ('B') median, as a
# percentage of the benign median.
# The two diagnosis groups are selected once, and the benign median is
# computed once per column instead of twice.
malignant = df[df['diagnosis'] == 'M']
benign = df[df['diagnosis'] == 'B']
for k in df.columns[1:]:
    benign_median = benign[k].median()
    Perc_increase[k] = ((malignant[k].median() - benign_median)
                        / benign_median) * 100

In [None]:
# Figure 4: percentage increase per parameter as a horizontal bar plot.
# The whole figure is built in a single cell on an explicit Axes: with the
# inline backend a figure is rendered at the end of the cell that created it,
# so pyplot calls spread over several cells would not reach this figure.
fig4, ax_increase = plt.subplots(figsize=(15, 15))

# One bar per parameter, centred on y = 2, 5, 8, ...; the same positions are
# reused for the tick labels.
bar_positions = list(range(2, len(Perc_increase) * 3 + 1, 3))

# plotting percentage increase as horizontal bars
ax_increase.barh(bar_positions,
                 width=list(Perc_increase.values()),
                 align='center')

# placing y ticks with the parameter names as labels
ax_increase.set_yticks(bar_positions)
ax_increase.set_yticklabels(list(Perc_increase.keys()))

# placing x ticks every 20 units from 0 to 320
ax_increase.set_xticks(list(range(0, 340, 20)))

# labelling the x axis
ax_increase.set_xlabel('Increase (%)')

fig4.tight_layout()
# plt.show() takes no figure argument; it displays the open figure(s).
plt.show()