In [None]:
# Importing required libraries 
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from collections import defaultdict
import scipy.stats as st

In [None]:
# Loading the CSV file into dataframe df, using the 'id' column as the row index
df = pd.read_csv(filepath_or_buffer='Cancer.csv', index_col='id')

In [None]:
# The dataset's metadata indicates that only the first 10 columns hold values
# and that the remaining columns are statistics on those values, so df is
# narrowed down to its first 10 columns.
# NOTE(review): 'id' is already the index, so these 10 columns are 'diagnosis'
# plus nine feature columns -- confirm that no value column is dropped here.
df = df.iloc[:, :10]

In [None]:
# Making sure that no patient id (the dataframe index) appears more than once.
# Index.is_unique states the intent directly and avoids building a full
# value_counts() table just to inspect its maximum.
assert df.index.is_unique, 'duplicate patient ids found in the index'

In [None]:
# Storing the diagnosis column with the pandas 'category' dtype
df['diagnosis'] = df['diagnosis'].astype('category')

In [None]:
# Reshaping df to long format: 'diagnosis' is kept as the identifier, every
# other column name goes into 'Parameters' and its entries into 'Values'
df_melt = df.melt(id_vars='diagnosis', var_name='Parameters', value_name='Values')

In [None]:
# Rows of the melted frame belonging to radius, texture and perimeter
df_melt_radius_texture_perimeter = df_melt[
    df_melt['Parameters'].isin(['radius_mean', 'texture_mean', 'perimeter_mean'])
]

In [None]:
# Rows of the melted frame belonging to area
is_area = df_melt['Parameters'] == 'area_mean'
df_melt_area = df_melt[is_area]

In [None]:
# Rows of the melted frame for every parameter other than radius, texture,
# perimeter and area
df_melt_rest = df_melt[
    ~df_melt['Parameters'].isin(
        ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean']
    )
]

In [None]:
# Collecting the dataframes to plot, in the order they will be plotted
dfs = [
    df_melt_area,
    df_melt_radius_texture_perimeter,
    df_melt_rest,
]

In [None]:
def box_strip_plot(df_list):
    """Plot each melted dataframe as a box plot overlaid with a strip plot.

    One subplot row is created per dataframe in ``df_list``; the rows are
    stacked vertically in a single figure and titled '(a)', '(b)', ... in
    list order.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Melted dataframes. Each must provide the columns 'Parameters'
        (x axis), 'Values' (y axis) and 'diagnosis' (hue).

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding all subplots.

    Raises
    ------
    IndexError
        If ``df_list`` holds more than 26 dataframes (panel letters run out).

    Notes
    -----
    Requires ``matplotlib.pyplot`` (as ``plt``) and ``seaborn`` (as ``sns``)
    to be imported beforehand. ``sns.set`` is called here, so the axes style
    and font scale also apply to plots made after this function returns.
    Personal preferences (style, jitter, marker size, ...) can be changed in
    the function body.
    """
    # Local import: lowercase letters are used for the panel titles
    import string

    # Setting axes style and font size
    sns.set(style='darkgrid', font_scale=2)

    # One subplot row per dataframe. squeeze=False keeps `axs` a 2-D array even
    # when df_list holds a single dataframe, so .ravel() below always works.
    fig, axs = plt.subplots(len(df_list), 1,
                            figsize=(15, 20 * len(df_list)),
                            squeeze=False)

    # Setting fig title
    fig.suptitle('Box and strip plots of nuclei features', y=1.0)

    # Pairing every dataframe with its own axes
    for panel_no, (frame, ax) in enumerate(zip(df_list, axs.ravel())):

        # Creating boxplot
        sns.boxplot(y='Values',
                    x='Parameters',
                    hue='diagnosis',
                    data=frame,
                    dodge=True,
                    ax=ax)

        # Making the box faces semi-transparent (alpha 0.5) while keeping their
        # red/green/blue components. Older Matplotlib keeps the box patches in
        # ax.artists, newer releases in ax.patches, so both are visited; at
        # this point the boxes are the only patches drawn on the axes.
        for patch in list(ax.artists) + list(ax.patches):
            red, green, blue, _ = patch.get_facecolor()
            patch.set_facecolor((red, green, blue, .5))

        # Creating stripplot on top of the boxes
        sns.stripplot(y='Values',
                      x='Parameters',
                      hue='diagnosis',
                      data=frame,
                      size=3,
                      dodge=True,
                      jitter=True,
                      ax=ax)

        # Removing xlabel, setting ylabel and the alphabetical panel title
        ax.set_xlabel('')
        ax.set_ylabel('Arbitrary Units')
        ax.set_title('(' + string.ascii_lowercase[panel_no] + ')',
                     loc='left')

    # Returning the final figure
    return fig

In [None]:
# Building the box/strip figure for the dataframes collected in dfs
fig1 = box_strip_plot(df_list=dfs)

In [None]:
# Showing fig1
# tight_layout is applied to fig1 itself: plt.tight_layout() acts on pyplot's
# *current* figure, which is no longer fig1 once the notebook's inline backend
# has closed it at the end of the cell that created it.
fig1.tight_layout()
# plt.show() takes no figure argument (its only parameter, `block`, is
# keyword-only), so it is called without one.
plt.show()

In [None]:
# Writing fig1 to disk as a JPEG file
fig1.savefig(fname='BoxStrip.jpg')

In [None]:
# Empty list that will collect the statistic value of each normality test
norm_stats = list()

In [None]:
# Empty list that will collect the p value of each normality test
norm_p = list()

In [None]:
# Running a normality test on every column of df except the first one
# (diagnosis) and recording the statistic and p value of each test.
# The result lists are emptied first so that re-running this cell does not
# append a second copy of the results.
norm_stats.clear()
norm_p.clear()
for k in df.columns[1:]:
    # applying normality test
    stat_val, p_val = st.normaltest(df[k])
    # appending statistic value to norm_stats
    norm_stats.append(stat_val)
    # appending p value to norm_p
    norm_p.append(p_val)

In [None]:
# Asserting that normality is rejected (p <= 0.05) for every tested column.
# The check is phrased as "all p values are at most 0.05" so that a NaN
# p value fails it instead of slipping through, and an empty result list
# (test loop not run yet) is reported rather than passing vacuously.
norm_p_values = np.asarray(norm_p, dtype=float)
assert norm_p_values.size > 0, 'no normality test results to check'
assert (norm_p_values <= 0.05).all(), \
    'normality not rejected (or p value is NaN) for at least one column'

In [None]:
# Mapping that will hold, per column, the U statistic and p value of the
# Mann-Whitney test (a defaultdict created without a default factory)
mw_test = defaultdict(None)

In [None]:
# One-sided Mann-Whitney U test for every column of df except the first one
# (diagnosis): malignant ('M') values are passed as x and benign ('B') values
# as y, with the alternative 'greater'.
malignant_rows = df[df['diagnosis'] == 'M']
benign_rows = df[df['diagnosis'] == 'B']
for k in df.columns[1:]:
    # Applying mann-whitney test to the two diagnosis groups of this column
    statmw_val, pmw_val = st.mannwhitneyu(
        x=malignant_rows[k],
        y=benign_rows[k],
        alternative='greater',
    )
    # Storing [U statistic, p value] under the column name
    mw_test[k] = [statmw_val, pmw_val]

In [None]:
# Turning mw_test into a dataframe: one column per tested parameter
stats_df = pd.DataFrame(mw_test)

In [None]:
# Naming the two rows of stats_df after the values they hold
stats_df.index = pd.Index(['U_statistic', 'P_value'])

In [None]:
# Saving statistics dataframe to excel file.
# The writer is used as a context manager so the workbook is saved and the
# file handle closed when the block exits; without that the file is not
# reliably written to disk. sheet_name is passed by keyword.
with pd.ExcelWriter('Statistics.xlsx') as writer:
    stats_df.to_excel(writer, sheet_name='Sheet1')

In [None]:
# Mapping that will hold the percentage increase of each parameter
# (a defaultdict created without a default factory)
Perc_increase = defaultdict(None)

In [None]:
# For every column of df except the first one (diagnosis): difference between
# the malignant and the benign median, expressed as a percentage of the
# benign median
malignant_group = df[df['diagnosis'] == 'M']
benign_group = df[df['diagnosis'] == 'B']
for k in df.columns[1:]:
    malignant_median = malignant_group[k].median()
    benign_median = benign_group[k].median()
    Perc_increase[k] = ((malignant_median - benign_median) / benign_median) * 100

In [None]:
# Creating the figure object for the percentage-increase plot
fig2_size = (15, 20)
fig2 = plt.figure(figsize=fig2_size)

In [None]:
# Plotting percentage increase as horizontal bar plot.
# The bars are drawn on fig2's own axes instead of through pyplot's "current
# figure": fig2 was created in an earlier cell, and the notebook's inline
# backend closes figures when their cell finishes, so plt.barh would draw on a
# new figure and leave fig2 empty.
fig2.gca().barh(list(range(2, len(Perc_increase)*3 + 1, 3)),
                width=list(Perc_increase.values()),
                align='center');

In [None]:
# Placing y ticks with labels (one tick per bar, labelled with the parameter
# name). Set on fig2's axes explicitly so the ticks land on fig2 even when it
# is no longer pyplot's current figure.
fig2.gca().set_yticks(list(range(2, len(Perc_increase)*3 + 1, 3)))
fig2.gca().set_yticklabels(list(Perc_increase.keys()));

In [None]:
# Placing x ticks every 20 units from 0 to 320, on fig2's axes explicitly so
# they land on fig2 even when it is no longer pyplot's current figure
fig2.gca().set_xticks(list(range(0, 340, 20)));

In [None]:
# Labelling the x axis of fig2's axes (explicit target rather than pyplot's
# current figure, which may no longer be fig2)
fig2.gca().set_xlabel('Increase (%)');

In [None]:
# Setting the title of fig2's axes (explicit target rather than pyplot's
# current figure, which may no longer be fig2)
fig2.gca().set_title('Increase in malignant compared to benign');

In [None]:
# Tightening the layout of fig2 itself; plt.tight_layout() would act on
# pyplot's current figure, which may no longer be fig2
fig2.tight_layout()

In [None]:
# plt.show() takes no figure argument (its only parameter, `block`, is
# keyword-only), so it is called without one; it displays the figures pyplot
# currently has open.
plt.show()

In [None]:
# Writing fig2 to disk as a JPEG file
fig2.savefig(fname='PercentageIncrease.jpg')