In [1]:
from numpy.random import seed
from numpy.random import randn
from scipy.stats import mannwhitneyu
import numpy as np
from scipy.stats import norm

import pandas as pd

In [2]:
classesWithDesignPatterns = pd.read_csv('../data/classes_design-patterns.csv', sep=';')
classesWithDesignPatterns.head()

Unnamed: 0,sistema,classe,Bridge,Composite,ChainOfResponsibility,TemplateMethod,FactoryMethod,Prototype,cbo,wmc,dit,rfc,nom,loc,GC,RPB,FE,LM
0,jasperreports,net.sf.jasperreports.charts.JRCategorySeries,0,0,1,0,1,0,3,5,1,0,5,12,0,0,0,0
1,jasperreports,net.sf.jasperreports.charts.JRChartAxis,0,0,1,0,1,0,3,4,1,0,4,13,0,0,0,0
2,jasperreports,net.sf.jasperreports.charts.JRDataRange,0,0,1,0,0,0,2,2,1,0,2,8,0,0,0,0
3,jasperreports,net.sf.jasperreports.charts.JRGanttSeries,0,0,1,0,1,0,2,8,1,0,8,13,0,0,0,0
4,jasperreports,net.sf.jasperreports.charts.JRItemLabel,0,0,1,0,0,0,3,4,1,0,4,12,0,0,0,0


In [3]:
classesNoDesignPatterns = pd.read_csv('../data/classes_no-design-patterns.csv', sep=';')
classesNoDesignPatterns.head()

Unnamed: 0,file,class,type,cbo,wmc,dit,rfc,nom,loc
0,C:\Users\Win-7\Documents\Mestrado\Sistemas\jas...,AlterDesignApp,class,15,5,2,21,5,61
1,C:\Users\Win-7\Documents\Mestrado\Sistemas\jas...,com.bar.SomeBarClass,class,0,1,1,0,1,8
2,C:\Users\Win-7\Documents\Mestrado\Sistemas\jas...,com.foo.SomeFooClass,class,0,1,1,0,1,8
3,C:\Users\Win-7\Documents\Mestrado\Sistemas\jas...,BarbecueApp,class,21,18,2,35,18,206
4,C:\Users\Win-7\Documents\Mestrado\Sistemas\jas...,Barcode4JApp,class,21,18,2,35,18,206


In [4]:
metrics = ['cbo', 'wmc', 'dit', 'rfc', 'loc', 'nom']

for metric in metrics:
    U1, pnorm = mannwhitneyu(pd.to_numeric(classesWithDesignPatterns[metric]), pd.to_numeric(classesNoDesignPatterns[metric]))
    nx, ny = len(classesWithDesignPatterns), len(classesNoDesignPatterns)
    U2 = nx*ny - U1
    
    print('Metric: ' + metric)
    print('U1=%.3f, U2=%.3f, p=%.10f' % (U1, U2, pnorm))
    
    alpha = 0.05
    
    if pnorm > alpha:
        print('Same distribution (fail to reject H0)')
    else:
        print('Different distribution (reject H0)')
        
    print('---------------------------------')

Metric: cbo
U1=6203916.000, U2=5331757.000, p=0.0000000829
Different distribution (reject H0)
---------------------------------
Metric: wmc
U1=5681821.000, U2=5853852.000, p=0.2907889992
Same distribution (fail to reject H0)
---------------------------------
Metric: dit
U1=4963052.000, U2=6572621.000, p=0.0000000000
Different distribution (reject H0)
---------------------------------
Metric: rfc
U1=5653386.500, U2=5882286.500, p=0.1585849160
Same distribution (fail to reject H0)
---------------------------------
Metric: loc
U1=5611804.500, U2=5923868.500, p=0.0554979548
Same distribution (fail to reject H0)
---------------------------------
Metric: nom
U1=5610148.500, U2=5925524.500, p=0.0525571254
Same distribution (fail to reject H0)
---------------------------------


In [5]:
for metric in metrics:
    U1, pnorm = mannwhitneyu(pd.to_numeric(classesWithDesignPatterns[metric]), pd.to_numeric(classesNoDesignPatterns[metric]), method="asymptotic")
    nx, ny = len(classesWithDesignPatterns), len(classesNoDesignPatterns)
    U2 = nx*ny - U1
    U = min(U1, U2)
    N = nx + ny
    z = (U - nx*ny/2 + 0.5) / np.sqrt(nx*ny * (N + 1)/ 12)
    pCalculated = 2 * norm.cdf(z)  # use CDF to get p-value from smaller statistic
    print('Metric: ' + metric)
    print('pnorm calculated: %.10f' % pCalculated)
    print('U1=%.3f, U2=%.3f, p=%.10f' % (U1, U2, pnorm))
    
    alpha = pCalculated
    
    if pnorm > alpha:
        print('Same distribution (fail to reject H0)')
    else:
        print('Different distribution (reject H0)')
        
    print('---------------------------------')

Metric: cbo
pnorm calculated: 0.0000000925
U1=6203916.000, U2=5331757.000, p=0.0000000829
Different distribution (reject H0)
---------------------------------
Metric: wmc
pnorm calculated: 0.2921279224
U1=5681821.000, U2=5853852.000, p=0.2907889992
Different distribution (reject H0)
---------------------------------
Metric: dit
pnorm calculated: 0.0000000000
U1=4963052.000, U2=6572621.000, p=0.0000000000
Different distribution (reject H0)
---------------------------------
Metric: rfc
pnorm calculated: 0.1610002723
U1=5653386.500, U2=5882286.500, p=0.1585849160
Different distribution (reject H0)
---------------------------------
Metric: loc
pnorm calculated: 0.0560059841
U1=5611804.500, U2=5923868.500, p=0.0554979548
Different distribution (reject H0)
---------------------------------
Metric: nom
pnorm calculated: 0.0534495638
U1=5610148.500, U2=5925524.500, p=0.0525571254
Different distribution (reject H0)
---------------------------------
