# Calling various functions in the pre_training_analysis_tools file to display the results here

In [17]:
# import external modules
import pandas as pd
import numpy as np

from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

# import local modules
from utils import data_loader, pre_training_analysis_tools


In [18]:
# Load the dataset object that holds both the x and the y parts.
data = data_loader.load_real_dataset()

# Pull the x and y components out under the names the following cells use.
# NOTE(review): the keys read here are the 'test_*' entries — confirm this is the intended split.
data_x, data_y = data['test_x'], data['test_y']

In [19]:
# Drop low-variance features with scikit-learn's VarianceThreshold and print the
# names of the columns that survive. VarianceThreshold's own default threshold is 0;
# this notebook uses the stricter cut-off defined in VARIANCE_THRESHOLD below.

from sklearn.feature_selection import VarianceThreshold

VARIANCE_THRESHOLD = 1.0  # change here to keep more (lower) or fewer (higher) features

# Always start from the unfiltered features stored in `data`, so re-running this cell
# reports the true original shape instead of the already-filtered `data_x`.
features_unfiltered = data['test_x']

selector = VarianceThreshold(threshold=VARIANCE_THRESHOLD)
print("Original feature shape:", features_unfiltered.shape)
new_X = selector.fit_transform(features_unfiltered)
print("Transformed feature shape:", new_X.shape)

# Select with the fitted boolean mask so the result keeps its column labels.
data_x = features_unfiltered.loc[:, selector.get_support()]

print(data_x.columns)



Original feature shape: (52, 260)
Transformed feature shape: (52, 148)
Index(['publicFieldsQty_sum', 'publicFieldsQty_stdev', 'publicFieldsQty_max',
       'totalFieldsQty_sum', 'totalFieldsQty_max', 'finalFieldsQty_sum',
       'finalFieldsQty_stdev', 'finalFieldsQty_max', 'lambdasQty_sum',
       'lambdasQty_max',
       ...
       'innerClassesQty_max', 'comparisonsQty_sum', 'comparisonsQty_average',
       'comparisonsQty_stdev', 'comparisonsQty_max', 'totalMethodsQty_sum',
       'totalMethodsQty_average', 'totalMethodsQty_stdev',
       'totalMethodsQty_median', 'totalMethodsQty_max'],
      dtype='object', length=148)


In [20]:
# remove_collinear_features is given one DataFrame containing both the features and
# the target, so the two parts are joined column-wise before the call.
x, y = data_x, data_y

# Lift the row limit on displayed DataFrames (a session-wide pandas option).
pd.set_option('display.max_rows', None)
matrix = pd.concat([x, y], axis='columns')

# Remove highly correlated features (threshold 0.90), choosing which member of each
# pair to drop by its correlation with the target column 'maven_reuse'.
# NOTE(review): the last argument is the string 'True', not the boolean True —
# confirm against the helper's signature.
# NOTE(review): the printed pairs suggest the helper compares signed rather than
# absolute correlations when deciding which feature to drop — verify in the helper.
pre_training_analysis_tools.remove_collinear_features(matrix, 'maven_reuse', 0.90, 'True')

publicFieldsQty_max | publicFieldsQty_stdev | 0.97
publicFieldsQty_max: -0.064
publicFieldsQty_stdev: -0.055
dropped: publicFieldsQty_max
-----------------------------------------------------------------------------
totalFieldsQty_max | totalFieldsQty_sum | 0.92
totalFieldsQty_max: -0.039
totalFieldsQty_sum: -0.086
dropped: totalFieldsQty_sum
-----------------------------------------------------------------------------
finalFieldsQty_sum | publicFieldsQty_sum | 0.97
finalFieldsQty_sum: -0.124
publicFieldsQty_sum: -0.111
dropped: finalFieldsQty_sum
-----------------------------------------------------------------------------
finalFieldsQty_stdev | publicFieldsQty_stdev | 1.0
finalFieldsQty_stdev: -0.059
publicFieldsQty_stdev: -0.055
dropped: finalFieldsQty_stdev
-----------------------------------------------------------------------------
finalFieldsQty_stdev | publicFieldsQty_max | 0.97
finalFieldsQty_stdev: -0.059
publicFieldsQty_max: -0.064
dropped: publicFieldsQty_max
--------------

modifiers_sum | protectedMethodsQty_sum | 0.98
modifiers_sum: -0.092
protectedMethodsQty_sum: -0.104
dropped: protectedMethodsQty_sum
-----------------------------------------------------------------------------
modifiers_sum | defaultMethodsQty_sum | 0.9
modifiers_sum: -0.092
defaultMethodsQty_sum: -0.09
dropped: modifiers_sum
-----------------------------------------------------------------------------
modifiers_sum | loopQty_sum | 0.93
modifiers_sum: -0.092
loopQty_sum: -0.114
dropped: loopQty_sum
-----------------------------------------------------------------------------
modifiers_sum | staticFieldsQty_sum | 0.92
modifiers_sum: -0.092
staticFieldsQty_sum: -0.135
dropped: staticFieldsQty_sum
-----------------------------------------------------------------------------
modifiers_sum | uniqueWordsQty_sum | 0.99
modifiers_sum: -0.092
uniqueWordsQty_sum: -0.114
dropped: uniqueWordsQty_sum
-----------------------------------------------------------------------------
modifiers_sum | mat

publicMethodsQty_sum | loopQty_sum | 0.96
publicMethodsQty_sum: -0.118
loopQty_sum: -0.114
dropped: publicMethodsQty_sum
-----------------------------------------------------------------------------
publicMethodsQty_sum | tcc_sum | 0.91
publicMethodsQty_sum: -0.118
tcc_sum: 0.097
dropped: publicMethodsQty_sum
-----------------------------------------------------------------------------
publicMethodsQty_sum | staticFieldsQty_sum | 0.97
publicMethodsQty_sum: -0.118
staticFieldsQty_sum: -0.135
dropped: staticFieldsQty_sum
-----------------------------------------------------------------------------
publicMethodsQty_sum | uniqueWordsQty_sum | 0.98
publicMethodsQty_sum: -0.118
uniqueWordsQty_sum: -0.114
dropped: publicMethodsQty_sum
-----------------------------------------------------------------------------
publicMethodsQty_sum | privateMethodsQty_sum | 0.91
publicMethodsQty_sum: -0.118
privateMethodsQty_sum: -0.131
dropped: privateMethodsQty_sum
------------------------------------------

logStatementsQty_sum: -0.113
uniqueWordsQty_sum: -0.114
dropped: uniqueWordsQty_sum
-----------------------------------------------------------------------------
logStatementsQty_sum | mathOperationsQty_sum | 0.93
logStatementsQty_sum: -0.113
mathOperationsQty_sum: -0.137
dropped: mathOperationsQty_sum
-----------------------------------------------------------------------------
logStatementsQty_sum | stringLiteralsQty_sum | 0.96
logStatementsQty_sum: -0.113
stringLiteralsQty_sum: -0.13
dropped: stringLiteralsQty_sum
-----------------------------------------------------------------------------
logStatementsQty_sum | modifiers_sum | 0.96
logStatementsQty_sum: -0.113
modifiers_sum: -0.092
dropped: logStatementsQty_sum
-----------------------------------------------------------------------------
logStatementsQty_sum | assignmentsQty_sum | 0.97
logStatementsQty_sum: -0.113
assignmentsQty_sum: -0.129
dropped: assignmentsQty_sum
---------------------------------------------------------------

tryCatchQty_sum: -0.113
finalMethodsQty_sum: -0.078
dropped: tryCatchQty_sum
-----------------------------------------------------------------------------
tryCatchQty_stdev | lcom_average | 0.92
tryCatchQty_stdev: -0.048
lcom_average: -0.066
dropped: lcom_average
-----------------------------------------------------------------------------
loc_sum | publicFieldsQty_sum | 0.98
loc_sum: -0.124
publicFieldsQty_sum: -0.111
dropped: loc_sum
-----------------------------------------------------------------------------
loc_sum | finalFieldsQty_sum | 0.98
loc_sum: -0.124
finalFieldsQty_sum: -0.124
dropped: finalFieldsQty_sum
-----------------------------------------------------------------------------
loc_sum | wmc_sum | 0.99
loc_sum: -0.124
wmc_sum: -0.123
dropped: loc_sum
-----------------------------------------------------------------------------
loc_sum | protectedMethodsQty_sum | 0.96
loc_sum: -0.124
protectedMethodsQty_sum: -0.104
dropped: loc_sum
---------------------------------------

abstractMethodsQty_average: -0.14
publicMethodsQty_average: -0.136
dropped: abstractMethodsQty_average
-----------------------------------------------------------------------------
abstractMethodsQty_average | loc_average | 0.93
abstractMethodsQty_average: -0.14
loc_average: -0.129
dropped: abstractMethodsQty_average
-----------------------------------------------------------------------------
abstractMethodsQty_stdev | publicMethodsQty_stdev | 1.0
abstractMethodsQty_stdev: -0.101
publicMethodsQty_stdev: -0.099
dropped: abstractMethodsQty_stdev
-----------------------------------------------------------------------------
abstractMethodsQty_stdev | lcom_average | 0.93
abstractMethodsQty_stdev: -0.101
lcom_average: -0.066
dropped: abstractMethodsQty_stdev
-----------------------------------------------------------------------------
abstractMethodsQty_max | publicMethodsQty_max | 0.99
abstractMethodsQty_max: -0.009
publicMethodsQty_max: -0.002
dropped: abstractMethodsQty_max
-------------

returnQty_sum | privateMethodsQty_sum | 0.91
returnQty_sum: -0.13
privateMethodsQty_sum: -0.131
dropped: privateMethodsQty_sum
-----------------------------------------------------------------------------
returnQty_sum | mathOperationsQty_sum | 0.94
returnQty_sum: -0.13
mathOperationsQty_sum: -0.137
dropped: mathOperationsQty_sum
-----------------------------------------------------------------------------
returnQty_sum | stringLiteralsQty_sum | 0.95
returnQty_sum: -0.13
stringLiteralsQty_sum: -0.13
dropped: stringLiteralsQty_sum
-----------------------------------------------------------------------------
returnQty_sum | modifiers_sum | 0.95
returnQty_sum: -0.13
modifiers_sum: -0.092
dropped: returnQty_sum
-----------------------------------------------------------------------------
returnQty_sum | synchronizedMethodsQty_sum | 0.91
returnQty_sum: -0.13
synchronizedMethodsQty_sum: -0.113
dropped: returnQty_sum
----------------------------------------------------------------------------

rfc_average: -0.182
variablesQty_average: -0.148
dropped: rfc_average
-----------------------------------------------------------------------------
rfc_median | wmc_median | 0.91
rfc_median: -0.114
wmc_median: -0.1
dropped: rfc_median
-----------------------------------------------------------------------------
rfc_median | assignmentsQty_median | 0.92
rfc_median: -0.114
assignmentsQty_median: -0.134
dropped: assignmentsQty_median
-----------------------------------------------------------------------------
rfc_median | loc_median | 0.94
rfc_median: -0.114
loc_median: -0.108
dropped: rfc_median
-----------------------------------------------------------------------------
protectedFieldsQty_sum | publicFieldsQty_sum | 0.94
protectedFieldsQty_sum: -0.137
publicFieldsQty_sum: -0.111
dropped: protectedFieldsQty_sum
-----------------------------------------------------------------------------
protectedFieldsQty_sum | finalFieldsQty_sum | 0.98
protectedFieldsQty_sum: -0.137
finalFieldsQty_su

innerClassesQty_max | innerClassesQty_stdev | 0.93
innerClassesQty_max: 0.043
innerClassesQty_stdev: 0.133
dropped: innerClassesQty_max
-----------------------------------------------------------------------------
comparisonsQty_sum | publicFieldsQty_sum | 0.97
comparisonsQty_sum: -0.134
publicFieldsQty_sum: -0.111
dropped: comparisonsQty_sum
-----------------------------------------------------------------------------
comparisonsQty_sum | finalFieldsQty_sum | 0.94
comparisonsQty_sum: -0.134
finalFieldsQty_sum: -0.124
dropped: comparisonsQty_sum
-----------------------------------------------------------------------------
comparisonsQty_sum | wmc_sum | 0.98
comparisonsQty_sum: -0.134
wmc_sum: -0.123
dropped: comparisonsQty_sum
-----------------------------------------------------------------------------
comparisonsQty_sum | protectedMethodsQty_sum | 0.95
comparisonsQty_sum: -0.134
protectedMethodsQty_sum: -0.104
dropped: comparisonsQty_sum
----------------------------------------------

totalMethodsQty_sum | modifiers_sum | 0.97
totalMethodsQty_sum: -0.118
modifiers_sum: -0.092
dropped: totalMethodsQty_sum
-----------------------------------------------------------------------------
totalMethodsQty_sum | assignmentsQty_sum | 0.99
totalMethodsQty_sum: -0.118
assignmentsQty_sum: -0.129
dropped: assignmentsQty_sum
-----------------------------------------------------------------------------
totalMethodsQty_sum | parenthesizedExpsQty_sum | 0.91
totalMethodsQty_sum: -0.118
parenthesizedExpsQty_sum: -0.132
dropped: parenthesizedExpsQty_sum
-----------------------------------------------------------------------------
totalMethodsQty_sum | cbo_sum | 0.99
totalMethodsQty_sum: -0.118
cbo_sum: -0.116
dropped: totalMethodsQty_sum
-----------------------------------------------------------------------------
totalMethodsQty_sum | publicMethodsQty_sum | 1.0
totalMethodsQty_sum: -0.118
publicMethodsQty_sum: -0.118
dropped: publicMethodsQty_sum
----------------------------------------

Unnamed: 0,publicFieldsQty_stdev,totalFieldsQty_max,lambdasQty_sum,anonymousClassesQty_max,wmc_median,wmc_max,protectedMethodsQty_stdev,protectedMethodsQty_max,defaultMethodsQty_sum,defaultMethodsQty_stdev,...,numbersQty_stdev,numbersQty_max,rfc_stdev,rfc_max,protectedFieldsQty_stdev,protectedFieldsQty_max,innerClassesQty_stdev,comparisonsQty_average,comparisonsQty_max,maven_reuse
0,2.673906,5.0,910.0,36.0,2.0,359.0,0.944325,10.0,1663.0,2.68547,...,17.487237,244.0,19.193465,217.0,3.343955,45.0,2.217386,2.964761,143.0,258.0
2,0.845867,5.0,3968.0,27.0,3.0,190.0,1.176706,35.0,595.0,3.251624,...,13.535341,421.0,15.80162,444.0,1.10784,22.0,0.368349,0.196324,31.0,1694.0
4,5.347589,5.0,0.0,7.0,1.0,591.0,0.884498,23.0,848.0,2.107131,...,30.820201,534.0,18.49535,230.0,2.027167,41.0,12.969737,3.193182,224.0,4109.0
6,2.993386,2.0,0.0,10.0,8.0,262.0,0.955287,7.0,51.0,1.015517,...,902.691913,10916.0,12.089748,56.0,3.207932,19.0,0.51709,2.726667,62.0,3003.0
10,1.217302,0.0,0.0,11.0,2.0,1308.0,0.412686,12.0,285.0,0.809673,...,16.396929,713.0,7.828317,241.0,7.635811,300.0,0.90319,0.885872,599.0,345.0
12,2.011677,8.0,0.0,333.0,2.0,1401.0,1.009771,23.0,3514.0,2.013364,...,270.229818,20020.0,15.521188,365.0,1.208898,17.0,1.989889,0.459386,95.0,5796.0
14,0.516232,0.0,0.0,3.0,4.0,48.0,0.723438,5.0,26.0,0.564186,...,6.19559,42.0,6.174382,36.0,1.299454,8.0,0.864961,0.361607,24.0,3616.0
15,0.516232,0.0,0.0,3.0,4.0,48.0,0.723438,5.0,26.0,0.564186,...,6.19559,42.0,6.174382,36.0,1.299454,8.0,0.864961,0.361607,24.0,4555.0
16,1.157858,4.0,0.0,177.0,4.0,868.0,2.210346,124.0,1836.0,3.990683,...,13.173941,609.0,21.725161,689.0,3.078893,93.0,0.78312,1.529737,363.0,229.0
19,0.867497,0.0,0.0,0.0,5.0,94.0,0.550201,2.0,4.0,0.571429,...,3.004815,15.0,11.55054,44.0,1.419914,9.0,0.961858,5.265306,51.0,1501.0


In [None]:
# Build a pandas-profiling report for `matrix`. The full report can include scatter
# plots, correlation matrices, highly correlated features, features with only unique
# values, features with zeros, etc.
#
# The full report was observed to take very long and appear stuck on this frame, so
# it is built in pandas-profiling's minimal mode, which turns off the most expensive
# computations. Set PROFILE_MINIMAL = False to request the full report instead
# (expect a long run time).

import pandas_profiling
from pandas_profiling import ProfileReport

PROFILE_MINIMAL = True

profile = ProfileReport(matrix, minimal=PROFILE_MINIMAL)
profile.to_widgets()



