In [2]:
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# Read the training CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='data/train.csv')

In [4]:
# First five rows, transposed so every column is visible as a row.
df.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
id,0,1,2,3,4
loc,22.0,14.0,11.0,8.0,11.0
v(g),3.0,2.0,2.0,1.0,2.0
ev(g),1.0,1.0,1.0,1.0,1.0
iv(g),2.0,2.0,2.0,1.0,2.0
n,60.0,32.0,45.0,23.0,17.0
v,278.63,151.27,197.65,94.01,60.94
l,0.06,0.14,0.11,0.19,0.18
d,19.56,7.0,8.05,5.25,5.63
i,14.25,21.11,22.76,17.86,12.44


In [None]:
# Human-readable aliases for the raw metric columns of the training CSV.
# Keys are the original column headers; every value is a unique snake_case
# identifier so it can be used directly as a DataFrame column name.
names = {
    'loc': 'line_of_code',
    # McCabe complexity family -- assumed from the conventional
    # v(g) / ev(g) / iv(g) notation; TODO confirm against the dataset card.
    'v(g)': 'mccabe_cyclomatic_complexity',
    'ev(g)': 'mccabe_essential_complexity',
    'iv(g)': 'mccabe_design_complexity',
    # n equals total_Op + total_Opnd in the sample rows.
    'n': 'halstead_total_operators_operands',
    'v': 'halstead_volume',
    # 'l' stays within [0, 1] and tracks 1 / d in the sample rows, which is
    # Halstead's program *level*; program length is 'n' above.
    'l': 'halstead_program_level',
    'd': 'halstead_difficulty',
    'i': 'halstead_intelligence',
    'e': 'halstead_effort',
    # Presumably Halstead's delivered-bugs estimate -- TODO confirm.
    'b': 'halstead_bug_estimate',
    't': 'halstead_time',
    'lOCode': 'halstead_line_of_code',
    'lOComment': 'halstead_line_of_comments',
    'lOBlank': 'halstead_blank_lines',
    'locCodeAndComment': 'line_of_code_and_comment',
    'uniq_Op': 'unique_operators',
    'uniq_Opnd': 'unique_operands',
    'total_Op': 'total_operators',
    'total_Opnd': 'total_operands',
    'branchCount': 'branch_count',
}

In [7]:
# Rows whose 'loc' value differs from 'lOCode', showing just those two columns.
df.loc[df['loc'].ne(df['lOCode']), ['loc', 'lOCode']]

Unnamed: 0,loc,lOCode
0,22.0,17
1,14.0,11
2,11.0,8
3,8.0,4
4,11.0,7
...,...,...
101758,10.0,7
101759,27.0,23
101760,26.0,20
101761,10.0,7


In [9]:
# Rows where lOCode + lOComment does not equal locCodeAndComment,
# restricted to the three columns involved in the comparison.
df.loc[
    (df['lOCode'] + df['lOComment']).ne(df['locCodeAndComment']),
    ['lOCode', 'lOComment', 'locCodeAndComment'],
]

Unnamed: 0,lOCode,lOComment,locCodeAndComment
0,17,1,0
1,11,0,0
2,8,0,0
3,4,0,0
4,7,0,0
...,...,...,...
101758,7,0,0
101759,23,0,0
101760,20,0,0
101761,7,0,0


In [10]:
# Frequency of each distinct locCodeAndComment value, most common first.
(
    df['locCodeAndComment']
    .value_counts()
)

locCodeAndComment
0     93550
1      4221
2      1632
3       843
4       507
5       250
6       235
7       133
8       116
10       63
11       55
9        54
12       28
13       23
14       13
24        7
15        7
20        4
28        3
23        3
38        3
22        3
16        2
18        2
17        2
43        1
19        1
26        1
32        1
Name: count, dtype: int64

In [7]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,id,loc,v(g),ev(g),iv(g),n,v,l,d,i,...,t,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount
count,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,...,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0
mean,50881.0,37.34716,5.492684,2.845022,3.498826,96.655995,538.280956,0.111634,13.681881,27.573007,...,1141.357982,22.802453,1.773945,3.979865,0.196604,11.896131,15.596671,57.628116,39.249698,9.839549
std,29376.592059,54.600401,7.900855,4.631262,5.534541,171.147191,1270.791601,0.100096,14.121306,22.856742,...,9862.795472,38.54101,5.902412,6.382358,0.998906,6.749549,18.064261,104.53766,71.692309,14.412769
min,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,25440.5,13.0,2.0,1.0,1.0,25.0,97.67,0.05,5.6,15.56,...,31.38,7.0,0.0,1.0,0.0,8.0,7.0,15.0,10.0,3.0
50%,50881.0,22.0,3.0,1.0,2.0,51.0,232.79,0.09,9.82,23.36,...,125.4,14.0,0.0,2.0,0.0,11.0,12.0,30.0,20.0,5.0
75%,76321.5,42.0,6.0,3.0,4.0,111.0,560.25,0.15,18.0,34.34,...,565.92,26.0,1.0,5.0,0.0,16.0,20.0,66.0,45.0,11.0
max,101762.0,3442.0,404.0,165.0,402.0,8441.0,80843.08,1.0,418.2,569.78,...,935923.39,2824.0,344.0,219.0,43.0,410.0,1026.0,5420.0,3021.0,503.0


In [8]:
# Class balance of the 'defects' column.
(
    df['defects']
    .value_counts()
)

defects
False    78699
True     23064
Name: count, dtype: int64

In [9]:
# Canonicalise the column labels: lowercase, with spaces turned into underscores.
df.columns = (
    df.columns
    .str.lower()
    .str.replace(' ', '_')
)

# Apply the same canonicalisation to the values of every object-dtype column.
string_columns = [label for label, dtype in df.dtypes.items() if dtype == 'object']
for column in string_columns:
    lowered = df[column].str.lower()
    df[column] = lowered.str.replace(' ', '_')

In [11]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,id,loc,v(g),ev(g),iv(g),n,v,l,d,i,...,locode,locomment,loblank,loccodeandcomment,uniq_op,uniq_opnd,total_op,total_opnd,branchcount,defects
0,0,22.0,3.0,1.0,2.0,60.0,278.63,0.06,19.56,14.25,...,17,1,1,0,16.0,9.0,38.0,22.0,5.0,False
1,1,14.0,2.0,1.0,2.0,32.0,151.27,0.14,7.0,21.11,...,11,0,1,0,11.0,11.0,18.0,14.0,3.0,False
2,2,11.0,2.0,1.0,2.0,45.0,197.65,0.11,8.05,22.76,...,8,0,1,0,12.0,11.0,28.0,17.0,3.0,False
3,3,8.0,1.0,1.0,1.0,23.0,94.01,0.19,5.25,17.86,...,4,0,2,0,8.0,6.0,16.0,7.0,1.0,True
4,4,11.0,2.0,1.0,2.0,17.0,60.94,0.18,5.63,12.44,...,7,0,2,0,7.0,6.0,10.0,10.0,3.0,False


In [12]:
# Show the dtype of every column.
df.dtypes

id                     int64
loc                  float64
v(g)                 float64
ev(g)                float64
iv(g)                float64
n                    float64
v                    float64
l                    float64
d                    float64
i                    float64
e                    float64
b                    float64
t                    float64
locode                 int64
locomment              int64
loblank                int64
loccodeandcomment      int64
uniq_op              float64
uniq_opnd            float64
total_op             float64
total_opnd           float64
branchcount          float64
defects                 bool
dtype: object

In [13]:
# Number of distinct non-null values in each column.
df.nunique(axis=0, dropna=True)

id                   101763
loc                     378
v(g)                    106
ev(g)                    71
iv(g)                    84
n                       836
v                      4515
l                        55
d                      3360
i                      5171
e                      8729
b                       315
t                      8608
locode                  298
locomment                91
loblank                  94
loccodeandcomment        29
uniq_op                  70
uniq_opnd               176
total_op                623
total_opnd              485
branchcount             144
defects                   2
dtype: int64

In [16]:
# Correlation of every column with the 'defects' column, strongest positive first.
(
    df
    .corrwith(df['defects'])
    .sort_values(ascending=False)
)

defects              1.000000
loc                  0.342642
branchcount          0.322827
v(g)                 0.301187
ev(g)                0.259928
n                    0.258080
loblank              0.257819
total_opnd           0.252752
locode               0.250604
total_op             0.250533
uniq_opnd            0.246113
iv(g)                0.245618
d                    0.241936
b                    0.232594
v                    0.231179
i                    0.208577
locomment            0.205402
uniq_op              0.178474
loccodeandcomment    0.133150
t                    0.099592
e                    0.095366
id                   0.001976
l                   -0.253237
dtype: float64