# Loading data

In [1]:
import pandas as pd

## Loading metrics

In [2]:
# Load the sample-list CSV of each ecosystem.
# Forward slashes are used on purpose: they resolve on Windows as well as on
# Linux/macOS, whereas backslash-separated relative paths only work on Windows.
androidListSamples = pd.read_csv("../1-GettingQuestions/samplesWithQuestions/androidSamples.csv")
awsListSamples = pd.read_csv("../1-GettingQuestions/samplesWithQuestions/awsSamples.csv")
azureListSamples = pd.read_csv("../1-GettingQuestions/samplesWithQuestions/azureSamples.csv")
springListSamples = pd.read_csv("../1-GettingQuestions/samplesWithQuestions/springSamples.csv")

In [3]:
# Number of rows in the Android sample list.
androidListSamples.shape[0]

84

In [4]:
# Number of rows in the AWS sample list.
awsListSamples.shape[0]

8

In [5]:
# Number of rows in the Azure sample list.
azureListSamples.shape[0]

31

In [6]:
# Number of rows in the Spring sample list.
springListSamples.shape[0]

36

In [7]:
# Total number of rows across the four sample lists.
sum(
    len(sampleList)
    for sampleList in (androidListSamples, awsListSamples, azureListSamples, springListSamples)
)

159

### 159 Code Samples

In [8]:
def dealWithData(listSamples, metricsDir="../2-ExtractingMetrics/metrics/"):
    """Load and combine the metrics CSV of every sample in ``listSamples``.

    For each value in ``listSamples["path"]`` the file
    ``metricsDir + path + ".csv"`` is read. The frames are concatenated, the
    ``Kind`` and ``Name`` columns are dropped, ``commitDate`` is parsed to a
    datetime and the rows are sorted by ``commitDate``. The shape of the
    result is printed before it is returned.

    Parameters
    ----------
    listSamples : pandas.DataFrame
        Must contain a ``path`` column using ``/`` as separator. The frame is
        not modified.
    metricsDir : str, optional
        Directory prefix (including the trailing separator) that holds the
        per-sample metrics files. Forward slashes are used so the same path
        resolves on Windows, Linux and macOS.

    Returns
    -------
    pandas.DataFrame
        All metrics rows, sorted by ``commitDate``. The index is not reset
        after sorting.

    Raises
    ------
    ValueError
        If ``listSamples`` has no rows, so there is nothing to load.
    KeyError
        If a required column (``path``, ``Kind``, ``Name``, ``commitDate``)
        is missing.
    """
    # The sample paths are used as-is: "/" is accepted as a separator on every
    # platform, so no Windows-specific copy of the column has to be written
    # back into the caller's frame.
    allSamples = [
        pd.read_csv(metricsDir + sample + ".csv", parse_dates=True)
        for sample in listSamples["path"]
    ]
    if not allSamples:
        raise ValueError("listSamples has no rows; there are no metrics files to load")

    allSamplesMetrics = pd.concat(allSamples, axis=0, ignore_index=True)
    allSamplesMetrics = allSamplesMetrics.drop(columns=["Kind", "Name"])

    # The last six characters are cut off before parsing; presumably they are a
    # UTC offset such as "+02:00" (TODO confirm), which makes the parsed
    # timestamps timezone-naive.
    allSamplesMetrics["commitDate"] = pd.to_datetime(
        allSamplesMetrics["commitDate"].astype(str).str[:-6]
    )

    allSamplesMetrics = allSamplesMetrics.sort_values(by="commitDate")

    print(allSamplesMetrics.shape)

    return allSamplesMetrics

In [9]:
# Build the metrics frame of each ecosystem, in the same order as before
# (every call prints the shape of the frame it returns).
androidSamples, awsSamples, azureSamples, springSamples = [
    dealWithData(sampleList)
    for sampleList in (androidListSamples, awsListSamples, azureListSamples, springListSamples)
]

(4710, 64)
(241, 64)
(933, 64)
(6236, 64)


In [10]:
# Total number of metrics rows across the four ecosystem frames.
sum(
    frame.shape[0]
    for frame in (androidSamples, awsSamples, azureSamples, springSamples)
)

12120

12120 commits in total (Android + AWS + Azure + Spring)

In [11]:
# Stack the four ecosystem frames row-wise (original row labels are kept)
# and show the shape of the combined frame.
allMetrics = pd.concat(
    objs=[androidSamples, awsSamples, azureSamples, springSamples],
    axis="index",
)
allMetrics.shape

(12120, 64)

In [12]:
# Empty summary table with one row per column that allMetrics.min() reports.
# NOTE(review): this index is captured before the following cell deletes 15
# metric columns from allMetrics, so those metrics remain as rows in stats
# even though no statistics are computed for them — confirm this is intended.
stats = pd.DataFrame(index = allMetrics.min().index)

In [13]:
# Remove the metric columns that are excluded from the statistics below.
allMetrics = allMetrics.drop(
    columns=[
        "CountDeclExecutableUnit",
        "CountDeclFile",
        "CountDeclFunction",
        "CountInput",
        "CountOutput",
        "CountPath",
        "CountPathLog",
        "Cyclomatic",
        "CyclomaticModified",
        "CyclomaticStrict",
        "Essential",
        "Knots",
        "MinEssentialKnots",
        "MaxEssentialKnots",
        "CountDeclClass",
    ]
)

Total lines of code (LOC), summed over all commits

In [14]:
# Sum of the CountLineCode column over every row of allMetrics.
allMetrics.loc[:, "CountLineCode"].sum()

16997958.0

### Min

In [15]:
# Per-column minimum: computed once, printed without row/column truncation,
# then stored as the "min" column of stats.
metricMin = allMetrics.min()
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(metricMin)
stats["min"] = metricMin

AvgCyclomatic                                                           1
AvgCyclomaticModified                                                   1
AvgCyclomaticStrict                                                     1
AvgEssential                                                            1
AvgLine                                                                 3
AvgLineBlank                                                            0
AvgLineCode                                                             3
AvgLineComment                                                          0
CountClassBase                                                          1
CountClassCoupled                                                       1
CountClassCoupledModified                                               0
CountClassDerived                                                       0
CountDeclClassMethod                                                    0
CountDeclClassVariable                

### Max

In [16]:
# Per-column maximum: computed once, printed without row/column truncation,
# then stored as the "max" column of stats.
metricMax = allMetrics.max()
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(metricMax)
stats["max"] = metricMax

AvgCyclomatic                                                         217
AvgCyclomaticModified                                                 199
AvgCyclomaticStrict                                                   232
AvgEssential                                                          141
AvgLine                                                              1400
AvgLineBlank                                                          253
AvgLineCode                                                          1240
AvgLineComment                                                        198
CountClassBase                                                        338
CountClassCoupled                                                    1422
CountClassCoupledModified                                             757
CountClassDerived                                                       8
CountDeclClassMethod                                                  186
CountDeclClassVariable                

### Average

In [17]:
# Per-column mean, computed once and reused for both the printout and stats.
# numeric_only=True is required because allMetrics still contains non-numeric
# columns (commitSha, commitDate): pandas >= 2.0 raises a TypeError for them
# instead of silently skipping them as older versions did.
metricMean = allMetrics.mean(numeric_only=True)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(metricMean)
stats["avg"] = metricMean

AvgCyclomatic                      22.845050
AvgCyclomaticModified              21.332673
AvgCyclomaticStrict                24.251815
AvgEssential                       15.640017
AvgLine                           168.752145
AvgLineBlank                        7.202723
AvgLineCode                       136.035479
AvgLineComment                     20.149010
CountClassBase                     20.009076
CountClassCoupled                  88.217409
CountClassCoupledModified          28.011634
CountClassDerived                   0.168977
CountDeclClassMethod               13.060809
CountDeclClassVariable             38.460479
CountDeclInstanceMethod            83.384901
CountDeclInstanceVariable          36.791172
CountDeclMethod                    96.445710
CountDeclMethodAll                298.328465
CountDeclMethodDefault              1.308581
CountDeclMethodPrivate             28.427805
CountDeclMethodProtected            8.604868
CountDeclMethodPublic              58.104455
CountLine 

### Median

In [18]:
# Per-column median, computed once and reused for both the printout and stats.
# numeric_only=True is required because allMetrics still contains non-numeric
# columns (commitSha, commitDate): pandas >= 2.0 raises a TypeError for them
# instead of silently skipping them as older versions did.
metricMedian = allMetrics.median(numeric_only=True)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(metricMedian)
stats["med"] = metricMedian

AvgCyclomatic                      6.000000
AvgCyclomaticModified              6.000000
AvgCyclomaticStrict                6.000000
AvgEssential                       5.000000
AvgLine                           39.000000
AvgLineBlank                       3.000000
AvgLineCode                       32.000000
AvgLineComment                     3.000000
CountClassBase                     5.000000
CountClassCoupled                 18.000000
CountClassCoupledModified          3.000000
CountClassDerived                  0.000000
CountDeclClassMethod               2.000000
CountDeclClassVariable             1.000000
CountDeclInstanceMethod           11.000000
CountDeclInstanceVariable          5.000000
CountDeclMethod                   14.000000
CountDeclMethodAll                71.000000
CountDeclMethodDefault             0.000000
CountDeclMethodPrivate             0.000000
CountDeclMethodProtected           0.000000
CountDeclMethodPublic             12.000000
CountLine                       

### Standard Deviation

In [19]:
# Per-column standard deviation, computed once and reused for both the
# printout and stats.
# numeric_only=True is required because allMetrics still contains non-numeric
# columns (commitSha, commitDate): pandas >= 2.0 raises a TypeError for them
# instead of silently skipping them as older versions did.
metricStd = allMetrics.std(numeric_only=True)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(metricStd)
stats["std"] = metricStd

AvgCyclomatic                      41.499733
AvgCyclomaticModified              37.754107
AvgCyclomaticStrict                44.321338
AvgEssential                       26.072260
AvgLine                           280.875646
AvgLineBlank                       13.535609
AvgLineCode                       241.147237
AvgLineComment                     29.777397
CountClassBase                     34.300814
CountClassCoupled                 177.222195
CountClassCoupledModified          64.909594
CountClassDerived                   0.764223
CountDeclClassMethod               25.575644
CountDeclClassVariable             91.787264
CountDeclInstanceMethod           178.094757
CountDeclInstanceVariable          78.967557
CountDeclMethod                   199.088959
CountDeclMethodAll                549.421654
CountDeclMethodDefault              6.151979
CountDeclMethodPrivate             73.762996
CountDeclMethodProtected           19.332048
CountDeclMethodPublic             109.780462
CountLine 

In [20]:
# Persist the summary table next to the notebook.
stats.to_csv(path_or_buf="stats.csv")

In [21]:
# Display the summary table (min, max, avg, med, std per metric).
stats

Unnamed: 0,min,max,avg,med,std
AvgCyclomatic,1,217,22.845050,6.000000,41.499733
AvgCyclomaticModified,1,199,21.332673,6.000000,37.754107
AvgCyclomaticStrict,1,232,24.251815,6.000000,44.321338
AvgEssential,1,141,15.640017,5.000000,26.072260
AvgLine,3,1400,168.752145,39.000000,280.875646
...,...,...,...,...,...
SumEssential,1,1523,117.062789,15.000000,262.069642
numberJavaFiles,1,343,16.911386,5.000000,27.813916
commitSha,000174e5c9e05d119930b2a13db5aefb837209ea,fffc58dc637c031b38ea441e4df33156fc6afb97,,,
commitDate,2013-04-15 08:50:04,2020-09-30 10:59:04,,,


Questions

In [22]:
# Use commitDate as the row index; set_index removes the column itself,
# so no separate deletion is needed afterwards.
allMetrics = allMetrics.set_index("commitDate")

In [23]:
# Average every metric per (year, month) of the commit date.
# numeric_only=True keeps the aggregation working on pandas >= 2.0, where the
# non-numeric commitSha column would otherwise raise a TypeError instead of
# being dropped silently.
allMetrics = allMetrics.groupby(
    [allMetrics.index.year, allMetrics.index.month]
).mean(numeric_only=True)

## Loading SO questions

In [24]:
# Load the question CSV of each ecosystem.
# Forward slashes are used on purpose: they resolve on Windows as well as on
# Linux/macOS, whereas backslash-separated relative paths only work on Windows.
androidQuestions = pd.read_csv("../1-GettingQuestions/questions/androidQuestions.csv")
awsQuestions = pd.read_csv("../1-GettingQuestions/questions/awsQuestions.csv")
azureQuestions = pd.read_csv("../1-GettingQuestions/questions/azureQuestions.csv")
springQuestions = pd.read_csv("../1-GettingQuestions/questions/springQuestions.csv")

In [25]:
# Number of rows in the Android questions frame.
androidQuestions.shape[0]

1083

In [26]:
# Number of rows in the AWS questions frame.
awsQuestions.shape[0]

14

In [27]:
# Number of rows in the Azure questions frame.
azureQuestions.shape[0]

64

In [28]:
# Number of rows in the Spring questions frame.
springQuestions.shape[0]

136

In [29]:
# Stack the four question frames row-wise and renumber the rows from 0.
allQuestions = pd.concat(
    objs=[androidQuestions, awsQuestions, azureQuestions, springQuestions],
    axis="index",
    ignore_index=True,
)

In [30]:
# Number of rows in the combined questions frame.
allQuestions.shape[0]

1297

In [16]:
# Aggregate the questions per (year, month) of their creation date.
allQuestions["creationDate"] = pd.to_datetime(allQuestions["creationDate"])
# drop=False keeps creationDate as a column as well as the index.
allQuestions = allQuestions.sort_values(by="creationDate").set_index("creationDate", drop=False)
# Marker column: summing it per group gives the number of rows (presumably one
# row per question — confirm against the question CSVs) in that month.
allQuestions["questions"] = 1
# numeric_only=True restricts the sum to numeric columns. On pandas >= 2.0 the
# default would also try to sum text and datetime columns, which either
# concatenates strings or raises a TypeError.
allQuestions = allQuestions.groupby(
    [allQuestions.index.year, allQuestions.index.month]
).sum(numeric_only=True)

In [17]:
# Discard the identifier-like columns, whose monthly sums are not used.
allQuestions = allQuestions.drop(
    columns=["ownerUserId", "postTypeId", "acceptedAnswerId", "id"]
)

## Merging the data

In [18]:
# Put the monthly metrics and the monthly question counts side by side,
# aligned on their (year, month) index.
metricsAndQuestions = pd.concat([allMetrics, allQuestions], axis=1)
# Months without any question get a count of 0. The column is assigned back
# explicitly: calling fillna(inplace=True) on the selected column is chained
# assignment, which no longer updates the parent frame under pandas
# Copy-on-Write.
metricsAndQuestions["questions"] = metricsAndQuestions["questions"].fillna(0)
# Remaining gaps are filled with the value of the preceding row.
# .ffill() replaces fillna(method="ffill"), which is deprecated in pandas 2.1+.
metricsAndQuestions = metricsAndQuestions.ffill()

## Saving

In [19]:
# Persist the merged monthly table next to the notebook.
metricsAndQuestions.to_csv(path_or_buf="metricsAndQuestions.csv")

In [54]:
# Read the saved file back, using its first two columns as the row index,
# to check what was written.
pd.read_csv(filepath_or_buffer="metricsAndQuestions.csv", index_col=[0, 1])

Unnamed: 0,Unnamed: 1,AvgCyclomatic,AvgCyclomaticModified,AvgCyclomaticStrict,AvgEssential,AvgLine,AvgLineBlank,AvgLineCode,AvgLineComment,CountClassBase,CountClassCoupled,...,PercentLackOfCohesion,PercentLackOfCohesionModified,RatioCommentToCode,SumCyclomatic,SumCyclomaticModified,SumCyclomaticStrict,SumEssential,numberJavaFiles,readability,questions
2013,4,5.657895,5.657895,5.657895,4.026316,22.763158,1.236842,21.315789,0.289474,5.368421,16.026316,...,11.289474,11.289474,0.153421,9.078947,9.078947,9.078947,7.394737,5.368421,0.315649,0.0
2013,5,3.824427,3.824427,3.824427,3.267176,24.717557,2.374046,21.389313,0.969466,3.801527,11.145038,...,26.885496,26.885496,0.173206,7.267176,7.267176,7.267176,6.213740,3.740458,0.252481,0.0
2013,6,3.352273,3.352273,3.352273,3.011364,19.227273,1.693182,17.079545,0.477273,3.125000,9.147727,...,24.272727,24.272727,0.063864,6.784091,6.784091,6.784091,5.965909,3.068182,0.288942,0.0
2013,7,3.005089,3.005089,3.061069,2.783715,16.763359,1.170483,15.129771,0.358779,2.885496,9.569975,...,36.842239,36.842239,0.051552,7.254453,7.254453,7.338422,6.234097,3.027990,0.269503,0.0
2013,8,3.187586,3.187586,3.281379,2.924138,17.427586,1.297931,15.713103,0.376552,3.030345,9.612414,...,37.805517,37.722759,0.050717,7.296552,7.296552,7.437241,6.281379,3.295172,0.289438,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020,5,44.128205,41.102564,46.897436,28.897436,303.384615,11.871795,255.538462,25.974359,34.179487,193.384615,...,833.846154,681.538462,4.148205,429.076923,375.794872,457.666667,257.076923,41.461538,0.295245,8.0
2020,6,117.100000,107.666667,125.233333,76.833333,784.166667,20.900000,676.733333,56.300000,95.000000,516.200000,...,2539.433333,2045.533333,11.254667,1313.500000,1142.433333,1400.800000,777.733333,78.500000,0.047969,4.0
2020,7,50.307692,46.576923,53.730769,34.346154,345.961538,12.000000,294.576923,26.884615,41.423077,216.500000,...,1071.923077,879.500000,5.023077,529.346154,462.923077,564.076923,318.961538,35.846154,0.188474,8.0
2020,8,15.264151,14.547170,16.094340,11.622642,107.943396,5.641509,91.452830,7.754717,13.830189,58.094340,...,310.566038,254.188679,1.428491,132.377358,119.433962,139.698113,84.471698,13.207547,0.297849,8.0
