In [None]:
import pandas as pd 
import os
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE 
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from imblearn.metrics import geometric_mean_score
from imblearn.metrics import specificity_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Folder scanned for the train/test CSV files. EDIT it to your data path.
data_path = './validation_data'

# Edit the three mappings below to include your own projects and features.
# Dataset family -> project names; a file belongs to a family when one of
# these names occurs in the project part of its file name.
projects = {
    "ambros": ["mylyn", "pde"],
    "eclipse": ["eclipse"],
    "ck": ["camel", "poi", "prop", "synapse", "xalan", "xerces", "lucene"],
}

# Dataset family -> columns used as model inputs.
projects_features = {
    "ambros": [
        "numberOfVersionsUntil:",
        "numberOfFixesUntil:",
        "numberOfRefactoringsUntil:",
        "numberOfAuthorsUntil:",
        "linesAddedUntil:",
        "maxLinesAddedUntil:",
        "avgLinesAddedUntil:",
        "linesRemovedUntil:",
        "maxLinesRemovedUntil:",
        "avgLinesRemovedUntil:",
        "codeChurnUntil:",
        "maxCodeChurnUntil:",
        "avgCodeChurnUntil:",
        "ageWithRespectTo:",
        "weightedAgeWithRespectTo:",
    ],
    "ck": [
        "wmc", "dit", "noc", "cbo", "rfc",
        "lcom", "ca", "ce", "npm", "lcom3",
        "loc", "dam", "moa", "mfa", "cam",
        "ic", "cbm", "amc", "max_cc", "avg_cc",
    ],
    "eclipse": [
        "pre", "ACD",
        "FOUT_avg", "FOUT_max", "FOUT_sum",
        "MLOC_avg", "MLOC_max", "MLOC_sum",
        "NBD_avg", "NBD_max", "NBD_sum",
        "NOF_avg", "NOF_max", "NOF_sum",
        "NOI",
        "NOM_avg", "NOM_max", "NOM_sum",
        "NOT",
        "NSF_avg", "NSF_max", "NSF_sum",
        "NSM_avg", "NSM_max", "NSM_sum",
        "PAR_avg", "PAR_max", "PAR_sum",
        "TLOC",
        "VG_avg", "VG_max", "VG_sum",
    ],
}

# Dataset family -> name of the column holding the label to predict.
outcome = {
    "ck": "bug",
    "ambros": "bugs",
    "eclipse": "post",
}

# Edit these switches to enable/disable SMOTE oversampling and
# hyper-parameter tuning.
apply_smote = True
apply_parameter_tunning = True

In [None]:
# Classifiers under comparison, keyed by the short label that ends up in the
# "algorithm" column of the results.
models = {
    "DT": DecisionTreeClassifier(),
    "NB": GaussianNB(),
    "LR": LogisticRegression(max_iter=10000),
    "SVM": SVC(),
    "RF": RandomForestClassifier(),
}

# Grid-search spaces, keyed like `models`. Labels without an entry here
# (NB, LR) are fitted with the constructor arguments given above.
tune_params = {
    "DT": {
        "ccp_alpha": [0.0001, 0.001, 0.01, 0.1, 0.5],
        "max_depth": [5, 7, 10, 15],
    },
    "RF": {
        "n_estimators": [10, 30, 50, 80, 100, 150, 200, 250, 300, 350, 400, 450, 500],
        "max_depth": [5, 7, 10, 15],
    },
    "SVM": {
        "gamma": [0.1, 0.3, 0.5, 0.7, 0.9],
        "C": [0.25, 0.5, 1, 2, 4],
    },
}

In [None]:
# Empty results table; the column list fixes the layout of the exported sheet.
df_results = pd.DataFrame(
    columns=[
        "algorithm",
        "model_id",
        "file_id",
        "train_or_test",
        "project_name",
        "f1",
        "tpr",
        "tnr",
        "G",
        "precision",
    ]
)

In [None]:
# Main loop: for every training CSV found in `data_path`, fit each classifier
# (optionally on SMOTE-resampled data and with grid-searched hyper-parameters),
# score it on the train and test splits, and export all scores to Excel.


def _resolve_project_family(project_name):
    """Return the first key of `projects` that has a name occurring in `project_name`.

    Raises:
        ValueError: if none of the configured names is a substring of
            `project_name`, instead of failing later on a lookup with "".
    """
    for family, members in projects.items():
        if any(member in project_name for member in members):
            return family
    raise ValueError(f"no entry in `projects` matches project name {project_name!r}")


def _score(y_true, y_pred):
    """Return the reported metrics for one split as a dict keyed by result column.

    "tpr" is recall_score and "tnr" is specificity_score.
    """
    return {
        "f1": f1_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "tpr": recall_score(y_true, y_pred),
        "G": geometric_mean_score(y_true, y_pred),
        "tnr": specificity_score(y_true, y_pred),
    }


# One dict per (file, algorithm, split). Collected in a list and turned into a
# DataFrame once: DataFrame.append no longer exists in pandas >= 2.0 and copied
# the whole frame on every call.
records = []

# Sorted so that the index `i`, which seeds SMOTE below, does not depend on the
# arbitrary order in which os.listdir returns directory entries.
for i, file_name in enumerate(sorted(os.listdir(data_path))):
    if "train" not in file_name:
        continue
    print("working on : ", file_name)

    train_data = pd.read_csv(os.path.join(data_path, file_name))
    # The test split is read from the same name with "train" replaced by "test".
    test_data = pd.read_csv(os.path.join(data_path, file_name.replace("train", "test")))

    # Project name is the part of the file name before the first underscore.
    project_name = file_name.replace(".csv", "").split("_")[0]
    project_id = _resolve_project_family(project_name)
    features = projects_features[project_id]
    output_variable = outcome[project_id]

    # Plain ndarrays everywhere, so fit and predict see the same input type.
    X_train = train_data.loc[:, features].values
    y_train = train_data.loc[:, output_variable].values
    X_test = test_data.loc[:, features].values
    y_test = test_data.loc[:, output_variable].values

    # The estimators in `models` are reused across files: fit() retrains from
    # scratch and GridSearchCV works on clones of the estimator it is given.
    for j, model_id in enumerate(models):
        print(model_id)

        if apply_smote:
            # Only the training data is oversampled; scores below are computed
            # on the original (non-resampled) train and test splits.
            sm = SMOTE(random_state=i * (j + 1))
            X_res, y_res = sm.fit_resample(X_train, y_train)
        else:
            X_res, y_res = X_train, y_train

        model = models[model_id]
        if apply_parameter_tunning and model_id in tune_params:
            search = GridSearchCV(
                estimator=model,
                param_grid=tune_params[model_id],
                cv=3,
                verbose=2,
                n_jobs=-1,
            )
            search.fit(X_res, y_res)
            # With the default refit=True the best estimator has already been
            # retrained on all of (X_res, y_res); fitting it again is redundant.
            model = search.best_estimator_
        else:
            model.fit(X_res, y_res)

        splits = (
            ("train", y_train, model.predict(X_train)),
            ("test", y_test, model.predict(X_test)),
        )
        for split_name, y_true, y_pred in splits:
            records.append(
                {
                    "algorithm": model_id,
                    "model_id": "best_model_performance",
                    "file_id": file_name,
                    "train_or_test": split_name,
                    "project_name": project_name,
                    **_score(y_true, y_pred),
                }
            )

# Rebuild the table from this run's records (same column layout as the empty
# frame created earlier), so re-running the cell does not duplicate rows.
df_results = pd.DataFrame(records, columns=df_results.columns)
df_results.to_excel("results.xlsx", index=False)
      
    
