In [2]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import KMeansSMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import f1_score, recall_score, precision_score, classification_report, make_scorer
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler

Drop unnecessary columns in the dataset.

In [21]:
from sklearn.svm import SVC, LinearSVC
def clean_dataframe(df):

    df.drop(columns=['name', 'followers', 'commit_count_a', 'source', 'job', 'name_without_spaces',
                     'project', 'index'], inplace=True)

    df.drop(columns=['AddSM','DelSM','ChurnSM',"SumAddDelSM"], inplace=True)
    
    for column in  df.columns:
        if "SumAddDel" in column:
            df.drop(columns=[column], inplace=True)


    df['DiP'] = df['DiP'].round()
    df['DiP'].replace(0, 1, inplace=True)


Log columns in the dataset to reduce the skewness in data.

In [22]:
def log_dataframe(df):

    columns_4_log = ['SumAddDelLOC', 'DiP', 'NoC', 'SumAddDelF',
                     'SumAddDelSAM', 'AddLOC', 'DelLOC', 'AddSAM', 'DelSAM']


    for column in columns_4_log:
        df[column] = np.log(df[column] + 1)

Returns labels (y)

In [5]:
def get_labels(df):
    df.loc[df['job'] == "SA", 'job'] = "SSE"
    df.loc[df['job'] != "SSE", 'job'] = "NSSE"

    return df["job"]

Scales data according to the scaler given as input

In [6]:
def scaling(scaler, X):
    return scaler.fit_transform(X)

Train the classifier with synthetic data an create a classification report on original data

In [7]:
def train_and_classification_report(classifier, X_synthetic, y_synthetic, X_scaled, y):
    classifier.fit(X_synthetic, y_synthetic)
    print(classification_report(y, classifier.predict(X_scaled)))

Scores the classifier using a k-fold (here a Stratified 4-fold with shuffle)
Synthetic data are created to train the classifier for each fold.
Real data are used to compute measures for the classifier.

In [8]:
def grid_search_logistic_regression(X_scaled, y, smote, kf):
    pipe = Pipeline([
        ('sampling', smote),
        ('classif', LogisticRegression(max_iter=2000, solver='sag')),
    ])
    custom_scorer = make_scorer(f1_score, greater_is_better=True, pos_label="SSE")

    search = GridSearchCV(
            estimator=pipe,
            param_grid={
                'classif__tol': [1e-4, 1e-3, 1e-2, 1e-1],
                'classif__C': [x for x in np.arange(0.05, 1, 0.05)],
                'classif__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
                'classif__random_state' : [0,42,9090]
            }, cv=kf, scoring=custom_scorer, n_jobs=-1,refit=True)

    search.fit(X_scaled, y)
    print("Best CV score=%0.4f:" % search.best_score_)
    print("Best CI 95=%0.4f" % (2*search.cv_results_['std_test_score'][search.best_index_]))
    print(search.best_params_)

In [9]:
def grid_search_svc(X_scaled, y, smote, kf):
    pipe = Pipeline([
        ('sampling', smote),
        ('classif', SVC()),
    ])
    custom_scorer = make_scorer(f1_score, greater_is_better=True, pos_label="SSE")

    search = GridSearchCV(
        estimator=pipe,
        param_grid={
            'classif__tol': [1e-4, 1e-3, 1e-2, 1e-5, 1e-1],
            'classif__C': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
            'classif__gamma': ['scale', 'auto'],
            'classif__kernel' : ['linear','sigmoid','rbf','poly'],
            'classif__random_state' : [0,42,9090]
        }, cv=kf, scoring=custom_scorer, n_jobs=-1)
    search.fit(X_scaled, y)
    print("Best CV score=%0.4f:" % search.best_score_)
    print("Best CI 95=%0.4f" % (2*search.cv_results_['std_test_score'][search.best_index_]))
    print(search.best_params_)

In [10]:
def grid_search_linear_MLP(X_scaled, y, smote, kf):
    pipe = Pipeline([
        ('sampling', smote),
        ('classif', MLPClassifier(max_iter=2000)),
    ])
    custom_scorer = make_scorer(f1_score, greater_is_better=True, pos_label="SSE")

    search = GridSearchCV(
        estimator=pipe,
        param_grid={
            'classif__activation': ['identity', 'logistic', 'tanh', 'relu'],
            'classif__solver':['lbfgs','sgd','adam'],
            'classif__learning_rate': ['constant', 'invscaling', 'adaptive'],
             'classif__hidden_layer_sizes' : [(100,),(50,50,),(10,30,10)],
            'classif__random_state' : [0]
        }, cv=kf, scoring=custom_scorer, n_jobs=-1)
    search.fit(X_scaled, y)
    print("Best CV score=%0.4f:" % search.best_score_)
    print("Best CI 95=%0.4f" % (2*search.cv_results_['std_test_score'][search.best_index_]))
    print(search.best_params_)

In [15]:
def grid_search_random_forest(X_scaled, y, smote, kf):
    pipe = Pipeline([
        ('sampling', smote),
        ('classif', RandomForestClassifier()),
    ])
    custom_scorer = make_scorer(f1_score, greater_is_better=True, pos_label="SSE")

    search = GridSearchCV(
        estimator=pipe,
        param_grid={
            'classif__criterion': ["gini","entropy"],
            'classif__max_features':["auto","sqrt","log2"],
            'classif__n_estimators':[50, 75 ,100, 200, 300],
            'classif__random_state' : [0,42]
        }, cv=kf, scoring=custom_scorer, n_jobs=-1)
    search.fit(X_scaled, y)
    print("Best CV score=%0.4f:" % search.best_score_)
    print("Best CI 95=%0.4f" % (2*search.cv_results_['std_test_score'][search.best_index_]))
    print(search.best_params_)

In [16]:
def grid_search_knn(X_scaled, y, smote, kf):
    pipe = Pipeline([
        ('sampling', smote),
        ('classif', KNeighborsClassifier()),
    ])
    custom_scorer = make_scorer(f1_score, greater_is_better=True, pos_label="SSE")

    search = GridSearchCV(
        estimator=pipe,
        param_grid={
            'classif__weights': ["uniform","distance"],
            'classif__algorithm':["ball_tree","kb_tree","brute"],
            'classif__p':[1, 2],
            'classif__n_neighbors': [4,5,6],
        }, cv=kf, scoring=custom_scorer, n_jobs=-1)
    search.fit(X_scaled, y)
    print("Best CV score=%0.4f:" % search.best_score_)
    print("Best CI 95=%0.4f" % (2*search.cv_results_['std_test_score'][search.best_index_]))
    print(search.best_params_)

In [17]:
def grid_search_sgd(X_scaled, y, smote, kf):
    pipe = Pipeline([
        ('sampling', smote),
        ('classif', SGDClassifier(max_iter=2000, random_state=0, eta0=1)),
    ])
    custom_scorer = make_scorer(f1_score, greater_is_better=True, pos_label="SSE")

    search = GridSearchCV(
        estimator=pipe,
        param_grid={
            'classif__loss': ["hinge","log","modified_huber","squared_hinge"],
            'classif__learning_rate':["constant","optimal","invscaling","adaptive"],
            'classif__alpha':[0.0001, 0.001, 0.01, 0.1],
            'classif__tol': [1e-4,1e-3,1e-2,1e-1],
        }, cv=kf, scoring=custom_scorer, n_jobs=-1)
    search.fit(X_scaled, y)
    print("Best CV score=%0.4f:" % search.best_score_)
    print("Best CI 95=%0.4f" % (2*search.cv_results_['std_test_score'][search.best_index_]))
    print(search.best_params_)

In [23]:
df = pd.read_csv("merged_2_annotated.csv")
y = get_labels(df)

log_dataframe(df)
clean_dataframe(df)
X = df

scaler = MinMaxScaler(feature_range=(-1,1))
kf = StratifiedKFold(n_splits=4)
smote = KMeansSMOTE(sampling_strategy='minority', n_jobs=-1, random_state=9090)

X_scaled = scaling(scaler, X)

In [24]:
grid_search_logistic_regression(X_scaled, y, smote, kf)

Best CV score=0.7638:
Best CI 95=0.0645
{'classif__C': 0.9000000000000001, 'classif__random_state': 9090, 'classif__solver': 'sag', 'classif__tol': 0.1}


In [25]:
grid_search_svc(X_scaled, y, smote, kf)

Best CV score=0.7703:
Best CI 95=0.0921
{'classif__C': 0.7, 'classif__gamma': 'scale', 'classif__kernel': 'poly', 'classif__random_state': 0, 'classif__tol': 0.0001}


In [26]:
grid_search_linear_MLP(X_scaled, y, smote, kf)

Best CV score=0.7717:
Best CI 95=0.0426
{'classif__activation': 'tanh', 'classif__hidden_layer_sizes': (50, 50), 'classif__learning_rate': 'constant', 'classif__random_state': 0, 'classif__solver': 'adam'}


In [27]:
grid_search_random_forest(X_scaled, y, smote, kf)

Best CV score=0.7887:
Best CI 95=0.0529
{'classif__criterion': 'gini', 'classif__max_features': 'auto', 'classif__n_estimators': 75, 'classif__random_state': 0}


In [16]:
grid_search_knn(X_scaled, y, smote, kf)

 0.75608155 0.75608155 0.75927198 0.76546501 0.75443921 0.76726308
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.76339286 0.75876635 0.76292517 0.76273551 0.75709486 0.75280193
 0.75608155 0.75608155 0.75927198 0.76546501 0.75443921 0.76726308]


Best CV score=0.7673:
Best CI 95=0.0726
{'classif__algorithm': 'ball_tree', 'classif__n_neighbors': 6, 'classif__p': 2, 'classif__weights': 'distance'}


In [19]:
grid_search_sgd(X_scaled, y, smote, kf)

Best CV score=0.7749:
Best CI 95=0.0573
{'classif__alpha': 0.1, 'classif__learning_rate': 'invscaling', 'classif__loss': 'modified_huber', 'classif__tol': 0.1}
