# Import all the existing libraries and dataset

In [1]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_predict
from sklearn.learning_curve import validation_curve
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [2]:
raw_csv_path = "./Data_Annotations.csv"
converted_csv_path = "./Data_Annotations_Converted.csv"
raw_df = pd.read_csv(raw_csv_path)
converted_df = pd.read_csv(converted_csv_path)

In [3]:
# Print out both the converted contents
converted_df.head()

Unnamed: 0,Name of file,Indendation,Indendation Consistent?,Whitespace around code to improve readability?,Used == instead of equals,Single/Nondescriptive variable names,Comments,Empty Catch Block,Generic Exceptions,Explicit Initialization,Method Length,Magic Numbers,# of magic numbers,Line Length,Boolean Exp Complexity,"FINAL CODE QUALITY METRIC (1, Very 1, Decent, -1, Very -1)"
0,Snake.java,4,1,1,-1,2,0,-1,-1,0,6,0,0,25,-1,-1
1,TimeTest.java,2,1,0,-1,6,0,-1,-1,1,9,0,0,70,-1,-1
2,HelloWorld.java,9,0,0,-1,1,1,-1,-1,1,6,1,1,40,-1,1
3,NoDisconnectManager.java,2,1,1,-1,0,5,0,0,1,48,0,0,115,1,1
4,TestIntegerNull.java,2,1,1,1,0,0,-1,-1,1,9,0,0,40,1,-1


In [4]:
# Print out the raw contents
raw_df.head()

Unnamed: 0,Name of file,Indendation,Indendation Consistent?,Whitespace around code to improve readability?,Used == instead of equals,Single/Nondescriptive variable names,Comments,Empty Catch Block,Generic Exceptions,Explicit Initialization,Method Length,Magic Numbers,# of magic numbers,Line Length,Boolean Exp Complexity,"FINAL CODE QUALITY METRIC (Good, Very Good, Decent, Bad, Very Bad)"
0,Snake.java,4,Yes,Yes,,2,0,,,No,6,No,0,25,,Bad
1,TimeTest.java,2,Yes,No,,6,0,,,Yes,9,No,0,70,,Bad
2,HelloWorld.java,9,No,No,,1,1,,,Yes,6,Yes,1,40,,Good
3,NoDisconnectManager.java,2,Yes,Yes,,0,5,No,No,Yes,48,No,0,115,1.0,Good
4,TestIntegerNull.java,2,Yes,Yes,Yes,0,0,,,Yes,9,No,0,40,1.0,Bad


# Start the setup for models

In [5]:
features = converted_df.ix[:, 'Indendation':'Boolean Exp Complexity']
#features.drop('Magic Numbers')
features.head()

Unnamed: 0,Indendation,Indendation Consistent?,Whitespace around code to improve readability?,Used == instead of equals,Single/Nondescriptive variable names,Comments,Empty Catch Block,Generic Exceptions,Explicit Initialization,Method Length,Magic Numbers,# of magic numbers,Line Length,Boolean Exp Complexity
0,4,1,1,-1,2,0,-1,-1,0,6,0,0,25,-1
1,2,1,0,-1,6,0,-1,-1,1,9,0,0,70,-1
2,9,0,0,-1,1,1,-1,-1,1,6,1,1,40,-1
3,2,1,1,-1,0,5,0,0,1,48,0,0,115,1
4,2,1,1,1,0,0,-1,-1,1,9,0,0,40,1


In [6]:
values = converted_df['FINAL CODE QUALITY METRIC (1, Very 1, Decent, -1, Very -1)']
values.head()

0   -1
1   -1
2    1
3    1
4   -1
Name: FINAL CODE QUALITY METRIC (1, Very 1, Decent, -1, Very -1), dtype: int64

In [7]:
X_train, X_test, y_train, y_test = train_test_split(features, values, test_size = 0.20, random_state = 13)

# Models

In [8]:
#Please run Utility Functions code  prior to the following steps. 
#Utility functions are located at the end of this notebook.

### Linear Regression

In [11]:
from sklearn import linear_model

In [12]:
clf_linear = linear_model.LinearRegression()
clf_linear.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [13]:
y_pred_linear = clf_linear.predict(X_test)
y_pred_linear

array([-0.92538896,  0.59526815, -1.18657404, -0.18503957, -0.77044602,
       -1.04100951,  0.67308216,  0.6525286 , -0.65869705, -1.14015546,
        0.64417987,  0.48413438, -0.20490291, -0.7801846 , -0.59913956,
        0.94163297,  0.15619541,  0.79016774, -0.20984944,  0.36702245,
       -0.52570253, -0.23214241])

In [14]:
#Convert continous predicted values from regression to class labels based on a threshold
y_pred_threshold_linear = convert2class(y_pred_linear)

In [15]:
accuracy_score(y_test, y_pred_threshold_linear)

0.90909090909090906

### RBF Kernel SVM

In [16]:
from sklearn import svm

In [17]:
#Validation Curve for model parameter space
plot_validation_curve_svm(features, values, 'rbf')

[  1.00000000e-06   3.59381366e-06   1.29154967e-05   4.64158883e-05
   1.66810054e-04   5.99484250e-04   2.15443469e-03   7.74263683e-03
   2.78255940e-02   1.00000000e-01]


  if self._edgecolors == str('face'):


In [18]:
clf_svm_rbf = svm.SVC(gamma = 7.74263683e-03)
clf_svm_rbf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.00774263683,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [19]:
y_pred_svm_rbf = clf_svm_rbf.predict(X_test)
y_pred_svm_rbf

array([ 1,  1, -1, -1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1, -1,  1,
        1,  1, -1,  1,  1], dtype=int64)

In [178]:
accuracy_score(y_test, y_pred_svm_rbf)

0.5

### Poly Kernel SVM

In [20]:
#Validation Curve for model parameter space
plot_validation_curve_svm(features, values, 'poly')

[  1.00000000e-06   3.59381366e-06   1.29154967e-05   4.64158883e-05
   1.66810054e-04   5.99484250e-04   2.15443469e-03   7.74263683e-03
   2.78255940e-02   1.00000000e-01]


In [21]:
clf_svm_poly = svm.SVC(gamma = 7.74263683e-03, kernel = 'poly')
clf_svm_poly.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.00774263683,
  kernel='poly', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [22]:
y_pred_svm_poly = clf_svm_poly.predict(X_test)
y_pred_svm_poly

array([-1,  1, -1,  1, -1, -1,  1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1,
        1, -1,  1, -1,  1], dtype=int64)

In [23]:
accuracy_score(y_test, y_pred_svm_poly)

0.77272727272727271

### Random Forest

In [24]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
clf_random_forest = RandomForestClassifier(warm_start=True, 
                                           oob_score=True,
                                           max_features="sqrt",
                                           n_estimators=100,
                                           random_state=13)
clf_random_forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=13, verbose=0, warm_start=True)

In [26]:
y_pred_random_forest = clf_random_forest.predict(X_test)

In [27]:
accuracy_score(y_test, y_pred_random_forest)

0.90909090909090906

### AdaBoost

In [28]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
plot_validation_curve_estimators(X_train, y_train, X_test, y_test, 100, 1000, 50)

In [44]:
clf_ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                             algorithm="SAMME",
                             n_estimators=600)
clf_ada.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=1.0, n_estimators=600, random_state=None)

In [45]:
y_pred_ada = clf_ada.predict(X_test)

In [46]:
accuracy_score(y_test, y_pred_ada)

0.95454545454545459

### Utility Functions

In [9]:
#Plot the cv and training scores
def plot_validation_curve_svm(X, y, kernel):
    #Return numbers spaced evenly on a log scale.
    param_range = np.logspace(-6, -1, 10)
    print(np.logspace(-6, -1, 10))
    train_scores, test_scores = validation_curve(
        svm.SVC(kernel = kernel), X, y, param_name="gamma", param_range=param_range,
        cv=10, scoring="accuracy", n_jobs=1)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.title("Validation Curve with SVM")
    plt.xlabel("$\gamma$")
    plt.ylabel("Score")
    plt.ylim(0.0, 1.1)
    plt.semilogx(param_range, train_scores_mean, label="Training score", color="r")
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2, color="r")
    plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
                 color="g")
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.legend(loc="best")
    plt.show()

In [10]:
#Convert continous predicted values from regression to class labels based on a threshold
def convert2class(y_pred):
    y_pred_threshold = []
    for x in y_pred:
        if x >= 0: y_pred_threshold.append(1)
        else: y_pred_threshold.append(-1)
    return y_pred_threshold

In [54]:
def plot_validation_curve_estimators(X_train, y_train, X_test, y_test, n_estimators_min, 
                                     n_estimators_max, increment_size):
    #List of accuracies for each estimator size
    acc, est = [], []
    for estimators in range(n_estimators_min, n_estimators_max, increment_size):
        clf_ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                                 algorithm="SAMME",
                                 n_estimators=estimators)
        clf_ada.fit(X_train, y_train)

        y_pred_ada = clf_ada.predict(X_test)

        acc.append(accuracy_score(y_test, y_pred_ada))
        est.append(estimators)
    #Graph accuracy vs estimators
    plt.title("Parameter Validation Curve for # Estimators")
    plt.xlabel("# estimators")
    plt.ylabel("Accuracy")
    plt.plot(est, acc)
    plt.show()
    print(acc)