# Import the required libraries and load the dataset

In [99]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_predict
from sklearn.learning_curve import validation_curve
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [7]:
# Annotation spreadsheets: the raw sheet keeps the textual answers (Yes/No, Good/Bad),
# the converted sheet holds a numeric encoding of them.
raw_csv_path, converted_csv_path = (
    "./Data_Annotations.csv",
    "./Data_Annotations_Converted.csv",
)
# Read both files in the same order as before: raw first, then converted.
raw_df, converted_df = (
    pd.read_csv(csv_path) for csv_path in (raw_csv_path, converted_csv_path)
)

In [8]:
# Print out the converted contents
converted_df.head()

Unnamed: 0,Name of file,Indendation,Indendation Consistent?,Whitespace around code to improve readability?,Used == instead of equals,Single/Nondescriptive variable names,Comments,Empty Catch Block,Generic Exceptions,Explicit Initialization,Method Length,Magic Numbers,# of magic numbers,Line Length,Boolean Exp Complexity,"FINAL CODE QUALITY METRIC (1, Very 1, Decent, -1, Very -1)"
0,Snake.java,4,1,1,-1,2,0,-1,-1,0,6,0,0,25,-1,-1
1,TimeTest.java,2,1,0,-1,6,0,-1,-1,1,9,0,0,70,-1,-1
2,HelloWorld.java,9,0,0,-1,1,1,-1,-1,1,6,1,1,40,-1,1
3,NoDisconnectManager.java,2,1,1,-1,0,5,0,0,1,48,0,0,115,1,1
4,TestIntegerNull.java,2,1,1,1,0,0,-1,-1,1,9,0,0,40,1,-1


In [9]:
# Print out the raw contents
raw_df.head()

Unnamed: 0,Name of file,Indendation,Indendation Consistent?,Whitespace around code to improve readability?,Used == instead of equals,Single/Nondescriptive variable names,Comments,Empty Catch Block,Generic Exceptions,Explicit Initialization,Method Length,Magic Numbers,# of magic numbers,Line Length,Boolean Exp Complexity,"FINAL CODE QUALITY METRIC (Good, Very Good, Decent, Bad, Very Bad)"
0,Snake.java,4,Yes,Yes,,2,0,,,No,6,No,0,25,,Bad
1,TimeTest.java,2,Yes,No,,6,0,,,Yes,9,No,0,70,,Bad
2,HelloWorld.java,9,No,No,,1,1,,,Yes,6,Yes,1,40,,Good
3,NoDisconnectManager.java,2,Yes,Yes,,0,5,No,No,Yes,48,No,0,115,1.0,Good
4,TestIntegerNull.java,2,Yes,Yes,Yes,0,0,,,Yes,9,No,0,40,1.0,Bad


# Start the setup for models

In [12]:
# Feature matrix: every annotation column from 'Indendation' through
# 'Boolean Exp Complexity'. Label slices are inclusive on both ends, so this
# leaves out only the leading index column, the file name and the final
# quality label.
# `.loc` replaces the `.ix` indexer, which has been removed from pandas; for a
# slice on string labels the selection is identical.
features = converted_df.loc[:, 'Indendation':'Boolean Exp Complexity']
features.head()

Unnamed: 0,Indendation,Indendation Consistent?,Whitespace around code to improve readability?,Used == instead of equals,Single/Nondescriptive variable names,Comments,Empty Catch Block,Generic Exceptions,Explicit Initialization,Method Length,Magic Numbers,# of magic numbers,Line Length,Boolean Exp Complexity
0,4,1,1,-1,2,0,-1,-1,0,6,0,0,25,-1
1,2,1,0,-1,6,0,-1,-1,1,9,0,0,70,-1
2,9,0,0,-1,1,1,-1,-1,1,6,1,1,40,-1
3,2,1,1,-1,0,5,0,0,1,48,0,0,115,1
4,2,1,1,1,0,0,-1,-1,1,9,0,0,40,1


In [13]:
# Target vector: the final code-quality label, stored as -1 / 1 in the converted sheet.
target_column = 'FINAL CODE QUALITY METRIC (1, Very 1, Decent, -1, Very -1)'
values = converted_df[target_column]
values.head()

0   -1
1   -1
2    1
3    1
4   -1
Name: FINAL CODE QUALITY METRIC (1, Very 1, Decent, -1, Very -1), dtype: int64

In [14]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    values,
    test_size=0.20,
    random_state=13,
)

# Models

### Linear Regression

In [15]:
from sklearn import linear_model

In [73]:
# Ordinary least-squares fit on the training split. `fit` returns the estimator
# itself, so the name is bound to the fitted model and echoed as the cell output.
clf_linear = linear_model.LinearRegression().fit(X_train, y_train)
clf_linear

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [74]:
y_pred_linear = clf_linear.predict(X_test)
y_pred_linear

array([-1.00325048,  0.36087905,  0.43948159,  0.30962588, -0.22713099,
        0.54411098, -0.36177908,  0.93353815, -0.19579198,  0.71533081,
        0.45845042, -0.46938696])

In [76]:
# Convert continuous predicted values from regression to class labels based on a threshold
y_pred_threshold_linear = convert2class(y_pred_linear)

In [77]:
accuracy_score(y_test, y_pred_threshold_linear)

0.66666666666666663

In [96]:
predicted = cross_val_predict(clf_linear, features, values, cv=10)
predicted

array([-0.07024509, -0.66240162, -0.73405987,  0.67122256,  0.50141417,
       -1.11570062,  0.80022965, -0.02639324,  0.92085662, -0.76263747,
        0.63374174, -0.79726329, -1.27430266,  0.65541348,  1.03009873,
       -0.14728745, -1.79542418, -1.89299422, -0.67066715, -4.60385909,
        0.51377508,  0.91570507,  0.79260599, -0.80315225,  0.77629729,
        0.5686608 ,  0.58257165,  0.81166913,  0.83062395, -0.25101568,
        0.52919655, -0.19009244,  0.26277471, -0.15008803, -0.91955569,
       -0.86691092, -0.68640206,  0.70714745, -1.1629856 , -1.06474031,
        0.19095623,  3.54345166, -0.5295561 ,  0.37512091, -0.37892788,
        0.63162838, -0.8993991 ,  0.4187997 , -0.07363416,  0.50536297,
        0.67576266,  0.26583802,  0.28186229,  0.95610485,  0.47762473,
       -0.91924863,  0.62656617,  0.14690301])

In [97]:
# Scatter the cross-validated predictions against the target values; the dashed
# diagonal marks where the prediction would equal the target.
label_lo, label_hi = values.min(), values.max()
fig, ax = plt.subplots()
ax.scatter(values, predicted)
ax.plot([label_lo, label_hi], [label_lo, label_hi], 'k--', lw=4)
ax.set(xlabel='Measured', ylabel='Predicted')
plt.show()

In [102]:
# Validation curve for Ridge regression over the regularisation strength `alpha`.
# `param_name` / `param_range` are passed by keyword: current scikit-learn
# releases accept them only as keyword arguments, and older releases accept the
# keyword form as well.
train_scores, valid_scores = validation_curve(
    linear_model.Ridge(), features, values,
    param_name="alpha", param_range=np.logspace(-7, 3, 3))
# Echo both score arrays so the cell shows them (one row per alpha value).
train_scores, valid_scores

(array([[ 0.87028213,  0.66776531,  0.70673133],
        [ 0.87027934,  0.66776275,  0.70672869],
        [ 0.2022795 ,  0.27679266,  0.24236229]]),
 array([[-1.04398478,  0.1033688 ,  0.31269817],
        [-1.03915498,  0.10845698,  0.31125148],
        [ 0.12714606,  0.13615635,  0.11185594]]))

### RBF Kernel SVM

In [54]:
from sklearn import svm

In [164]:
#Validation Curve for model parameter space
plot_validation_curve_svm(features, values, 'rbf')

[  1.00000000e-06   3.59381366e-06   1.29154967e-05   4.64158883e-05
   1.66810054e-04   5.99484250e-04   2.15443469e-03   7.74263683e-03
   2.78255940e-02   1.00000000e-01]


In [157]:
# SVC with its default (RBF) kernel. The gamma value is one of the grid points
# printed by the validation-curve cell -- presumably picked from that curve; confirm.
clf_svm_rbf = svm.SVC(gamma=7.74263683e-03).fit(X_train, y_train)
clf_svm_rbf

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.00774263683,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [158]:
y_pred_svm_rbf = clf_svm_rbf.predict(X_test)
y_pred_svm_rbf

array([-1,  1, -1, -1, -1, -1, -1,  1,  1,  1,  1, -1], dtype=int64)

In [159]:
accuracy_score(y_test, y_pred_svm_rbf)

0.5

### Poly Kernel SVM

In [168]:
# Polynomial-kernel SVC, using the same gamma value as the RBF model.
clf_svm_poly = svm.SVC(kernel='poly', gamma=7.74263683e-03).fit(X_train, y_train)
clf_svm_poly

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.00774263683,
  kernel='poly', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [169]:
y_pred_svm_poly = clf_svm_poly.predict(X_test)
y_pred_svm_poly

array([-1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1, -1], dtype=int64)

In [170]:
accuracy_score(y_test, y_pred_svm_poly)

0.75

### Random Forest

### Utility Functions

In [171]:
def plot_validation_curve_svm(X, y, kernel, param_range=None, cv=10):
    """Plot training and cross-validation accuracy of an SVC as ``gamma`` varies.

    Parameters
    ----------
    X : array-like
        Feature matrix passed straight to ``validation_curve``.
    y : array-like
        Target labels passed straight to ``validation_curve``.
    kernel : str
        Kernel name forwarded to ``svm.SVC`` (e.g. ``'rbf'`` or ``'poly'``).
    param_range : sequence of float, optional
        Gamma values to evaluate. Defaults to 10 values spaced evenly on a log
        scale between 1e-6 and 1e-1.
    cv : int, optional
        Number of cross-validation folds (default 10).

    Returns
    -------
    None
        The gamma grid is printed and the figure is shown with ``plt.show()``.
    """
    if param_range is None:
        # Numbers spaced evenly on a log scale.
        param_range = np.logspace(-6, -1, 10)
    # Print the grid so a gamma value can be read off alongside the plot.
    print(param_range)

    train_scores, test_scores = validation_curve(
        svm.SVC(kernel=kernel), X, y, param_name="gamma", param_range=param_range,
        cv=cv, scoring="accuracy", n_jobs=1)

    # Scores come back as one row per gamma value; aggregate across the folds.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.title("Validation Curve with SVM")
    # Raw string: "\g" is not a valid escape sequence in a normal string literal.
    plt.xlabel(r"$\gamma$")
    plt.ylabel("Score")
    plt.ylim(0.0, 1.1)
    plt.semilogx(param_range, train_scores_mean, label="Training score", color="r")
    # Shaded bands show +/- one standard deviation across the folds.
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2, color="r")
    plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
                 color="g")
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.legend(loc="best")
    plt.show()

In [172]:
# Convert continuous predicted values from regression to class labels based on a threshold
def convert2class(y_pred, threshold=0):
    """Map continuous regression outputs to the class labels 1 / -1.

    Parameters
    ----------
    y_pred : iterable of numbers
        Continuous predictions (a list or a 1-D NumPy array, for example).
    threshold : number, optional
        Cut-off for the positive class (default 0). A prediction that is
        greater than or equal to ``threshold`` becomes ``1``; anything else
        becomes ``-1``.

    Returns
    -------
    list of int
        One label per prediction, in the same order as ``y_pred``.
    """
    return [1 if score >= threshold else -1 for score in y_pred]