In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV

In [None]:
# Load only CSV columns 6-15 and 17 (column 16 is deliberately left out),
# then preview the first rows.
wanted_columns = [*range(6, 16), 17]
df = pd.read_csv('CSDataAsCSV.csv', usecols=wanted_columns)
df.head()

In [None]:
# The criticality score is the regression target; every other loaded column
# is used as a feature.
target_column = 'criticality_score'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Hold out 30% of the rows as a test set. The fixed random_state pins the
# shuffle, so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101)

In [None]:
# Ordinary least-squares regression fitted on the training split.
# fit() returns the estimator itself, so `model` is the fitted object and the
# trailing expression displays it as the cell output.
model = LinearRegression().fit(X_train, y_train)
model

In [None]:
# Predicted criticality scores for the held-out test features; the metric and
# scatter-plot cells compare these against y_test.
predictions = model.predict(X_test)

In [None]:
# Report the test-set error of the predictions, one metric per line.
error_metrics = (
    ('mean_squared_error : ', mean_squared_error),
    ('mean_absolute_error : ', mean_absolute_error),
)
for metric_label, metric_fn in error_metrics:
    print(metric_label, metric_fn(y_test, predictions))

In [None]:
# Model slopes, labelled with the feature each one belongs to.
# model.coef_ is ordered like the columns the model was fitted on, which are
# the columns of X, so indexing by X.columns removes any guesswork about
# which slope goes with which feature.
pd.Series(model.coef_, index=X.columns, name='coefficient')

In [None]:
# Model intercept: the constant term of the fitted linear model, i.e. the
# predicted score when every feature is zero.
model.intercept_

In [None]:
# R-squared (coefficient of determination) of the predictions on the test set
test_r2 = r2_score(y_test, predictions)
print(test_r2)

In [None]:
# Actual (blue) vs. predicted (red) score for every test sample, plotted
# against the sample's position in the test set.
plt.ylim(0, 1)
series_to_plot = (
    (y_test, 'blue', "Actual Score"),
    (predictions, 'red', "Predicted Score"),
)
for scores, colour, legend_label in series_to_plot:
    plt.scatter(range(len(scores)), scores, color=colour, label=legend_label)
plt.title("Linear Regression Prediction Accuracy All Test Data")
plt.xlabel("GitHub Project")
plt.ylabel("Criticality Score")
plt.legend(loc="upper left")
plt.show()

In [None]:
# Same actual-vs-predicted scatter, zoomed in on the first 50 test samples.
# Every point is still drawn; the x-axis limit is what restricts the view.
plt.ylim(0, 1)
plt.xlim(0, 50)
series_to_plot = (
    (y_test, 'blue', "Actual Score"),
    (predictions, 'red', "Predicted Score"),
)
for scores, colour, legend_label in series_to_plot:
    plt.scatter(range(len(scores)), scores, color=colour, label=legend_label)
plt.title("Linear Regression Prediction Accuracy")
plt.xlabel("GitHub Project")
plt.ylabel("Criticality Score")
plt.legend(loc="upper left")
plt.show()

In [None]:
# Regression line on each figure.
# Each panel scatters one feature against the criticality score and overlays,
# in red, the fitted model's line for that feature. The slope is read from
# model.coef_ by column name, and the line is anchored at the model's
# prediction with every feature at its mean (so the other features are held
# at their averages). Nothing is hand-copied, so the lines stay correct
# whenever the data or the model changes.
panels = [
    # (feature column, right end of the drawn line, panel title, x-axis label)
    ('contributor_count', 5000, "Contributor Count Data", "Contributor Count"),
    ('github_mention_count', 500000, "Dependents Count Data", "Dependent Count"),
]
feature_means = X.mean()
slopes = dict(zip(X.columns, model.coef_))
mean_prediction = model.intercept_ + sum(
    slopes[name] * feature_means[name] for name in X.columns)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10,4))
for ax, (column, x_end, title, x_label) in zip(axes, panels):
    ax.scatter(X[column], y)
    # A straight line is fully described by its two end points.
    line_x = np.array([0, x_end])
    line_y = mean_prediction + slopes[column] * (line_x - feature_means[column])
    ax.plot(line_x, line_y, color = "red")
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel("Criticality Score")
axes[1].set_xlim([0,500000])
plt.show()

In [None]:
# Created-since / updated-since vs. criticality score, each with the fitted
# model's line for that feature in red. The slope is read from model.coef_ by
# column name, and the line is anchored at the model's prediction with every
# feature at its mean, rather than using hand-copied constants that go stale
# when the model is refit.
panels = [
    # (feature column, right end of the drawn line, panel title, x-axis label)
    ('created_since', 700, "Created Since Data", "Months Created Since"),
    ('updated_since', 4000, "Updated Since Data", "Months Updated Since"),
]
feature_means = X.mean()
slopes = dict(zip(X.columns, model.coef_))
mean_prediction = model.intercept_ + sum(
    slopes[name] * feature_means[name] for name in X.columns)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10,4))
for ax, (column, x_end, title, x_label) in zip(axes, panels):
    ax.scatter(X[column], y)
    # A straight line is fully described by its two end points.
    line_x = np.array([0, x_end])
    line_y = mean_prediction + slopes[column] * (line_x - feature_means[column])
    ax.plot(line_x, line_y, color = "red")
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel("Criticality Score")
plt.show()

In [None]:
# Organization count / commit frequency vs. criticality score, each with the
# fitted model's line for that feature in red. The slope is read from
# model.coef_ by column name, and the line is anchored at the model's
# prediction with every feature at its mean, rather than using hand-copied
# constants that go stale when the model is refit.
panels = [
    # (feature column, right end of the drawn line, panel title, x-axis label)
    ('org_count', 16, "Organization Count Data", "Number of Distinct Orgs"),
    ('commit_frequency', 6000, "Commit Frequency Data", "Commitment Frequency of Project"),
]
feature_means = X.mean()
slopes = dict(zip(X.columns, model.coef_))
mean_prediction = model.intercept_ + sum(
    slopes[name] * feature_means[name] for name in X.columns)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10,4))
for ax, (column, x_end, title, x_label) in zip(axes, panels):
    ax.scatter(X[column], y)
    # A straight line is fully described by its two end points.
    line_x = np.array([0, x_end])
    line_y = mean_prediction + slopes[column] * (line_x - feature_means[column])
    ax.plot(line_x, line_y, color = "red")
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel("Criticality Score")
plt.show()

In [None]:
# Recent release count / issue comment frequency vs. criticality score, each
# with the fitted model's line for that feature in red. The slope is read
# from model.coef_ by column name, and the line is anchored at the model's
# prediction with every feature at its mean, rather than using hand-copied
# constants that go stale when the model is refit.
panels = [
    # (feature column, right end of the drawn line, panel title, x-axis label)
    ('recent_release_count', 20000, "Recent Release Count Data",
     "Number of Recent Releases (Last year)"),
    ('issue_comment_frequency', 300, "Issue Comment Frequency Data",
     "Issue Comment Frequency (Avg. in last 90 days)"),
]
feature_means = X.mean()
slopes = dict(zip(X.columns, model.coef_))
mean_prediction = model.intercept_ + sum(
    slopes[name] * feature_means[name] for name in X.columns)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10,4))
for ax, (column, x_end, title, x_label) in zip(axes, panels):
    ax.scatter(X[column], y)
    # A straight line is fully described by its two end points.
    line_x = np.array([0, x_end])
    line_y = mean_prediction + slopes[column] * (line_x - feature_means[column])
    ax.plot(line_x, line_y, color = "red")
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel("Criticality Score")
    # Keep the view on the score axis range 0-1 even where the line leaves it.
    ax.set_ylim([0,1])
plt.show()

In [None]:
# Updated / closed issue counts vs. criticality score, each with the fitted
# model's line for that feature in red. The slope is read from model.coef_ by
# column name, and the line is anchored at the model's prediction with every
# feature at its mean, rather than using hand-copied constants that go stale
# when the model is refit.
panels = [
    # (feature column, right end of the drawn line, panel title, x-axis label)
    ('updated_issues_count', 75000, "Updated Issues Data",
     "Number of Updated Issues (Past 90 days)"),
    ('closed_issues_count', 75000, "Closed Issues Data",
     "Number of Closed Issues (Past 90 days)"),
]
feature_means = X.mean()
slopes = dict(zip(X.columns, model.coef_))
mean_prediction = model.intercept_ + sum(
    slopes[name] * feature_means[name] for name in X.columns)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10,4))
for ax, (column, x_end, title, x_label) in zip(axes, panels):
    ax.scatter(X[column], y)
    # A straight line is fully described by its two end points.
    line_x = np.array([0, x_end])
    line_y = mean_prediction + slopes[column] * (line_x - feature_means[column])
    ax.plot(line_x, line_y, color = "red")
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel("Criticality Score")
    # Keep the view on the score axis range 0-1 even where the line leaves it.
    ax.set_ylim([0,1])
plt.show()