In [None]:
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import cross_val_score, KFold

import seaborn as sns

import matplotlib.pyplot as plt

In [None]:
# Read git_stats.csv; the file's first column is used as the row index.
git_df = pd.read_csv(filepath_or_buffer='git_stats.csv', index_col=0)
# Leave the preview as the cell's last expression so the notebook renders it.
git_df.head(n=5)

In [None]:
# Column roles: the column to predict and the candidate predictor columns.
features = [
    'commits', 'branches', 'releases', 'watchers',
    'forks', 'issues_open', 'issues_closed',
]
target = 'stars'

n_neighbors = 5
# 75% of the number of rows in git_df, truncated to an integer.
q_75 = int(0.75 * len(git_df))

In [None]:
# Normalize: turn every feature column into z-scores by subtracting the column
# mean and dividing by the column standard deviation (pandas' default `std`).
feat_df = git_df[features]
feat_df = feat_df.sub(feat_df.mean(), axis='columns').div(feat_df.std(), axis='columns')

# Work on a copy so the unscaled git_df stays available; only the feature
# columns are replaced, every other column is carried over unchanged.
git_norm_df = git_df.copy()
git_norm_df[features] = feat_df
git_norm_df.head()

In [None]:
# Correlation heatmap of the target column against every candidate feature.
sns.set(style="white")

# Use the shared `target` constant instead of repeating the column name, so
# this cell stays consistent with the rest of the notebook if the target changes.
corr = git_norm_df[[target] + features].corr()

f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(150, 275, s=80, l=55, as_cmap=True)

# Draw on the axes created above rather than relying on pyplot's implicit
# "current axes". The colour scale is limited to [0, 1] and centred on 0.
# NOTE(review): negative correlations fall below vmin and are not
# distinguished by colour — confirm none are expected (annot still prints them).
sns.heatmap(corr, cmap=cmap, vmax=1, vmin=0, center=0, annot=True, ax=ax)
plt.show()

In [None]:
# Candidate feature subsets to evaluate, grouped by how strongly the features
# are taken to correlate with the target (presumably read off the correlation
# heatmap — confirm against its output).

strong_correlated_feats = [['watchers', 'forks'], ['watchers'], ['forks']]

# Each strongly-correlated subset (and the empty subset) extended with 'issues_open'.
correlated_feats = [
    base + ['issues_open']
    for base in (['watchers', 'forks'], ['watchers'], ['forks'], [])
]

# Every non-empty combination of 'commits', 'branches' and 'releases'.
uncorrelated_feats = [
    ['commits', 'branches', 'releases'],
    ['commits', 'branches'], ['commits', 'releases'], ['branches', 'releases'],
    ['commits'], ['branches'], ['releases'],
]

# Hand-picked blends of features from the groups above; the first entry is the
# full feature set.
mixed_feats = [
    ['commits', 'branches', 'releases', 'watchers', 'forks', 'issues_open', 'issues_closed'],
    ['releases', 'watchers', 'forks', 'issues_open'],
    ['branches', 'releases', 'watchers', 'forks'],
    ['branches', 'releases', 'issues_open', 'issues_closed'],
    ['commits', 'watchers', 'forks'],
    ['commits', 'branches', 'watchers', 'forks'],
]

In [None]:
# Search grids for the evaluation below.
# n_neighbors candidates: every integer from 1 to 20 inclusive.
hyper_params = np.arange(1, 21, 1)
# Number of cross-validation folds: the odd values 3, 5, ..., 21.
folds = np.arange(3, 21 + 1, 2)

In [None]:
def eval_setting(df, feat_list, hyper_params, folds, target_col=None):
    """Grid-evaluate k-NN regressors and report cross-validated RMSE.

    For every combination of feature subset, ``n_neighbors`` value and number
    of folds, a ``KNeighborsRegressor`` is scored with shuffled ``KFold``
    cross-validation (``random_state=1``) using the
    ``"neg_mean_squared_error"`` scorer. The per-fold scores are converted to
    RMSE and summarised by their mean and standard deviation.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing every column named in ``feat_list`` plus the target
        column.
    feat_list : iterable of list of str
        Feature subsets to evaluate; each entry is a list of column names.
    hyper_params : iterable of int
        Values tried for ``n_neighbors``.
    folds : iterable of int
        Values tried for the number of ``KFold`` splits.
    target_col : str, optional
        Name of the column to predict. Defaults to the notebook-level
        ``target`` constant, which is what this function always used before
        the parameter existed.

    Returns
    -------
    pandas.DataFrame
        Float frame with columns ``'avg'`` and ``'std'`` (mean and population
        standard deviation of the per-fold RMSE), indexed by a MultiIndex with
        levels ``('feat_list', 'hyper_param', 'fold')``. The ``feat_list``
        level holds the position of the subset within ``feat_list``.
    """
    if target_col is None:
        # Backward-compatible default: fall back to the notebook-level constant.
        target_col = target

    # Materialise the inputs: they are iterated once to build the index and
    # again in the loops below, which one-shot iterators would not survive.
    feat_list = list(feat_list)
    hyper_params = list(hyper_params)
    folds = list(folds)

    # Size the first level from the actual number of feature subsets. A fixed
    # range(7) leaves all-NaN rows for shorter lists and has no rows for
    # subsets beyond the seventh.
    index = pd.MultiIndex.from_product(
        [range(len(feat_list)), hyper_params, folds],
        names=['feat_list', 'hyper_param', 'fold'],
    )
    rmse = pd.DataFrame(index=index, columns=['avg', 'std'], dtype=float)

    for i, feats in enumerate(feat_list):
        for hp in hyper_params:
            for fold in folds:
                kf = KFold(fold, shuffle=True, random_state=1)
                model = KNeighborsRegressor(n_neighbors=hp)
                mses = cross_val_score(model,
                                       df[feats],
                                       df[target_col],
                                       scoring="neg_mean_squared_error",
                                       cv=kf)
                # The scorer returns negated MSEs; take the magnitude before
                # the square root to obtain one RMSE per fold.
                rmses = np.sqrt(np.absolute(mses))
                # Address the cell as (row tuple, column) through .loc.
                # `rmse[i, hp, fold, 'avg'] = ...` would instead create a new
                # column keyed by that tuple and leave 'avg'/'std' as NaN.
                rmse.loc[(i, hp, fold), 'avg'] = np.mean(rmses)
                rmse.loc[(i, hp, fold), 'std'] = np.std(rmses)
    return rmse

In [None]:
# Run the same grid evaluation on each group of feature subsets, in order:
# strongly correlated, correlated, uncorrelated, mixed.
strong_rmse, corr_rmse, uncorr_rmse, mixed_rmse = (
    eval_setting(git_norm_df, feat_group, hyper_params, folds)
    for feat_group in (
        strong_correlated_feats,
        correlated_feats,
        uncorrelated_feats,
        mixed_feats,
    )
)