In [None]:
import matplotlib.pyplot as plt
import sys
import config

import numpy  as np
import pandas as pd
import seaborn as sns

import data_analysis_for_paper as dafp

# Apply seaborn's "paper" context and "ticks" style globally.
sns.set_context("paper")
sns.set_style("ticks")

from collections import defaultdict
# Colour palette that is active after the style calls above.
current_pal = sns.color_palette()

#%%
def percentile(n):
    """Return a one-argument function computing the ``n``-th percentile.

    The returned callable forwards its argument to ``np.percentile`` and has
    its ``__name__`` set to ``'<n>%'`` so it carries a readable label.
    """
    def _nth_percentile(values):
        return np.percentile(values, n)

    _nth_percentile.__name__ = '%s%%' % n
    return _nth_percentile

def clean_lang(x):
    """Return ``x`` with every "MergedCl" and "Cl" marker removed."""
    # Order matters: "Cl" is a substring of "MergedCl", so the longer marker
    # has to be stripped first.
    for marker in ("MergedCl", "Cl"):
        x = x.replace(marker, "")
    return x



In [None]:
#%%
LANG_ORDER = ["Rust","JS","Ruby"]

# Cutoff as a Unix timestamp in seconds: 1459468800 == 2016-04-01 00:00:00 UTC.
# Rows with a commit/release timestamp after this instant are dropped.
CUTOFF_TS = 1459468800

# Loop-invariant path templates ({0} is replaced by the language name).
adoption_file = config.FINAL_DATA+"cleaned_{0}_dependency_final.csv.gz"
release_file = config.FINAL_DATA+"cleaned_{0}_release_final.csv.gz"
fixed_file = config.WORKING_DATA+"fixed_adopted_{0}_meta.csv"
# na_filter=False turns off pandas' NA detection while parsing.
opts = {"na_filter": False}

# lang -> (df_fixed, df_adoption, df_release)
DATASETS = {}

for lang in LANG_ORDER:
    print(lang)

    df_adoption = pd.read_csv(adoption_file.format(lang), **opts)
    df_release = pd.read_csv(release_file.format(lang), **opts)
    df_fixed = pd.read_csv(fixed_file.format(lang), sep="\t", **opts)
    # Make sure the explicit version string is never null.
    df_fixed.loc[pd.isnull(df_fixed.orig_ver_string), "orig_ver_string"] = ""

    # Keep only rows at or before the cutoff. The explicit .copy() gives each
    # frame its own data, so the column assignments below do not write into a
    # filtered slice (which triggers SettingWithCopyWarning).
    fixed_in_window = (df_fixed.commit_ts <= CUTOFF_TS) & (df_fixed.release_ts_y <= CUTOFF_TS)
    df_fixed = df_fixed[fixed_in_window].copy()
    df_adoption = df_adoption[df_adoption.commit_ts <= CUTOFF_TS].copy()
    df_release = df_release[df_release.release_ts <= CUTOFF_TS].copy()

    # Epoch seconds -> datetime. Plain column assignment replaces the column
    # outright; `.loc[:, col] = ...` may instead try to write into the existing
    # column, with a result dtype that depends on the pandas version.
    df_fixed["commit_ts"] = pd.to_datetime(df_fixed["commit_ts"], unit="s")
    df_fixed["release_ts_y"] = pd.to_datetime(df_fixed["release_ts_y"], unit="s")
    df_adoption["commit_ts"] = pd.to_datetime(df_adoption["commit_ts"], unit="s")
    df_release["release_ts"] = pd.to_datetime(df_release["release_ts"], unit="s")

    DATASETS[lang] = (df_fixed, df_adoption, df_release)

In [None]:
# Columns that identify a project.
PROJECT_KEY = ["project_name", "project_github"]
# Lets the "adopted" side of df_fixed be stacked with the project columns.
rename_rules = {"adopted_name":"project_name", "adopted_github":"project_github"}

# One [lang, total, published, non-published] record per language.
general_df = []
for lang in LANG_ORDER:
    lang_ = clean_lang(lang)
    (df_fixed, df_adoption, df_release) = DATASETS[lang]

    print("\nLang %s" % lang)

    # Distinct projects with is_published == 1 in the release data.
    no_pub = df_release.query("is_published==1")[PROJECT_KEY].drop_duplicates().shape[0]

    # Distinct projects with is_published == 0, seen in either the release
    # data or the fixed/adopted data.
    no_gh_ = df_release.query("is_published==0")[PROJECT_KEY].drop_duplicates()
    no_gh2_ = df_fixed.query("is_published==0")[PROJECT_KEY].drop_duplicates()
    no_gh = pd.concat((no_gh_, no_gh2_)).drop_duplicates().shape[0]

    print("\nPublished projects %s" % no_pub)
    print("non-Published projects %s" % no_gh)

    # Every distinct project appearing anywhere: adoption rows, release rows,
    # and both sides (dependent and adopted) of df_fixed.
    uniq_dep = df_fixed[PROJECT_KEY].drop_duplicates()
    uniq_dep_adopt = df_fixed[["adopted_name", "adopted_github"]].drop_duplicates().rename(columns=rename_rules)

    no_total = pd.concat([
        df_adoption[PROJECT_KEY],
        df_release[PROJECT_KEY],
        uniq_dep_adopt,
        uniq_dep,
    ]).drop_duplicates().shape[0]

    print("\nAll initial sample %s" % no_total)
    general_df.append([lang_, no_total, no_pub, no_gh])

In [None]:
# Tabulate the per-language records collected in general_df.
general_stats = pd.DataFrame.from_records(general_df,columns=["lang", "total", "published","github"])
# print() with a single argument behaves the same on Python 2 and Python 3.
print(general_stats)

In [None]:
def _distinct_values_per_pair(df, column):
    """Count distinct values of ``column`` per (project_github, adopted_github) pair.

    Returns a NumPy array with one count per pair. ``len(x.unique())`` is used
    so that a NaN, if present, is counted as a value.
    """
    per_pair = df.groupby(["project_github", "adopted_github"])[column].agg(
        lambda s: len(s.unique())
    )
    return per_pair.values


# Summary statistics and their row labels, in matching order.
funcs = [np.min, percentile(5), np.mean, np.median, percentile(95), np.max]
lbls = ["min","5p","mean","median","95p","max"]

summary_df = []

for lang in LANG_ORDER:
    lang_ = clean_lang(lang)
    (df_fixed, df_adoption, df_release) = DATASETS[lang]

    print("\nLang %s" % lang)

    # "implicit" is computed from adopted_ver, "explicit" from orig_ver_string.
    implicit_counts = _distinct_values_per_pair(df_fixed, "adopted_ver")
    explicit_counts = _distinct_values_per_pair(df_fixed, "orig_ver_string")
    implicit = [fn(implicit_counts) for fn in funcs]
    explicit = [fn(explicit_counts) for fn in funcs]

    # Column order is pinned explicitly: the order pandas derives from a dict
    # differs between Python/pandas versions, and this frame is written to CSV.
    df_stats = pd.DataFrame(
        {"label":lbls, "implicit":implicit, "explicit":explicit},
        columns=["explicit", "implicit", "label"]
    )
    df_stats["lang"] = lang_
    summary_df.append(df_stats)
    print(df_stats)


In [None]:
# Stack the per-language summary frames into one long table.
updates_df = pd.concat(summary_df)
# index=False (the documented boolean) omits the row index from the CSV.
updates_df.to_csv(config.FIGURES+"update_counts.csv", index=False)

In [None]:
# Wide table: one row per language, one column per (measure, statistic) pair.
pivoted = pd.pivot_table(updates_df, index="lang", columns="label")

print(pivoted)
# Desired column order for the exported table.
mi = pd.MultiIndex.from_product([['explicit', 'implicit'], ['min','5p','median','mean', '95p','max']])
# reindex(columns=...) does what reindex_axis(mi, 1) did; reindex_axis was
# deprecated and later removed from pandas.
pivoted_order = pivoted.reindex(columns=mi)

with open(config.FIGURES+"update.tex","w") as fp:
    fp.write(pivoted_order.to_latex(float_format='%.2f'))


In [None]:
# Long layout: melt explicit/implicit into a "variable" column, then pivot so
# that each (variable, lang) pair is a row and each statistic a column.
stacked = pd.pivot_table(pd.melt(updates_df,id_vars=["lang","label"]), index=["variable","lang"], columns="label")
mi2 = pd.MultiIndex.from_product([['value'], ['min','5p','median','mean', '95p','max']])
# reindex(columns=...) does what reindex_axis(mi2, 1) did; reindex_axis was
# deprecated and later removed from pandas.
stacked = stacked.reindex(columns=mi2)

with open(config.FIGURES+"update_stacked.tex","w") as fp:
    fp.write(stacked.to_latex(float_format='%.2f'))

# TOTAL GRAPH

In [None]:

rows = []
for lang_ in LANG_ORDER:
    lang = clean_lang(lang_)
    # Two frames, each with a "date" column; presumably project-level and
    # release-level evolution data -- confirm against data_analysis_for_paper.
    reg_evo, ver_evo = dafp.get_evo_data(lang)

    # Most recent row of each frame, reduced to the columns of interest.
    # NOTE: "project_depedencies" (sic) is kept as spelled because a later
    # cell reads the column under that exact name.
    last_v = reg_evo.sort_values("date").tail(1)[["nodes","unique_relations","github_nodes","published_nodes"]]
    last_v = last_v.rename(columns={"nodes":"projects","unique_relations":"project_depedencies"})
    last_r = ver_evo.sort_values("date").tail(1)[["nodes","version_relations"]]
    last_r = last_r.rename(columns={"nodes":"releases","version_relations":"release_dependencies"})

    # concat(axis=1) aligns on index labels. Resetting both single-row frames
    # to index 0 guarantees one combined row even when the two source frames
    # label their last row differently (otherwise: two rows padded with NaN).
    row = pd.concat(
        (last_v.reset_index(drop=True), last_r.reset_index(drop=True)),
        axis=1
    )
    if lang == "Rust":
        # Rust override: count every project as published and none as GitHub-only.
        row["published_nodes"] = row["projects"]
        row["github_nodes"] = 0
    row["lang"] = lang
    rows.append(row)

totals = pd.concat(rows).set_index("lang")

In [None]:
# print() with a single argument behaves the same on Python 2 and Python 3.
print(totals)
print(general_stats)
# Add the "github" and "published" counts to the totals, matching rows on the
# language index (pd.merge defaults to an inner join).
totals2 = pd.merge(
    totals,
    general_stats.set_index("lang")[["github","published"]],
    left_index=True,
    right_index=True
)

In [None]:
# Render the totals as LaTeX with thousands separators and no decimals.
# Multiplying by 1.0 turns integer columns into floats; presumably this is so
# that float_format is applied to every column.
ltable = (totals2*1.0).to_latex(float_format='{:,.0f}'.format)
print(ltable)

with open(config.TABLES+"general_stat.tex","w") as fp:
    fp.write(ltable)

In [None]:
# Display the merged totals table as the cell output.
totals2

In [None]:
# Per-language ratios: dependencies per project, then dependencies per release.
# ("project_depedencies" is the column's actual, misspelled name.)
print(totals.project_depedencies / totals.projects)
print(totals.release_dependencies / totals.releases)

# FINAL MONTH STATS

In [None]:
# Load the two "last month" CSV exports from the figures directory.
# NOTE(review): "reg"/"ver" presumably mean project-level vs release-level data -- confirm.
final_reg = pd.read_csv(config.FIGURES+"last_month_data_reg.csv.gz")
final_ver = pd.read_csv(config.FIGURES+"last_month_data_ver.csv.gz")

In [None]:
# Per-language mean of the degree and dependency columns.
final_reg.groupby("lang")[[
    "outdegree",
    "indegree",
    "dependencies_unique",
    "dependents_unique",
    "dependencies_unique_direct",
    "dependents_unique_direct",
]].mean()

In [None]:
# Number of non-null "indegree" entries per language.
final_ver.groupby("lang")[["indegree"]].count()

In [None]:
# Row count per language, most frequent first.
final_ver["lang"].value_counts()