In [1]:
# Funciones auxiliares
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.cm as cm
from scipy.stats import boxcox
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.utils import resample
import sklearn.model_selection as ms
import sklearn.preprocessing as pr
if not(os.path.isdir("tarea")):
    mkdir("tarea")
def dataScaled(df):
    cat = [var for var in df.columns if not(np.issubdtype(df[var].dtype,np.number))]
    num = df.drop(cat,axis=1)
    # Creating dictionary to store the different data frames
    data = {"original":df}
    # Standarizing data to have mean 0 and variance 1
    scaler = pr.StandardScaler()
    scaler.fit(num)
    data["standarized"] = pd.DataFrame(scaler.transform(num),index=num.index,columns=num.columns)
    data["standarized"][cat] = df[cat]
    data["standarized"] = data["standarized"][df.columns]
    # Centering data to have variance 1 
    scaler = pr.StandardScaler(with_mean=False)
    scaler.fit(num)
    data["withmean"] = pd.DataFrame(scaler.transform(num),index=num.index,columns=num.columns)
    data["withmean"][cat] = df[cat]
    data["withmean"] = data["withmean"][df.columns]
    return data

def boxcoxLambdaTable(df,resCol,alpha=0.05):
    names = []
    lambdas = []
    intervalsBot = []
    intervalsTop = []
    for col in df.columns:
        if col != resCol and np.issubdtype(df[col].dtype,np.number):
            if (df[col]>0).prod():
                names.append(col)
                bx = boxcox(df[col],alpha=alpha)
                lambdas.append(bx[1])
                intervalsBot.append(bx[2][0])
                intervalsTop.append(bx[2][1])
            else:
                print("Can't convert column {0}: not entirely positive".format(col) )
    fin = pd.DataFrame.from_dict({"lambda":lambdas,"Lower confidence interval, alpha = {0}".format(alpha):intervalsBot,"Upper confidence interval, alpha = {0}".format(alpha):intervalsTop})
    fin.index = names
    return fin.transpose()

def bootstrap(df):
    return resample(df,n_samples=df.shape[0]).reset_index(drop=True)

def KFold_strat(X,y,**kwargs):
    splitter = ms.StratifiedKFold(**kwargs)
    iterator = splitter.split(X,y)
    X_train = []
    y_train = []
    X_test = []
    y_test = []
    for train_index, test_index in iterator:
        X_train.append(X.iloc[train_index,:])
        y_train.append(y.iloc[train_index])
        X_test.append(X.iloc[test_index,:])
        y_test.append(y.iloc[test_index])
    return X_train,X_test,y_train,y_test

def KFold(X,y,**kwargs):
    splitter = ms.KFold(**kwargs)
    iterator = splitter.split(X,y)
    X_train = []
    y_train = []
    X_test = []
    y_test = []
    for train_index, test_index in iterator:
        X_train.append(X.iloc[train_index,:])
        y_train.append(y.iloc[train_index])
        X_test.append(X.iloc[test_index,:])
        y_test.append(y.iloc[test_index])
    return X_train,X_test,y_train,y_test

def apparentErrorRate(mod,df,resCol):
    X = df.drop(resCol,axis=1)
    y = df[resCol]
    fit = mod.fit(X,y)
    res = fit.predict(X)
    fin = np.mean(y != res)
    classes = list(set(y))
    perClass = np.zeros(len(classes))
    for j,val in enumerate(classes):
        curr = X[y==val]
        res = fit.predict(curr)
        perClass[j] = np.mean(y[y==val] != res)
    final = [fin]
    final.extend(perClass)
    return final,[0 for f in final]

def trainTestErrorRates(mod,df,resCol,n=100,size=0.5,equalRatios=False):
    X = df.drop(resCol,axis=1)
    y = df[resCol]
    fin = np.zeros(n)
    classes = list(set(y))
    perClass = [np.zeros(n) for val in classes]
    for i in range(n):
        if size < 1.0 and size > 0:
            if equalRatios:
                X_train, X_test, y_train, y_test = ms.train_test_split(X,y,train_size=size,stratify=y)
            else:
                X_train, X_test, y_train, y_test = ms.train_test_split(X,y,train_size=size)
            fit = mod.fit(X_train,y_train)
            res = fit.predict(X_test)
            fin[i] = np.mean(y_test != res)
            for j,val in enumerate(classes):
                curr = X[y==val]
                res = fit.predict(curr)
                perClass[j][i] = np.mean(y[y==val] != res)
        else:
            raise ValueError("Size {0} is not in (0,1)".format(size))
    final1 = [np.mean(fin)]
    final2 = [np.std(fin)] 
    for cla in perClass:
        final1.append(np.mean(cla))
        final2.append(np.std(cla))
    return final1, final2

def bootstrapErrorRate(mod,df,resCol,n=100):
    fin = []
    for i in range(n):
        newdf = bootstrap(df)
        errors = apparentErrorRate(mod,newdf,resCol)
        fin.append(errors)
    fin = np.transpose(fin)
    final1 = [np.mean(f) for f in fin]
    final2 = [np.std(f) for f in fin]
    return final1 , final2

def crossValidationErrorRate(mod,df,resCol,k=2,equalRatios=True,n=100):
    errors = np.zeros(n)
    classes = list(set(df[resCol]))
    perClass = [np.zeros(n) for val in classes]
    if equalRatios:
        splitFunc = KFold_strat
    else:
        splitFunc = KFold
    for i in range(n):
        X_train,X_test,y_train,y_test = splitFunc(df.drop(resCol,axis=1),df[resCol],n_splits=k,shuffle=True)
        temps = np.zeros(k)
        classTemps = [np.zeros(k) for val in classes]
        for j in range(k):
            fit = mod.fit(X_train[j],y_train[j])
            res = fit.predict(X_test[j])
            temps[j] = np.mean(y_test[j] != res)
            for l,val in enumerate(classes):
                curr = X_test[j][y_test[j]==val]
                res = fit.predict(curr)
                classTemps[l][j] = np.mean(y_test[j][y_test[j]==val] != res)
        errors[i] = np.mean(temps)
        for l,val in enumerate(classes):
            perClass[l][i] = np.mean(classTemps[l])
    final1 = [np.mean(errors)]
    final2 = [np.std(errors)]
    final1.extend([np.mean(v) for v in perClass])
    final2.extend([np.std(v) for v in perClass])
    return final1, final2

def resamplingComparison(model,df,resCol,k=5,n=100,size=0.5,equalRatios = True,stds=False):
    classes = list(set(df[resCol]))
    names = ["Normal","Bootstrap","Training/Test, fraction = {0}".format(size),"Cross validation, k = {0}".format(k)]
    errors = [apparentErrorRate(model,df,resCol), 
              bootstrapErrorRate(model,df,resCol,n=n), 
              trainTestErrorRates(model,df,resCol,n=n,size=size,equalRatios=equalRatios),
              crossValidationErrorRate(model,df,resCol,k=k,equalRatios=equalRatios,n=n)]
    cols = ["Global"]
    cols += ["Class {0}".format(c) for c in classes]
    cols += ["Global STD"]
    cols += ["Class {0} STD".format(c) for c in classes]
    res = pd.DataFrame(columns = cols)
    for i,tab in enumerate(errors):
        res.loc[i] = tab[0] + tab[1]
    res["method"] = names
    res = res[np.roll(res.columns.to_list(),1)]
    if not(stds):
        res = res.iloc[:,range(res.shape[1])[:-(len(classes)+1)]]
    return res

def modelComparison(models,dfs,resCol,errorFunc,names=[],stds=False):
    classes = list(set(df[resCol]))
    if type(dfs)!= list:
        dfs = [dfs for m in models]
    if not bool(names):
        names = [str(mod).split("(")[0] for mod in models]
    elif len(names) != len(models) or len(dfs)!=len(models):
        raise ValueError("length of names, models and dfs do not match")
    cols = ["Global"]
    cols += ["Class {0}".format(c) for c in classes]
    cols += ["Global STD"]
    cols += ["Class {0} STD".format(c) for c in classes]
    res = pd.DataFrame(columns = cols)
    for i,mod in enumerate(models):
        tab = errorFunc(mod,dfs[i],resCol)
        res.loc[i] = tab[0] + tab[1]
    res["Model"] = names
    res = res[np.roll(res.columns.to_list(),1)]
    if not(stds):
        res = res.iloc[:,range(res.shape[1])[:-(len(classes)+1)]]
    return res
    

In /home/aldo/.local/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The text.latex.unicode rcparam was deprecated in Matplotlib 3.0 and will be removed in 3.2.
In /home/aldo/.local/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The savefig.frameon rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.
In /home/aldo/.local/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The pgf.debug rcparam was deprecated in Matplotlib 3.0 and will be removed in 3.2.
In /home/aldo/.local/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The verbose.level rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.
In /home/aldo/.local/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The verbose.fileo rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.


In [2]:
# Problema 1
df = pd.read_csv("pimate.csv")
df = df.append(pd.read_csv("pimatr.csv"),ignore_index=True)
print(df.head())

   npreg  glu  bp  skin   bmi    ped  age type
0      5   86  68    28  30.2  0.364   24   No
1      7  195  70    33  25.1  0.163   55  Yes
2      5   77  82    41  35.8  0.156   35   No
3      0  165  76    43  47.9  0.259   26   No
4      0  107  60    25  26.4  0.133   23   No


In [3]:
data = dataScaled(df)

In [4]:
models = [
    LinearDiscriminantAnalysis(),
    GaussianNB(),
    LogisticRegression(dual=False,max_iter=10**6),
    SVC()
]

In [5]:
df1 = data["original"].copy()
df1["ped*age"] = df1["ped"]*df1["age"]
df2 = data["original"].copy()
df2["ped*bp"] = df2["ped"]*df2["bp"]
df3 = data["original"][["glu","bmi","ped","age","type"]].copy()
df3["age^2"] = df3["age"]*df3["age"]
df4 = data["original"].copy()
dfs = [df1,df2,df3,df4]

In [6]:
tables = [resamplingComparison(models[i],dfs[i],"type",n=5) for i in range(len(models))]
tables

[                          method    Global  Class Yes  Class No
 0                         Normal  0.204887   0.412429  0.101408
 1                      Bootstrap  0.106391   0.200387  0.057833
 2  Training/Test, fraction = 0.5  0.230075   0.411299  0.120000
 3        Cross validation, k = 5  0.212756   0.417841  0.110423,
                           method    Global  Class Yes  Class No
 0                         Normal  0.219925   0.344633  0.157746
 1                      Bootstrap  0.120301   0.175263  0.093684
 2  Training/Test, fraction = 0.5  0.239098   0.361582  0.167887
 3        Cross validation, k = 5  0.231927   0.350349  0.172958,
                           method    Global  Class Yes  Class No
 0                         Normal  0.206767   0.389831  0.115493
 1                      Bootstrap  0.106579   0.186202  0.064244
 2  Training/Test, fraction = 0.5  0.214286   0.380791  0.121690
 3        Cross validation, k = 5  0.204542   0.384381  0.114930,
                      

In [7]:
n = 500
names = ["Analisis de Discriminante Lineal","Naive Bayes","Regresión Logística","Support Vector Machine"]
errors = [apparentErrorRate,lambda model,df,resCol: bootstrapErrorRate(model,df,resCol,n=n), 
              lambda model,df,resCol : trainTestErrorRates(model,df,resCol,n=n,size=0.75,equalRatios=True),
              lambda model,df,resCol : crossValidationErrorRate(model,df,resCol,k=5,equalRatios=True,n=50)]

In [8]:
res = []
for err in errors[:-1]:
    print(str(err))
    res.append(modelComparison(models,dfs,"type",err,names=names))

<function apparentErrorRate at 0x7f7849376c80>
<function <lambda> at 0x7f784939a6a8>
<function <lambda> at 0x7f784939a620>


In [9]:
res

[                              Model    Global  Class Yes  Class No
 0  Analisis de Discriminante Lineal  0.204887   0.412429  0.101408
 1                       Naive Bayes  0.219925   0.344633  0.157746
 2               Regresión Logística  0.206767   0.389831  0.115493
 3            Support Vector Machine  0.216165   0.480226  0.084507,
                               Model    Global  Class Yes  Class No
 0  Analisis de Discriminante Lineal  0.101107   0.198908  0.052251
 1                       Naive Bayes  0.113115   0.173037  0.083392
 2               Regresión Logística  0.100558   0.190955  0.055965
 3            Support Vector Machine  0.108150   0.237657  0.044203,
                               Model    Global  Class Yes  Class No
 0  Analisis de Discriminante Lineal  0.215353   0.409910  0.106304
 1                       Naive Bayes  0.233895   0.347921  0.170062
 2               Regresión Logística  0.208105   0.383921  0.114377
 3            Support Vector Machine  0.223835

In [10]:
tab = "p{3cm}"
for col in res[0].columns:
    tab += "|c"
res[0].to_latex(buf=os.path.join("tarea","41-apparent.tex"),float_format="{:0.4f}".format,index=False,column_format=tab)
res[1].to_latex(buf=os.path.join("tarea","41-boot.tex"),float_format="{:0.4f}".format,index=False,column_format=tab)
res[2].to_latex(buf=os.path.join("tarea","41-traintest.tex"),float_format="{:0.4f}".format,index=False,column_format=tab)

In [11]:
# Problema 2
df = pd.read_csv("cad1.csv",index_col=0)
resCol="CAD"
dfCoded = df.copy()
for col in df.columns: 
    if col!=resCol and df[col].dtype==np.dtype("O"):
        dfCoded[col] = df[col].astype("category").cat.codes
print(dfCoded.head())

   Sex  AngPec  AMI  QWave  QWavecode  STcode  STchange  SuffHeartF  \
1    1       1    1      0          1       1         0           0   
2    1       0    1      0          1       1         0           0   
3    0       1    0      0          1       1         0           0   
4    1       1    1      0          1       0         0           0   
5    1       1    1      0          1       0         0           0   

   Hypertrophi  Hyperchol  Smoker  Inherit  Heartfail CAD  
1            0          0       0        0          0  No  
2            0          0       0        0          0  No  
3            0          0       0        0          0  No  
4            0          0       0        0          0  No  
5            0          0       0        0          0  No  


In [12]:
models = [
    GaussianNB(),
    LogisticRegression(dual=False,max_iter=10**6),
    SVC()
]

In [13]:
df1 = dfCoded.copy()
df1["Sex*AMI"] = df1["Sex"]*df1["AMI"]
df2 = dfCoded[["AngPec","AMI","STcode","STchange","Hyperchol","CAD"]].copy()
df3 = dfCoded.copy()
dfs = [df1,df2,df3]

In [14]:
n = 500
names = ["Naive Bayes","Regresión Logística","Support Vector Machine"]
errors = [apparentErrorRate,
              lambda model,df,resCol : trainTestErrorRates(model,df,resCol,n=n,size=0.75,equalRatios=True),
              lambda model,df,resCol : crossValidationErrorRate(model,df,resCol,k=5,equalRatios=True,n=n)]

In [15]:
res = []
for err in errors:
    print(str(err))
    res.append(modelComparison(models,dfs,"CAD",err,names=names))

<function apparentErrorRate at 0x7f7849376c80>
<function <lambda> at 0x7f784939a9d8>
<function <lambda> at 0x7f78493a09d8>


In [16]:
tab = "p{3cm}"
for col in res[0].columns:
    tab += "|c"
res[0].to_latex(buf=os.path.join("tarea","42-apparent.tex"),float_format="{:0.4f}".format,index=False,column_format=tab)
res[1].to_latex(buf=os.path.join("tarea","42-traintest.tex"),float_format="{:0.4f}".format,index=False,column_format=tab)
res[2].to_latex(buf=os.path.join("tarea","42-crossval.tex"),float_format="{:0.4f}".format,index=False,column_format=tab)

In [17]:
# Problema 3
df = pd.read_csv("Glucose1.txt",index_col="Patient")
#df["Class"] = df["Class"].astype("O")
data = dataScaled(df)
print(data["original"].head())

         Weight  Fglucose  GlucoseInt  InsulinResp  InsulineResist  Class
Patient                                                                  
1          0.81        80         356          124              55      3
2          0.95        97         289          117              76      3
3          0.94       105         319          143             105      3
4          1.04        90         356          199             108      3
5          1.00        90         323          240             143      3


In [18]:
modpredict = data["original"][["InsulinResp","Class"]].copy()
interactions = ["Fglucose*InsulinResp","GlucoseInt*InsulinResp"]
for inter in interactions:
    columns = inter.split("*")
    modpredict[inter] = data["original"][columns].product(axis=1)

In [19]:
n = 500
names = ["Regresión logística"]
errors = [apparentErrorRate,lambda model,df,resCol: bootstrapErrorRate(model,df,resCol,n=n), 
              lambda model,df,resCol : trainTestErrorRates(model,df,resCol,n=n,size=0.75,equalRatios=True),
              lambda model,df,resCol : crossValidationErrorRate(model,df,resCol,k=5,equalRatios=True,n=n)]

In [20]:
res = []
for err in errors:
    print(str(err))
    res.append(modelComparison([LogisticRegression(dual=False,max_iter=10**6)],[modpredict],"Class",err,names=names))

<function apparentErrorRate at 0x7f7849376c80>
<function <lambda> at 0x7f7847334620>
<function <lambda> at 0x7f78473347b8>
<function <lambda> at 0x7f7847334840>


In [21]:
tab = "p{3cm}"
for col in res[0].columns:
    tab += "|c"
res[0].to_latex(buf=os.path.join("tarea","43-apparent.tex"),float_format="{:0.4f}".format,index=False,column_format=tab)
res[1].to_latex(buf=os.path.join("tarea","43-boot.tex"),float_format="{:0.4f}".format,index=False,column_format=tab)
res[2].to_latex(buf=os.path.join("tarea","43-traintest.tex"),float_format="{:0.4f}".format,index=False,column_format=tab)
res[3].to_latex(buf=os.path.join("tarea","43-crossval.tex"),float_format="{:0.4f}".format,index=False,column_format=tab)