In [1]:
import pandas as pd
from sklearn.utils import resample 
import numpy as np
from sklearn.metrics import mean_squared_error

In [2]:
data = pd.read_csv('framingham.csv')

In [6]:
data = data.fillna(data.mean())

In [7]:
X = data[['cigsPerDay', 'prevalentHyp','currentSmoker','sysBP','diaBP']].values

In [8]:
y = data['diabetes'].values

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)


# Logistic regression

In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
model = LogisticRegression()

In [14]:
model.fit(X_train, y_train)

LogisticRegression()

In [15]:
prediction = model.predict(X_test)

In [16]:
mean_squared_error(prediction, y_test)

0.024174528301886794

In [17]:
model.score(X_test, y_test)

0.9758254716981132

# K Fold cross validation

In [15]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=2)

score = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    score.append(
        mean_squared_error(model.fit(X_train, y_train).predict(X_test), y_test)
    )

np.mean(score)

NameError: name 'model' is not defined

# Bagging

In [13]:
from sklearn.ensemble import BaggingClassifier

In [14]:
model = BaggingClassifier(base_estimator=LogisticRegression(), n_estimators=5, warm_start=False)

In [15]:
model.fit(X_train, y_train)

BaggingClassifier(base_estimator=LogisticRegression(), n_estimators=5)

In [16]:
prediction = model.predict(X_test)

In [17]:
from sklearn.metrics import mean_squared_error
mean_squared_error(prediction, y_test)

0.025943396226415096

In [18]:
model.score(X_test, y_test)

0.9740566037735849

# Stacking

In [13]:
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier

In [14]:
estimators = [
    ('rf', RandomForestClassifier(n_estimators=10))
]

In [15]:
model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

In [16]:
model.fit(X_train, y_train)

StackingClassifier(estimators=[('rf', RandomForestClassifier(n_estimators=10))],
                   final_estimator=LogisticRegression())

In [17]:
prediction = model.predict(X_test)

In [18]:
from sklearn.metrics import mean_squared_error
mean_squared_error(prediction, y_test)

0.027712264150943397

In [19]:
model.score(X_test, y_test)

0.9722877358490566

# Boosting

In [20]:
from sklearn.ensemble import GradientBoostingClassifier

In [62]:
model = GradientBoostingClassifier(n_estimators=210, learning_rate=2, criterion='mse').fit(X_train, y_train)

In [63]:
model.score(X_test, y_test)

0.9711084905660378