In [1]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve, RocCurveDisplay
from sklearn.linear_model import LassoCV

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
data = pd.read_csv('Data/train.csv')

In [3]:
# convert the categorical into the numerical
from sklearn.preprocessing import OrdinalEncoder

oe = OrdinalEncoder()

cat_feat = ['gender','ever_married','work_type','Residence_type','smoking_status']
data[cat_feat] = oe.fit_transform(data[cat_feat])

In [4]:
data['gender'] = data['gender'].astype(int)
data['ever_married'] = data['ever_married'].astype(int)
data['work_type'] = data['work_type'].astype(int)
data['Residence_type'] = data['Residence_type'].astype(int)
data['smoking_status'] = data['smoking_status'].astype(int)

In [5]:
# standardising the numerical feature
from sklearn.preprocessing import StandardScaler

scale = StandardScaler()
scale_feat = ['age','avg_glucose_level','bmi']
feat = ['gender','age','hypertension','heart_disease','ever_married',
       'work_type','Residence_type','avg_glucose_level','bmi',
       'smoking_status']
data[scale_feat] = scale.fit_transform(data[scale_feat]) 
train_X = data[feat]
train_y = data['stroke']

In [6]:
data.shape

(15304, 12)

In [7]:
test = pd.read_csv('Data/test.csv')

test[cat_feat] = oe.transform(test[cat_feat])

test['gender'] = test['gender'].astype(int)
test['ever_married'] = test['ever_married'].astype(int)
test['work_type'] = test['work_type'].astype(int)
test['Residence_type'] = test['Residence_type'].astype(int)
test['smoking_status'] = test['smoking_status'].astype(int)


In [8]:
X = data[feat]
Y = data['stroke']

cv_scores, roc_auc_scores = list(), list()
preds = list()

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits = 5, random_state = 42, shuffle = True)

for i in range(5):

    for train_ix, test_ix in skf.split(X, Y):

        ## Splitting the data 
        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
        Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]

        ## Scaling the data
        X_train[scale_feat] = scale.fit_transform(X_train[scale_feat])
        X_test[scale_feat] = scale.transform(X_test[scale_feat])


        test[scale_feat] = scale.transform(test[scale_feat]) 
        test_X = test[feat]

        ## Building logistic model
        LogReg =  LGBMClassifier(n_estimators = 1000,
                                    max_depth = 7,
                                    learning_rate = 0.01,
                                    num_leaves = 20,
                                    lambda_l1 = 3,
                                    lambda_l2 = 3,
                                    bagging_fraction = 0.7,
                                    feature_fraction = 0.7)
        LogReg.fit(X_train, Y_train)

        ## Predicting on X_test and test
        logit_pred_1 = LogReg.predict_proba(X_test)[:, 1]
        logit_pred_2 = LogReg.predict_proba(test_X)[:, 1]

        ## Computing roc-auc score
        roc_auc_scores.append(roc_auc_score(Y_test, logit_pred_1))
        preds.append(logit_pred_2)

    cv_scores.append(np.mean(roc_auc_scores))
lgb_cv_score = np.mean(cv_scores)    
lgbm_preds = pd.DataFrame(preds).apply(np.mean, axis = 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = val

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = val

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = val

In [9]:
X = data[feat]
Y = data['stroke']

cv_scores, roc_auc_scores = list(), list()
preds = list()

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits = 5, random_state = 42, shuffle = True)

for i in range(5):

    for train_ix, test_ix in skf.split(X, Y):

        ## Splitting the data 
        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
        Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]

        ## Scaling the data
        X_train[scale_feat] = scale.fit_transform(X_train[scale_feat])
        X_test[scale_feat] = scale.transform(X_test[scale_feat])


        test[scale_feat] = scale.transform(test[scale_feat]) 
        test_X = test[feat]

        ## Building logistic model
        LogReg =  LassoCV(precompute="auto",
                    fit_intercept=True,
                    max_iter=1000,
                    verbose=False,
                    eps=1e-04,
                    n_alphas=1000,
                    n_jobs=8,)
        LogReg.fit(X_train, Y_train)

        ## Predicting on X_test and test
        logit_pred_3 = LogReg.predict(X_test)
        logit_pred_4 = LogReg.predict(test_X)

        ## Computing roc-auc score
        roc_auc_scores.append(roc_auc_score(Y_test, logit_pred_3))
        preds.append(logit_pred_4)

    cv_scores.append(np.mean(roc_auc_scores))
lsso_cv_score = np.mean(cv_scores)    
lsso_preds = pd.DataFrame(preds).apply(np.mean, axis = 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = val

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = val

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = val

In [11]:
wtot = lgb_cv_score + lsso_cv_score
w1 = lgb_cv_score / wtot
w2 = lsso_cv_score / wtot


ensemble_pred = w1*lgbm_preds + w2*lsso_preds 
test['stroke'] = ensemble_pred
test[['id','stroke']].to_csv('result.csv',index=False)