# **Load Data**

In [None]:
# Using: https://towardsdatascience.com/3-ways-to-load-csv-files-into-colab-7c14fcbdcb92
# Code to read csv file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# Shareable Google Drive link to the source file; the file ID is the path segment after "/d/".
link = 'https://drive.google.com/file/d/1DY2Ywb2mhmD2dqgkz3M90dIAAsEsIQxQ/view?usp=share_link'

In [None]:
# Extract the Drive file ID: the path segment that follows "/d/" in the link.
# Partitioning on the first "/d/" (instead of splitting on every "d/") keeps this
# working when the ID itself ends in "d", and trimming at the next "/" or "?"
# means the link does not have to end in "/view...".
prefix, marker, tail = link.partition('/d/')
if not marker:
    raise ValueError(f"No '/d/<file id>' segment found in link: {link!r}")
# NOTE: `id` shadows the builtin, but the name is kept because the download cell reads it.
id = tail.split('/', 1)[0].split('?', 1)[0]
print(id)

1DY2Ywb2mhmD2dqgkz3M90dIAAsEsIQxQ


# **Create Pandas Dataframe**


In [None]:
import pandas as pd

In [None]:
# Fetch the Drive file identified by `id` to local disk, then parse it with pandas.
csv_path = 'Filename.csv'  # local copy written by GetContentFile and read back below

downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile(csv_path)

# The dataset now lives in a pandas DataFrame.
df = pd.read_csv(csv_path)

In [None]:
# Show the first five rows as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,ever_worked,child_job,govt_job,private_job,self_employed,residence_type,avg_glucose_level,bmi,smokes,formerly_smoked,stroke
0,1,1,55,0,0,1,1,0,0,1,0,1,89.17,31.5,0,0,0
1,2,0,42,0,0,0,1,0,0,1,0,1,98.53,18.5,0,0,0
2,3,0,24,0,0,0,1,0,0,1,0,1,97.55,26.2,0,0,0
3,4,0,33,0,0,1,1,0,0,1,0,0,86.97,42.2,0,0,0
4,5,0,20,0,0,0,1,0,0,1,0,0,84.07,27.6,1,0,0


# **L1-Regularized (Lasso-Penalized) Logistic Regression**

In [None]:
# Target is the `stroke` column; features are every other column except the `id` key.
target_col = "stroke"
y = df[target_col]
X = df.drop(columns=["id", target_col])

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the feature columns: learn the scaling statistics from X, then
# apply them to X. `scaler` stays fitted for reuse.
scaler = StandardScaler().fit(X)
X_std = scaler.transform(X)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [None]:
# Tune the inverse regularization strength C of an L1-penalized logistic
# regression with a cross-validated grid search, then keep the winning model.
RANDOM_STATE = 42  # liblinear uses a random number generator; fix the seed so re-runs match
param = {'C': [10**-2,10**-1,10**0,10**1,10**2]}

lr_model = LogisticRegression(penalty='l1', solver='liblinear', random_state=RANDOM_STATE)
# NOTE(review): no `scoring` is given, so candidates are ranked by the estimator's
# default score (accuracy). If `stroke` is heavily imbalanced - not verifiable
# from this cell - a metric such as 'roc_auc' may be a better selection criterion.
# NOTE(review): X_std was standardized on the full dataset before this search, so
# the validation folds contributed to the scaling statistics.
gs_model = GridSearchCV(estimator=lr_model, param_grid=param)
gs_model.fit(X_std, y)

# GridSearchCV (refit=True by default) has already refit the best configuration
# on all of X_std / y, so reuse that estimator instead of rebuilding the model by
# hand and repeating the penalty/solver settings in a second place.
model = gs_model.best_estimator_
model

LogisticRegression(C=0.01, penalty='l1', solver='liblinear')

In [None]:
# Take the first row of the fitted model's coefficient matrix and display it
# as the cell output.
coef = model.coef_[0]
coef

array([0.        , 0.34058192, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])

In [None]:
# Report the features whose coefficient is exactly zero in the fitted model.
# The count and the names both come from one mask built from `model`, so they
# cannot disagree if an earlier cached copy of the coefficients is stale.
zero_mask = model.coef_[0] == 0
print("Redundant Feature Count: ", zero_mask.sum())
print("Redundant Feature Names: ", X.columns[zero_mask].tolist())

Redundant Feature Count:  14
Redundant Feature Names:  ['gender', 'hypertension', 'heart_disease', 'ever_married', 'ever_worked', 'child_job', 'govt_job', 'private_job', 'self_employed', 'residence_type', 'avg_glucose_level', 'bmi', 'smokes', 'formerly_smoked']
