In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from pandas_profiling import ProfileReport
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import (
    GridSearchCV,
    RepeatedStratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
from xgboost import XGBClassifier
%matplotlib inline

In [None]:
# Read the competition training set and discard its "id" column.
df = pd.read_csv("/kaggle/input/playground-series-s3e2/train.csv").drop(columns="id")
df.head()

In [None]:
# Load the original stroke dataset, drop its "id" column and fill missing
# bmi values with the column median.
original_df = pd.read_csv(
    "/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
).drop(columns="id")
original_df["bmi"] = original_df["bmi"].fillna(original_df["bmi"].median())

# Add only the rows whose "stroke" value is truthy to the training frame.
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# Original index labels are kept, exactly as append did.
df = pd.concat([df, original_df[original_df["stroke"].astype(bool)]])
df.head()

In [None]:
# Remove rows that are exact duplicates of an earlier row.
df = df.drop_duplicates()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
#df["bmi"] = df["bmi"].fillna(df["bmi"].mean())

In [None]:
"""profile = ProfileReport(df)
profile.to_file(output_file="StrokePrediction.html")"""

In [None]:
def freq(df):
    """Plot every column of ``df`` broken down by the ``stroke`` column.

    Float-typed columns are drawn as stacked histograms; all other columns
    are drawn as count plots. Each plot is shown with ``plt.show()`` before
    the next column is drawn.

    Args:
        df: DataFrame that contains a ``stroke`` column used as the hue.
    """
    plt.figure()
    # Column labels with a float dtype; everything else is treated as discrete.
    float_columns = set(df.select_dtypes(include='float').columns)
    for column in df.columns:
        if column in float_columns:
            sns.histplot(data=df, x=column, hue='stroke', multiple="stack")
        else:
            sns.countplot(data=df, x=column, hue='stroke')
        plt.show()


In [None]:
#freq(df)

In [None]:
def evaluteModel(model, X_train, y_train):
    """Print the mean 5-fold cross-validated ROC AUC of ``model``, then fit it.

    Args:
        model: estimator passed to ``cross_val_score`` and fitted afterwards.
        X_train: feature matrix.
        y_train: target values.

    Returns:
        None. The score is printed and ``model.fit`` is called on the full
        ``X_train`` / ``y_train``.
    """
    fold_scores = cross_val_score(
        model,
        X_train,
        y_train,
        cv=5,
        scoring='roc_auc',
    )
    mean_score = fold_scores.mean()
    print(model, ":", mean_score)
    model.fit(X_train, y_train)

In [None]:
# Show how many rows fall into each distinct "smoking_status" value.
df["smoking_status"].value_counts()

In [None]:
def FE(df):
    """Build the model's feature frame from a raw data frame.

    Two indicator columns are added and the result is one-hot encoded:

    * ``ge30``   - 1 when ``age`` is at least 30, otherwise 0.
    * ``worked`` - 1 for 'Private', 'Self-employed' and 'Govt_job',
      0 for 'Never_worked' and 'children'.

    The input frame is not modified; all work happens on a copy.

    Args:
        df: DataFrame with at least ``age`` and ``work_type`` columns.

    Returns:
        A new DataFrame produced by ``pd.get_dummies(..., drop_first=True)``
        on the augmented copy.

    Raises:
        KeyError: if ``work_type`` contains a value missing from the mapping.
    """
    ed_work_type = {'Private':1, 'Self-employed':1, 'Govt_job':1, 'Never_worked':0, 'children':0}

    # Work on a copy so the caller's frame does not silently gain columns.
    features = df.copy()
    features["ge30"] = (features["age"] >= 30).astype(int)
    # Plain dict lookup on purpose: an unexpected work_type raises KeyError
    # instead of quietly producing a missing value.
    features["worked"] = [ed_work_type[kind] for kind in features["work_type"]]
    return pd.get_dummies(features, drop_first=True)

In [None]:
# Split off the target, shuffle features and target together with a fixed
# seed, then run feature engineering on the features.
y = df["stroke"]
X = df.drop(columns="stroke")
X, y = shuffle(X, y, random_state=42)
X = FE(X)

In [None]:
"""temp = X.copy()
temp["stroke"] = y
freq(temp)"""

In [None]:
# Preview the first rows of the engineered feature frame.
X.head()

In [None]:
def classifiersScore(X, y):
    """Run ``evaluteModel`` on a fixed line-up of classifiers.

    Args:
        X: feature matrix handed to every classifier.
        y: target values.
    """
    candidates = (
        KNeighborsClassifier(n_neighbors=9),
        GaussianNB(),
        RandomForestClassifier(random_state=0),
        DecisionTreeClassifier(random_state=0),
        # Lasso(random_state=0) is intentionally left out, as in the original line-up.
        LogisticRegression(random_state=0, solver='lbfgs', max_iter=1000),
        CatBoostClassifier(random_state=0, verbose=False),
        XGBClassifier(random_state=0),
    )
    for candidate in candidates:
        evaluteModel(candidate, X, y)
    

In [None]:
#classifiersScore(X, y)

In [None]:
"""cor = df.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)"""

In [None]:
# Min-max scale the float-typed columns of X. `scaler` stays a module-level
# name because submit() reuses it to transform the test set.
scaler = MinMaxScaler()
cont_col = X.select_dtypes(include='float').columns
X[cont_col] = scaler.fit_transform(X[cont_col])
X.head()

In [None]:
# Oversample with SMOTE (fixed seed) and replace X / y with the resampled data.
# NOTE(review): the grid search further down cross-validates on this already
# resampled X / y, so its validation folds contain SMOTE-generated rows and the
# reported ROC AUC may be optimistic. Consider resampling inside each fold
# (e.g. an imblearn Pipeline) - confirm before trusting the CV score.
sm = SMOTE(random_state=42)
X,y = sm.fit_resample(X,y)

In [None]:
#classifiersScore(X, y)

In [None]:
def submit(model):
    """Predict stroke probabilities for the competition test set and write
    them to ``submission.csv``.

    The test data goes through the same steps as the training data: ``FE``
    for feature engineering, then the module-level ``scaler`` on its
    float-typed columns. The predicted class counts are printed as a quick
    sanity check before the file is written.

    Args:
        model: fitted classifier exposing ``predict`` and ``predict_proba``.

    Returns:
        None. Writes ``submission.csv`` with columns ``id`` and ``stroke``.
    """
    test = pd.read_csv("/kaggle/input/playground-series-s3e2/test.csv")
    ids = test["id"]
    test = FE(test.drop(columns="id"))

    cont_col = test.select_dtypes(include='float').columns
    test[cont_col] = scaler.transform(test[cont_col])

    # One-hot encoding the test set on its own can yield a different set or
    # order of dummy columns than training (e.g. a category absent from the
    # test data). When the model recorded its training column names, align to
    # them: missing dummies become 0, unseen extras are dropped.
    train_columns = getattr(model, "feature_names_in_", None)
    if train_columns is not None:
        test = test.reindex(columns=train_columns, fill_value=0)

    print(pd.Series(model.predict(test)).value_counts())
    pred = model.predict_proba(test)[:, 1]  # probability of the second class

    submission = pd.DataFrame({"id": ids, "stroke": pred})
    submission.to_csv("submission.csv", index=False)

In [None]:
"""model = DecisionTreeClassifier(random_state=0)
model.fit(X, y)
submit(model)"""

In [None]:
# Grid search over LogisticRegression solver / C with repeated stratified
# 10-fold CV (3 repeats), scored by ROC AUC.
# RepeatedStratifiedKFold and GridSearchCV are imported in the notebook's
# top import cell rather than here.
model = LogisticRegression(max_iter=1000, random_state=0)
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

# define grid search
grid = dict(solver=solvers, penalty=penalty, C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    n_jobs=-1,
    cv=cv,
    scoring='roc_auc',
    error_score=0,  # a parameter combination whose fit fails is scored 0 instead of raising
)
grid_result = grid_search.fit(X, y)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

In [None]:
# Write submission.csv from the fitted grid search object; submit() calls its
# predict / predict_proba methods.
submit(grid_search)