# Create the dataset

In [1]:
import numpy as np
import pandas as pd

# Preprocessing data
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import zscore

# Creating model
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
# Transformations
from sklearn.preprocessing import StandardScaler
# Models
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

# Other
import datetime

## Import data

In [2]:
start_data = pd.read_csv('./datos_train_test_sh.csv',delimiter=',',decimal='.')
validate_data = pd.read_csv('./nuevas_instancias_a_predecir.csv',delimiter=';',decimal='.')

start_data.head()

Unnamed: 0,id,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,clase
0,0,154,4.5,4.75,23.52,Present,43,25.76,0.0,53,1
1,1,124,1.04,2.84,16.42,Present,46,20.17,0.0,61,0
2,2,148,12.2,3.79,34.15,Absent,57,26.38,14.4,57,1
3,3,110,4.64,4.55,30.46,Absent,48,30.9,15.22,46,0
4,4,164,0.5,6.95,39.64,Present,47,41.76,3.81,46,1


## Pre-process data

### Turn categorical columns to dummy variables

Famhist is proved to be the only categorical datapoint

In [3]:
def addDummyVariable (data, col):
  ohe = OneHotEncoder()
  # Create an array of binary tupples that contain a zero and a one 
  dummy_array = ohe.fit_transform(data[col].values.reshape(-1, 1)).toarray()
  # Create a dataframe from the array
  labels = col + "_" + data[col].unique()
  dummy_df = pd.DataFrame(dummy_array,columns=labels)
  # Add dummy colums to dataset
  df = pd.concat([data,dummy_df],axis=1)
  df = df.drop(col,axis=1)
  return df

start_data = addDummyVariable(start_data, 'famhist')
validate_data = addDummyVariable(validate_data, 'famhist')

# Sort columns
start_data = start_data.reindex(columns=['id','sbp','tobacco','ldl','adiposity','typea','famhist_Absent','famhist_Present','obesity','alcohol','age','clase'])
validate_data = validate_data.reindex(columns=['id','sbp','tobacco','ldl','adiposity','typea','famhist_Absent','famhist_Present','obesity','alcohol','age'])

## Depuración

### Eliminación de valores extremos

In [4]:
start_data = start_data[start_data['sbp'] < 190]

In [5]:
def changeToZScore(df):
  df['sbp_z_score'] = zscore(df['sbp'])
  df = df.drop('sbp', axis=1)
  df['tobacco_z_score'] = zscore(df['tobacco'])
  df = df.drop('tobacco', axis=1)
  df['ldl_z_score'] = zscore(df['ldl'])
  df = df.drop('ldl', axis=1)
  df['obesity_z_score'] = zscore(df['obesity'])
  df = df.drop('obesity', axis=1)
  df['alcohol_z_score'] = zscore(df['alcohol'])
  df = df.drop('alcohol', axis=1)
  return df

start_data = changeToZScore(start_data) 
validate_data = changeToZScore(validate_data) 

validate_data


Unnamed: 0,id,adiposity,typea,famhist_Absent,famhist_Present,age,sbp_z_score,tobacco_z_score,ldl_z_score,obesity_z_score,alcohol_z_score
0,1,36.57,57,1.0,0.0,49,-0.259747,-0.842299,-0.303587,1.252206,0.018229
1,2,16.64,42,0.0,1.0,20,-0.664865,-0.380426,-0.499181,-0.964368,-0.264588
2,3,27.68,48,0.0,1.0,26,3.082475,-0.708869,3.208014,0.665544,2.315034
3,4,26.48,48,1.0,0.0,27,-0.462306,-0.267524,-0.913114,-0.513089,1.126148
4,5,21.36,61,1.0,0.0,31,-0.462306,-0.175149,0.046663,-1.219207,-0.659628
...,...,...,...,...,...,...,...,...,...,...,...
111,112,37.83,63,0.0,1.0,64,0.753048,2.283043,-0.362720,2.303420,-0.502591
112,113,26.08,47,1.0,0.0,49,-0.158467,0.722938,-0.030665,-0.550253,-0.620840
113,114,13.00,50,1.0,0.0,16,-1.829579,-0.719133,1.101963,-1.612085,-0.464179
114,115,20.41,58,0.0,1.0,39,-0.158467,-0.560043,-0.590155,-0.359123,0.843331


# Get training and testing sets

In [6]:
X = start_data.drop("clase", axis = 1)
y = start_data[["clase"]]

# Balance classes
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Create model

## Transformations set



In [7]:
standardScaler = StandardScaler()

smote = SMOTE(random_state=42)

## Algorithms set

In [8]:
decisionTreeClassifier = tree.DecisionTreeClassifier()

logisticRegression = LogisticRegression(random_state=0)

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

logisticRegression_A = LogisticRegression(random_state=0)

models = [
  decisionTreeClassifier,
  logisticRegression,
  rf_classifier
]

## Create pipeline and fit model

In [9]:
pipe = Pipeline(
    steps=[
      ("scaler", standardScaler),
      # ("smote", smote),
      # ("tree", decisionTreeClassifier),
      # ("random forest", rf_classifier)
      ("logistic", logisticRegression)
    ]
)

pipe.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


## Create prediction 

In [10]:
prediction = pipe.predict(X_test)
prediction

array([1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       1], dtype=int64)

## Metrics

In [11]:
print("accuracy:", accuracy_score(prediction, y_test))
print("roc auc: ", roc_auc_score(y_test, prediction))

# Display classification report
print("Classification Report:")
print(classification_report(y_test, prediction))


accuracy: 0.7387387387387387
roc auc:  0.7386141834743006
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.74      0.75        58
           1       0.72      0.74      0.73        53

    accuracy                           0.74       111
   macro avg       0.74      0.74      0.74       111
weighted avg       0.74      0.74      0.74       111



# Compare models

In [12]:
pipes = [
  rf_classifier,
  logisticRegression,
  decisionTreeClassifier,
]#

for pipe in pipes:
  pipe.fit(X_train, y_train)

  print("accuracy:", accuracy_score(prediction, y_test))
  print("roc auc: ", roc_auc_score(y_test, prediction))

  return fit_method(estimator, *args, **kwargs)


accuracy: 0.7387387387387387
roc auc:  0.7386141834743006
accuracy: 0.7387387387387387
roc auc:  0.7386141834743006
accuracy: 0.7387387387387387
roc auc:  0.7386141834743006


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Test validation set

In [13]:
pipe.predict(validate_data)

array([0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 1], dtype=int64)

## Get submit csv

In [14]:
validate_prediction = pipe.predict(validate_data)

date = datetime.datetime.now().strftime("%I-%M%p-%B-%d-%Y")
filename = 'prediction-results/logistic-regression-smote-' + date + '.csv'

prediction_df = pd.DataFrame(validate_prediction)
prediction_df.index = prediction_df.index + 1
prediction_df.columns = ['Predicted']
prediction_df.index.names = ['id']
prediction_df.to_csv(filename,sep=',')