# Clasification Project

## Step 1 loading packages

In [2]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC, LinearSVC
import pandas as pd, numpy as np
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer

In [3]:
from utils.utils import setup_project_root
setup_project_root()

WindowsPath('C:/Users/zak/Projects/PycharmProjects/data-science')

# Step 2 - Make a sample of the data

In [4]:
df = pd.read_csv(r"data\diabetes_2.csv")

In [5]:
df.columns

Index(['pregnancies', 'glucose', 'diastolic', 'triceps', 'insulin', 'bmi',
       'dpf', 'age', 'diabetes'],
      dtype='object')

In [6]:
sample_df = df.sample(frac=1/3)

## Step 3 - split and scale the data

In [7]:
X = sample_df.drop('diabetes', axis=1)
y = sample_df['diabetes']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Step 4 - Feature selection

In [8]:
rfe_gb = RFE(estimator=GradientBoostingClassifier(), n_features_to_select=5, step=1, verbose=0)
rfe_rf = RFE(estimator=RandomForestClassifier(), n_features_to_select=5, step=1, verbose=0)
rfe_xtree =RFE(estimator=ExtraTreesClassifier(), n_features_to_select=5, step=1, verbose=0)
rfe_log= RFE(estimator=LogisticRegression(), n_features_to_select=5, step=1, verbose=0)
rfe_lasso = RFE(estimator=LogisticRegression(solver='liblinear', penalty='l1', max_iter=9**4),n_features_to_select=5, step=1,verbose=0)
masks= {}
for i,model in enumerate([rfe_gb,rfe_rf,rfe_xtree,rfe_log, rfe_lasso]):
    print(model)
    model.fit(X_train, y_train)
    masks[f'{i}'] = model.support_
selections = np.sum(list(masks.values()), axis=0)
meta_mask = selections >=4
X_reduced = X.loc[:,meta_mask]


RFE(estimator=GradientBoostingClassifier(), n_features_to_select=5)
RFE(estimator=RandomForestClassifier(), n_features_to_select=5)
RFE(estimator=ExtraTreesClassifier(), n_features_to_select=5)
RFE(estimator=LogisticRegression(), n_features_to_select=5)
RFE(estimator=LogisticRegression(max_iter=6561, penalty='l1',
                                 solver='liblinear'),
    n_features_to_select=5)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Step 5 - Model the data with all x values

In [9]:
svm = SVC(gamma=1)
lasso = LogisticRegression(solver='liblinear', penalty='l1', max_iter=9**4)
log_reg = LogisticRegression()
log_ref_l2 = LogisticRegression(solver='liblinear', penalty='l2', random_state=42)
knn = KNeighborsClassifier()
lsvc = LinearSVC()
forest = RandomForestClassifier()
titles = ('SVM', 'Lasso', 'Logistic Regression Standard',
          'L2 Logistic Regression','KNN', 'Linear SVC', 'Forest')
print('Results with all X values\n')
for model, title in zip([svm,lasso, log_reg, log_ref_l2, knn,lsvc, forest], titles):
    model.fit(X_train_scaled, y_train)
    print(f'Model: {title}\nScore: {model.score(X_test_scaled, y_test)}\n\n')


Results with all X values

Model: SVM
Score: 0.65625


Model: Lasso
Score: 0.765625


Model: Logistic Regression Standard
Score: 0.765625


Model: L2 Logistic Regression
Score: 0.765625


Model: KNN
Score: 0.75


Model: Linear SVC
Score: 0.765625


Model: Forest
Score: 0.734375




## Step 6 - Model the data with reduced variables

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, stratify=y, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
svm = SVC()
lasso = LogisticRegression(solver='liblinear', penalty='l1', max_iter=9**4, random_state=42)
log_reg = LogisticRegression(random_state=42)
log_ref_l2 = LogisticRegression(solver='liblinear', penalty='l2', random_state=42)
knn = KNeighborsClassifier()
lsvc = LinearSVC(random_state=42)
forest = RandomForestClassifier(random_state=42)
titles = ('SVM', 'Lasso', 'Logistic Regression Standard',
          'L2 Logistic Regression','KNN', 'Linear SVC', 'Forest')
print('Results with reduced X values\n')
for model, title in zip([svm,lasso, log_reg, log_ref_l2, knn,lsvc, forest], titles):
    model.fit(X_train_scaled, y_train)
    print(f'Model: {title}\nScore: {model.score(X_test_scaled, y_test)}\n\n')
    


Results with reduced X values

Model: SVM
Score: 0.734375


Model: Lasso
Score: 0.765625


Model: Logistic Regression Standard
Score: 0.765625


Model: L2 Logistic Regression
Score: 0.765625


Model: KNN
Score: 0.703125


Model: Linear SVC
Score: 0.765625


Model: Forest
Score: 0.765625




## Cross validation


In [11]:
import warnings
warnings.filterwarnings('ignore')  # Suppress all warnings

from sklearn.model_selection import KFold
print('Pre-tune Scores')
for model, title in zip([svm,lasso, log_reg, log_ref_l2, knn,lsvc, forest], titles):
    if 1 < 2:

        model.fit(X_train_scaled, y_train)
        kf = KFold(n_splits=10, shuffle=True, random_state=1111)
        acc_cv = cross_val_score(estimator=model, X = X_train_scaled, y=y_train, cv = kf, scoring = make_scorer(accuracy_score))

        print(f'Model: {title}\nPrecision Score: {acc_cv.mean()}\n\n')

print('Tuned Scores')

model_params = {
'KNN':{ 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']},

'Forest':{
'n_estimators': [100, 200],
'max_depth': [None, 10, 20],
'min_samples_split': [2, 5],
'min_samples_leaf': [1, 2],
'bootstrap': [True, False]
},

'SVM':{'C': [0.1, 1, 10, 100, 1000], 
'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
'kernel': ['rbf']} ,
'Linear SVC':{'C': [0.1, 1, 10, 100, 1000], 
'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
'kernel': ['rbf']} ,

'Logistic Regression Standard': 
{'penalty':['l1','l2','elasticnet',None],
'C' : np.logspace(-4,4,20),
'solver': ['lbfgs','newton-cg','liblinear','sag','saga'],
'max_iter'  : [100,1000,2500,5000]
},

'Gradient Boosting':{
'n_estimators': [50, 100, 200],
'learning_rate': [0.01, 0.1, 0.2],
'max_depth': [3, 5, 7],
}

            }
svm = SVC()
log_reg = LogisticRegression()
knn = KNeighborsClassifier()
gb=GradientBoostingClassifier()
for model, title in zip([svm,log_reg, knn, gb], ['SVM', 'Logistic Regression Standard', 'KNN', 'Gradient Boosting']):
    model.fit(X_train_scaled, y_train)
    """
    kf = KFold(n_splits=10, shuffle=True, random_state=1111)
    acc_cv = cross_val_score(estimator=model, X = X_train_scaled, y=y_train, cv = kf, scoring = make_scorer(accuracy_score))
    """
    
    best_model = GridSearchCV(model,param_grid = model_params[title], cv = 3, verbose=False,n_jobs=-1)
    best_model.fit(X_train_scaled,y_train)
    
    acc_cv = cross_val_score(estimator=best_model, X = X_train_scaled, y=y_train, cv = kf, scoring = make_scorer(accuracy_score))
    print(best_model.score(X_test_scaled, y_test))
    print(f'Model: {title}\nAccuracy Score: {acc_cv.mean()}\n\n')
    

Pre-tune Scores
Model: SVM
Precision Score: 0.7226315789473684


Model: Lasso
Precision Score: 0.7694736842105263


Model: Logistic Regression Standard
Precision Score: 0.7694736842105263


Model: L2 Logistic Regression
Precision Score: 0.7694736842105263


Model: KNN
Precision Score: 0.7484210526315789


Model: Linear SVC
Precision Score: 0.7694736842105263


Model: Forest
Precision Score: 0.7326315789473684


Tuned Scores
0.765625
Model: SVM
Accuracy Score: 0.743421052631579


0.765625
Model: Logistic Regression Standard
Accuracy Score: 0.7694736842105263


0.734375
Model: KNN
Accuracy Score: 0.7121052631578948


0.734375
Model: Gradient Boosting
Accuracy Score: 0.7278947368421053




In [12]:
best_model

0,1,2
,estimator,GradientBoostingClassifier()
,param_grid,"{'learning_rate': [0.01, 0.1, ...], 'max_depth': [3, 5, ...], 'n_estimators': [50, 100, ...]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,3
,verbose,False
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,loss,'log_loss'
,learning_rate,0.01
,n_estimators,200
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [13]:
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(log_model,param_grid = param_grid, cv = 3, verbose=True,n_jobs=-1)
clf

NameError: name 'log_model' is not defined

## Tuning

In [33]:


param_dist = {"max_depth": [2, 4, 6, 8],
              "max_features": [2, 4, 6, 8, 10],
              "min_samples_split": [2, 4, 8, 16]}

# Use KFold
kf = KFold(n_splits=5, shuffle=True, random_state=1111)
tuning_dictionary = {''}

In [None]:
X