In [None]:
# -*- coding: utf-8 -*-
"""Assignment 1 - ML&DM

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1TYATD6rqNwHFUUp48g1C08Z_fxje0c32
"""

import pandas as pd
import numpy as np
import sys

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, cross_val_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Command-line interface: <data csv> <classifier code> [<config csv>]
dataPath, classifier = sys.argv[1], sys.argv[2]

# The optional third argument is a CSV whose first row supplies the
# hyper-parameters read by the dispatch code at the bottom of the file.
if len(sys.argv) > 3:
    configPath = sys.argv[3]
    config = pd.read_csv(configPath)

# Data set on which the selected action is run.
df1 = pd.read_csv(dataPath)


#classifier = 'NN'
#config = pd.read_csv('/content/drive/My Drive/ML/config.csv')
#df1 = pd.read_csv('/content/drive/My Drive/ML/breast-cancer-wisconsin.csv')

#df2 = pd.read_csv('/content/drive/My Drive/ML/breast-cancer-wisconsin-normalised.csv')

def getAccScore(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``score(X, y)``.
        X_train: Features used for fitting.
        X_test: Features used for scoring.
        y_train: Labels used for fitting.
        y_test: Labels used for scoring.

    Returns:
        The value returned by ``model.score(X_test, y_test)``.
    """
    model.fit(X_train, y_train)
    held_out_score = model.score(X_test, y_test)
    return held_out_score

def stratKFold(model, X, y):
    """Score ``model`` with 10-fold stratified cross-validation.

    The folds are shuffled with a fixed seed (``random_state=0``), so the
    same splits are produced on every call.

    Args:
        model: Estimator handed to ``getAccScore`` for every fold.
        X: Feature table; rows are selected positionally through ``.iloc``.
        y: Labels aligned with ``X``; also selected through ``.iloc``.

    Returns:
        list: One score per fold, in fold order.
    """
    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    return [
        getAccScore(
            model,
            X.iloc[fit_rows],
            X.iloc[eval_rows],
            y.iloc[fit_rows],
            y.iloc[eval_rows],
        )
        for fit_rows, eval_rows in splitter.split(X, y)
    ]

def kNNClassifier(X, y, k=1):
    """Cross-validate a k-nearest-neighbours classifier.

    Args:
        X: Feature table forwarded to ``stratKFold``.
        y: Labels forwarded to ``stratKFold``.
        k: Value passed as ``n_neighbors``; defaults to 1.

    Returns:
        tuple: ``(fold_scores, mean_score)`` - the per-fold scores and their
        arithmetic mean.
    """
    fold_scores = stratKFold(KNeighborsClassifier(n_neighbors=k), X, y)
    average = sum(fold_scores) / len(fold_scores)
    return fold_scores, average

def logregClassifier(X, y):
    """Cross-validate a logistic-regression classifier (``random_state=0``).

    Args:
        X: Feature table forwarded to ``stratKFold``.
        y: Labels forwarded to ``stratKFold``.

    Returns:
        tuple: ``(fold_scores, mean_score)`` - the per-fold scores and their
        arithmetic mean.
    """
    fold_scores = stratKFold(LogisticRegression(random_state=0), X, y)
    average = sum(fold_scores) / len(fold_scores)
    return fold_scores, average

def nbClassifier(X, y):
    """Cross-validate a Gaussian naive Bayes classifier.

    Args:
        X: Feature table forwarded to ``stratKFold``.
        y: Labels forwarded to ``stratKFold``.

    Returns:
        tuple: ``(fold_scores, mean_score)`` - the per-fold scores and their
        arithmetic mean.
    """
    fold_scores = stratKFold(GaussianNB(), X, y)
    average = sum(fold_scores) / len(fold_scores)
    return fold_scores, average

def dtClassifier(X, y):
    """Cross-validate a decision tree using the entropy split criterion.

    The tree is created with ``random_state=0``.

    Args:
        X: Feature table forwarded to ``stratKFold``.
        y: Labels forwarded to ``stratKFold``.

    Returns:
        tuple: ``(fold_scores, mean_score)`` - the per-fold scores and their
        arithmetic mean.
    """
    entropy_tree = DecisionTreeClassifier(criterion='entropy', random_state=0)
    fold_scores = stratKFold(entropy_tree, X, y)
    average = sum(fold_scores) / len(fold_scores)
    return fold_scores, average

def bagDTClassifier(X, y, n_estimators=10, max_samples=10, max_depth=10):
    """Cross-validate a bagging ensemble of entropy decision trees.

    Args:
        X: Feature table forwarded to ``stratKFold``.
        y: Labels forwarded to ``stratKFold``.
        n_estimators: Passed to ``BaggingClassifier``; defaults to 10.
        max_samples: Passed to ``BaggingClassifier``; defaults to 10.
        max_depth: Depth limit of each base tree; defaults to 10.

    Returns:
        tuple: ``(fold_scores, mean_score)`` - the per-fold scores and their
        arithmetic mean.
    """
    base_tree = DecisionTreeClassifier(
        random_state=0,
        max_depth=max_depth,
        criterion='entropy',
    )
    # The base estimator is passed positionally, exactly as before.
    ensemble = BaggingClassifier(
        base_tree,
        n_estimators=n_estimators,
        max_samples=max_samples,
        bootstrap=True,
        random_state=0,
    )
    fold_scores = stratKFold(ensemble, X, y)
    average = sum(fold_scores) / len(fold_scores)
    return fold_scores, average

def adaDTClassifier(X, y, n_estimators=10, learning_rate=0.01, max_depth=10):
    """Cross-validate an AdaBoost ensemble of entropy decision trees.

    Args:
        X: Feature table forwarded to ``stratKFold``.
        y: Labels forwarded to ``stratKFold``.
        n_estimators: Passed to ``AdaBoostClassifier``; defaults to 10.
        learning_rate: Passed to ``AdaBoostClassifier``; defaults to 0.01.
        max_depth: Depth limit of each base tree; defaults to 10.

    Returns:
        tuple: ``(fold_scores, mean_score)`` - the per-fold scores and their
        arithmetic mean.
    """
    base_tree = DecisionTreeClassifier(
        random_state=0,
        max_depth=max_depth,
        criterion='entropy',
    )
    # The base estimator is passed positionally, exactly as before.
    booster = AdaBoostClassifier(
        base_tree,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=0,
    )
    fold_scores = stratKFold(booster, X, y)
    average = sum(fold_scores) / len(fold_scores)
    return fold_scores, average

def gbClassifier(X, y, n_estimators=10, learning_rate=0.01):
    """Cross-validate a gradient-boosting classifier (``random_state=0``).

    Args:
        X: Feature table forwarded to ``stratKFold``.
        y: Labels forwarded to ``stratKFold``.
        n_estimators: Passed to ``GradientBoostingClassifier``; defaults
            to 10.
        learning_rate: Passed to ``GradientBoostingClassifier``; defaults
            to 0.01.

    Returns:
        tuple: ``(fold_scores, mean_score)`` - the per-fold scores and their
        arithmetic mean.
    """
    booster = GradientBoostingClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=0,
    )
    fold_scores = stratKFold(booster, X, y)
    average = sum(fold_scores) / len(fold_scores)
    return fold_scores, average



def bestLinClassifier(X, y):
    """Grid-search a linear-kernel SVM and print the outcome.

    The data is divided by a stratified ``train_test_split`` (seed 0). A
    ``GridSearchCV`` over ``C`` and ``gamma`` with 10-fold stratified
    cross-validation is fitted on the training part. Four lines are printed:
    the best ``C``, the best ``gamma``, the best cross-validation score and
    the score on the held-out part (both scores with four decimals).

    NOTE(review): scikit-learn documents ``gamma`` as the coefficient of the
    'rbf', 'poly' and 'sigmoid' kernels, so with ``kernel='linear'`` it
    presumably does not influence the fit - confirm that keeping it in the
    grid is intentional.

    Args:
        X: Feature table.
        y: Labels aligned with ``X``; also used for stratification.

    Returns:
        None. Results are written to standard output.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=0)

    candidate_values = [0.001, 0.01, 0.1, 1, 10, 100]
    param_grid = [{'C': list(candidate_values),
                   'gamma': list(candidate_values)}]

    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    grid = GridSearchCV(
        SVC(kernel='linear'),
        param_grid,
        refit=True,
        cv=folds,
        return_train_score=True,
    )
    grid.fit(X_train, y_train)

    winner = grid.best_params_
    print(winner.get('C'))
    print(winner.get('gamma'))

    # Mean CV score of the best setting, then accuracy on the held-out part.
    print('%.4f' % grid.best_score_)
    print('%.4f' % grid.score(X_test, y_test))




def bestRFClassifier(X, y):
    """Grid-search an entropy random forest and print the outcome.

    The data is divided by a stratified ``train_test_split`` (seed 0). A
    ``GridSearchCV`` over ``n_estimators``, ``max_features`` and
    ``max_leaf_nodes`` with 10-fold stratified cross-validation is fitted on
    the training part. Five lines are printed: the three best parameter
    values (in that order), the best cross-validation score and the score on
    the held-out part (both scores with four decimals).

    NOTE(review): ``'auto'`` for ``max_features`` appears to have been
    removed from newer scikit-learn releases - verify against the installed
    version before upgrading.

    Args:
        X: Feature table.
        y: Labels aligned with ``X``; also used for stratification.

    Returns:
        None. Results are written to standard output.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=0)

    param_grid = [{
        'n_estimators': [10, 20, 50, 100],
        'max_features': ['auto', 'sqrt', 'log2'],
        'max_leaf_nodes': [10, 20, 30],
    }]

    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    forest = RandomForestClassifier(criterion='entropy', random_state=0)
    grid = GridSearchCV(forest, param_grid, cv=folds, return_train_score=True)
    grid.fit(X_train, y_train)

    winner = grid.best_params_
    print(winner.get('n_estimators'))
    print(winner.get('max_features'))
    print(winner.get('max_leaf_nodes'))

    # Mean CV score of the best setting, then accuracy on the held-out part.
    print('%.4f' % grid.best_score_)
    print('%.4f' % grid.score(X_test, y_test))




def pre_process(df):
    """Clean, impute and min-max scale the raw data, printing it as CSV rows.

    Steps:
      1. The labels ``'class1'`` / ``'class2'`` become ``'0'`` / ``'1'``.
      2. ``'?'`` placeholders become NaN and are filled in by a
         ``SimpleImputer`` with its default strategy.
      3. Every column is rescaled by ``MinMaxScaler`` and rounded to four
         decimals.
      4. Each row is printed as its comma-separated feature values (four
         decimals) followed by the class value (no decimals). No header
         line is printed.

    The caller's frame is left untouched; all work happens on a copy.

    Args:
        df: Raw DataFrame containing a ``'class'`` column.

    Returns:
        DataFrame with the same index and columns as ``df`` holding the
        imputed, scaled and rounded values.
    """
    # Work on a copy so the caller's DataFrame is not mutated.
    df = df.copy()
    df['class'] = df['class'].replace({'class1': '0', 'class2': '1'})

    # ``np.nan`` is the canonical spelling; the ``np.NaN`` alias is gone in
    # NumPy 2.0.
    df = df.replace('?', np.nan)

    imputer = SimpleImputer(missing_values=np.nan)
    imputed = pd.DataFrame(
        imputer.fit_transform(df), index=df.index, columns=df.columns)

    scaled = MinMaxScaler().fit_transform(imputed)
    df1 = pd.DataFrame(
        scaled, index=imputed.index, columns=imputed.columns).round(4)

    # Select the label by name so it always matches the column excluded
    # from the features, wherever that column sits in the frame.
    X = df1.iloc[:, df1.columns != 'class']
    y = df1['class']

    # One output line per row. Building the whole line first avoids the
    # removed ``DataFrame.iteritems`` and a print call per cell.
    for (_, row), label in zip(X.iterrows(), y):
        fields = ['%.4f' % value for value in row]
        fields.append('%.0f' % label)
        print(','.join(fields))

    return df1

def split_data(df1):
    """Separate features from labels and make a stratified train/test split.

    Features are every column not named ``'class'``; the labels are taken
    from the last column (assumes the class column is last - TODO confirm).
    The split is stratified on the labels with ``random_state=0``.

    Args:
        df1: DataFrame holding the features and the labels.

    Returns:
        tuple: ``(X, y, X_train, X_test, y_train, y_test)``.
    """
    feature_mask = df1.columns != 'class'
    X = df1.iloc[:, feature_mask]
    y = df1.iloc[:, -1]

    # train_test_split yields X_train, X_test, y_train, y_test in that order.
    parts = train_test_split(X, y, stratify=y, random_state=0)
    return (X, y, *parts)

# ---------------------------------------------------------------------------
# Script body: run the action selected by the ``classifier`` code.
#
#   NN, LR, NB, DT, BAG, ADA, GB -> 10-fold cross-validation; the mean score
#                                   is printed with four decimals
#   RF, SVM                      -> grid search; results are printed by the
#                                   called function itself
#   P                            -> pre-processing; rows are printed by
#                                   pre_process
# ---------------------------------------------------------------------------

# Stays None unless one of the cross-validated classifiers ran.
mean_score = None
# 1 marks an unrecognised classifier code (kept for backward compatibility).
to_round = 0

# Features are every column not named 'class'; labels are the last column.
X = df1.iloc[:, df1.columns != 'class']
y = df1.iloc[:, -1]

# ``config`` only exists when a third command-line argument was supplied.
has_config = len(sys.argv) > 3

if classifier == 'NN':
    if has_config:
        scores, mean_score = kNNClassifier(X, y, config['K'].iloc[0])
    else:
        scores, mean_score = kNNClassifier(X, y)

elif classifier == 'LR':
    scores, mean_score = logregClassifier(X, y)

elif classifier == 'NB':
    scores, mean_score = nbClassifier(X, y)

elif classifier == 'DT':
    scores, mean_score = dtClassifier(X, y)

elif classifier == 'BAG':
    if has_config:
        scores, mean_score = bagDTClassifier(
            X, y,
            config['n_estimators'].iloc[0],
            config['max_samples'].iloc[0],
            config['max_depth'].iloc[0])
    else:
        scores, mean_score = bagDTClassifier(X, y)

elif classifier == 'ADA':
    if has_config:
        scores, mean_score = adaDTClassifier(
            X, y,
            config['n_estimators'].iloc[0],
            config['learning_rate'].iloc[0],
            config['max_depth'].iloc[0])
    else:
        scores, mean_score = adaDTClassifier(X, y)

elif classifier == 'GB':
    if has_config:
        scores, mean_score = gbClassifier(
            X, y,
            config['n_estimators'].iloc[0],
            config['learning_rate'].iloc[0])
    else:
        scores, mean_score = gbClassifier(X, y)

elif classifier == 'RF':
    bestRFClassifier(X, y)

elif classifier == 'SVM':
    bestLinClassifier(X, y)

elif classifier == 'P':
    df = pre_process(df1)

else:
    print('Enter a valid classifier')
    to_round = 1

# Only the cross-validated classifiers set mean_score. The builtin round()
# works for both NumPy scalars and plain Python floats, whereas the former
# ``mean_score.round(4)`` exists only on NumPy scalars.
if mean_score is not None:
    print("%.4f" % round(mean_score, 4))

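# Captured notebook output. The five lines match what bestRFClassifier
# prints: best n_estimators, best max_features, best max_leaf_nodes,
# best cross-validation score, score on the held-out split.
# 20
# auto
# 30
# 0.9628
# 0.9943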
