# HW2 - Modeling and Classification

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files

## Loading the dataset

In [2]:
# Raw splits and helper tables are read directly from the course GitHub repository.
train_raw = pd.read_csv('https://raw.githubusercontent.com/sivanyo/ML-HW2/main/train_modify_file.csv')
validation_raw = pd.read_csv('https://raw.githubusercontent.com/sivanyo/ML-HW2/main/validation_modify_file%20.csv')
# NOTE(review): this URL is the same train_modify_file.csv used for train_raw, so the
# "test" split is a second copy of the training data — looks like a copy-paste slip;
# confirm the intended test file name before trusting any test-set accuracy.
test_raw = pd.read_csv('https://raw.githubusercontent.com/sivanyo/ML-HW2/main/train_modify_file.csv')
# Per-column statistics (mean, median, min, max, outlier fences), indexed by column name.
transformation_helper = pd.read_csv('https://raw.githubusercontent.com/sivanyo/ML-HW2/main/transformation_helper%20.csv', index_col=0)
virus_test_raw = pd.read_csv('https://raw.githubusercontent.com/sivanyo/ML-HW2/main/virus_test.csv')
virus_hw1_file_raw = pd.read_csv('https://raw.githubusercontent.com/sivanyo/ML-HW2/main/virus_hw1.csv')  # used only for debug

## Setting the data

In [3]:
# The three label columns every classifier is trained to predict.
targets_col = ['Virus','SpreadLevel', 'Risk']
# Symptoms that get their own indicator column extracted from the free-text illness form.
illness_att = ['New_loss_of_taste_or_smell', 'Shortness_of_breath']
# Raw feature columns kept from the input files.
tmp = [
    'AgeGroup', 'BMI', 'ConversatiosPerDay', 'DisciplineScore', 'HappinessScore',
    'HouseholdExpenseOnPresents', 'HouseholdExpenseOnSocialGames',
    'HouseholdExpenseParkingTicketsPerYear', 'MedicalCarePerYear',
    'PCR_10', 'PCR_17', 'PCR_19', 'PCR_7', 'PCR_72', 'PCR_76', 'PCR_8', 'PCR_89', 'PCR_9', 'PCR_93', 'PCR_95',
    'StudingPerDay', 'Self_declaration_of_Illness_Form', 'SocialMediaPerDay',
]

# Raw features followed by the labels (used to slice the labelled raw files).
original_col = tmp + targets_col

# ID plus raw features (used to slice the unlabelled file).
original_col_with_id_without_targets = ['ID'] + tmp

# Model features: the raw features with the illness form replaced by its indicator
# columns, in alphabetical order.
att_col = sorted(
    [name for name in tmp if name != 'Self_declaration_of_Illness_Form']
    + ['Is_having_' + illness for illness in illness_att]
)

# Model features followed by the labels.
all_cols = att_col + targets_col


In [4]:
def modify_Self_declaration_of_Illness_Form(ds):
  """Replace the free-text illness form with one indicator column per tracked symptom.

  For every name in `illness_att` a float column ``Is_having_<name>`` is inserted right
  after the form column; it holds 0.1 when the form text contains the symptom name and
  0 otherwise (including non-string cells such as NaN). The form column itself is then
  dropped. `ds` is modified in place; nothing is returned.
  """
  form_col = 'Self_declaration_of_Illness_Form'
  for illness in illness_att:
    def symptom_flag(cell, symptom=illness):
      # Only genuine strings can mention a symptom; everything else counts as "absent".
      if type(cell) == str and symptom in cell:
        return 0.1
      return 0

    flags = ds[form_col].apply(symptom_flag).astype(float)
    insert_at = ds.columns.get_loc(form_col) + 1
    ds.insert(loc=insert_at, column="Is_having_" + illness, value=flags)
  ds.drop(columns=form_col, inplace=True)


**Data Transformation for unseen data**

In [5]:
def filling_missing_data(dataset):
  """Fill missing values using statistics stored in `transformation_helper`.

  Each listed column is filled with either its 'mean' or its 'median' entry from
  `transformation_helper`. The frame is updated column by column and returned.
  """
  fill_plan = {
    'mean': ['BMI', 'ConversatiosPerDay', 'DisciplineScore', 'HouseholdExpenseOnPresents','PCR_7', 'PCR_72', 'PCR_89'],
    'median': ['HappinessScore', 'HouseholdExpenseOnSocialGames','MedicalCarePerYear','PCR_19', 'PCR_95'],
  }
  for statistic, columns in fill_plan.items():
    for col_name in columns:
      replacement = transformation_helper.loc[col_name, statistic]
      dataset[col_name] = dataset[col_name].fillna(replacement)

  return dataset

In [6]:
def handle_outliers(dataset):
  """Replace missing and out-of-fence values with the stored median of their column.

  The fences ('low outlier', 'high outlier') and the 'median' come from
  `transformation_helper`. Every raw feature except the illness form and ID is
  processed. The frame is updated column by column and returned.
  """
  skipped = ("Self_declaration_of_Illness_Form", "ID")
  feature_cols = [name for name in original_col_with_id_without_targets if name not in skipped]
  for col in feature_cols:
    lower = transformation_helper.loc[col, 'low outlier']
    upper = transformation_helper.loc[col, 'high outlier']
    median = transformation_helper.loc[col, 'median']

    def pull_to_median(value, lower=lower, upper=upper, median=median):
      if pd.isnull(value):
        return median
      if value < lower or value > upper:
        return median
      return value

    dataset[col] = dataset[col].apply(pull_to_median)
  return dataset

In [7]:
def normalize_all_data(dataset):
  """Min-max scale every raw feature using the bounds stored in `transformation_helper`.

  Each feature column (all raw features except the illness form and ID) becomes
  ``(value - min) / (max - min)``, with 'min' and 'max' read from `transformation_helper`
  rather than from `dataset` itself. The frame is updated column by column and returned.
  """
  skipped = ("Self_declaration_of_Illness_Form", "ID")
  for col in original_col_with_id_without_targets:
    if col in skipped:
      continue
    # Named col_min/col_max so the builtins min()/max() are not shadowed.
    col_min = transformation_helper.loc[col, 'min']
    col_max = transformation_helper.loc[col, 'max']
    # Whole-column arithmetic gives the same values as a per-element apply.
    dataset[col] = (dataset[col] - col_min) / (col_max - col_min)
  return dataset

In [8]:
def data_transformation(dataset):
  """Apply the full preprocessing pipeline to unseen, unlabelled data.

  Steps: keep ID plus the raw feature columns, expand the illness form into indicator
  columns, fill missing values, replace outliers, min-max normalise, then drop any row
  that still has a missing value.

  The caller's frame is left untouched: the work happens on an explicit copy of the
  column selection, so the in-place steps never write into a slice of the input.
  """
  dataset = dataset[original_col_with_id_without_targets].copy()
  modify_Self_declaration_of_Illness_Form(dataset)
  filling_missing_data(dataset)
  handle_outliers(dataset)
  normalize_all_data(dataset)
  return dataset.dropna()

In [9]:
def initial_set_up(ds):
  """Prepare a labelled split for modelling.

  Expands the illness form of `ds` in place, then returns the model feature and label
  columns (`all_cols`) with every incomplete row removed.
  """
  modify_Self_declaration_of_Illness_Form(ds)
  complete_rows = ds[all_cols].dropna()
  return complete_rows

In [10]:
# Build the three modelling frames from copies of the raw files, in train/validation/test order.
train, validation, test = (
    initial_set_up(raw_split.copy()[original_col])
    for raw_split in (train_raw, validation_raw, test_raw)
)

## Part 3 – Classification

In [11]:
# Best hyperparameter / accuracy per classifier and target, as measured on the TRAIN split.
# The table is built in one step with an explicit object dtype: hyperparameters of
# different kinds (integer k / depth, float C) share a column, and object dtype keeps
# each one exactly as stored instead of upcasting the integers to float.
best_hyperparameter_train = pd.DataFrame(
    0,
    index=['knn', 'decision tree', 'svm'],
    columns=['Virus_param', 'Virus_accurcy', 'SpreadLevel_param', 'SpreadLevel_accurcy', 'Risk_param', 'Risk_accurcy', 'best_param', 'best_param_accurcy'],
    dtype=object,
)

In [12]:
# Best hyperparameter / accuracy per classifier and target, as measured on the VALIDATION split.
# The table is built in one step with an explicit object dtype: hyperparameters of
# different kinds (integer k / depth, float C) share a column, and object dtype keeps
# each one exactly as stored instead of upcasting the integers to float.
best_hyperparameter = pd.DataFrame(
    0,
    index=['knn', 'decision tree', 'svm'],
    columns=['Virus_param', 'Virus_accurcy', 'SpreadLevel_param', 'SpreadLevel_accurcy', 'Risk_param', 'Risk_accurcy', 'best_param', 'best_param_accurcy'],
    dtype=object,
)

In [13]:
# Split every frame into its feature matrix (att_col) and its label columns (targets_col).
train_examples, train_targets = train[att_col], train[targets_col]

validation_examples, validation_targets = validation[att_col], validation[targets_col]

test_examples, test_targets = test[att_col], test[targets_col]

def do_nothing(*args_list):
  """No-op callback: accepts any positional arguments and returns None."""
  return None

def calc_basic_hist(name, target, param, res, best_params):
  """Record `param` as the best one for (`name`, `target`) when `res` beats the stored accuracy.

  Args:
    name: row label of the classifier in `best_params` (e.g. 'knn').
    target: label column name; selects the '<target>_param' / '<target>_accurcy' cells.
    param: hyperparameter value that produced `res`.
    res: accuracy achieved with `param`.
    best_params: results table, updated in place.
  """
  acc_col = target + '_accurcy'
  param_col = target + '_param'
  # A single .loc[row, column] call writes into the table itself. The chained form
  # .loc[row][column] may write into a temporary row copy and silently lose the update.
  if best_params.loc[name, acc_col] < res:
    best_params.loc[name, acc_col] = res
    best_params.loc[name, param_col] = param

def run_basic_classification(name, classifer_contractor, param_list, scoring_set, scoring_set_targets, best_params, scoring_name, print_flag=True, extra_func=do_nothing):
  """Sweep one hyperparameter for a classifier family and score it on every target.

  For each value in `param_list` and each target in `targets_col`, a fresh classifier is
  built, fitted on the global training split (`train_examples` / `train_targets`) and
  scored on `scoring_set` / `scoring_set_targets`.

  Args:
    name: row label of the classifier in `best_params`; also used in the plot title.
    classifer_contractor: callable mapping a hyperparameter value to an unfitted classifier.
    param_list: hyperparameter values to try (also the x axis of the plot).
    scoring_set: feature frame the fitted models are scored on.
    scoring_set_targets: label frame matching `scoring_set`.
    best_params: results table handed to `extra_func`; when `extra_func` is
      `calc_basic_hist`, its 'best_param' / 'best_param_accurcy' cells are also updated here.
    scoring_name: name of the scoring split, used in the plot title.
    print_flag: when True, plot accuracy against the hyperparameter for each target.
    extra_func: callback invoked as extra_func(name, target, param, res, best_params)
      after every single fit.

  Returns:
    The best accuracy averaged over all targets, across the tried values.
  """
  graphs_data = {target: [] for target in targets_col}

  best_acc = 0
  best_param = 0
  for param in param_list:
    acc = 0.0
    for target in targets_col:
      classifer = classifer_contractor(param)
      classifer = classifer.fit(train_examples, train_targets[target])
      res = classifer.score(scoring_set, scoring_set_targets[target])
      acc += res
      graphs_data[target].append(res)
      extra_func(name, target, param, res, best_params)
    # Average over however many targets there are instead of a hard-coded 3.
    mean_acc = acc / len(targets_col)
    if best_acc < mean_acc:
      best_acc = mean_acc
      best_param = param
  # Single .loc[row, column] calls: the chained form .loc[row][column] = value may write
  # into a temporary copy and leave the table unchanged.
  if extra_func == calc_basic_hist and best_params.loc[name, 'best_param_accurcy'] < best_acc:
    best_params.loc[name, 'best_param_accurcy'] = best_acc
    best_params.loc[name, 'best_param'] = best_param
  if print_flag:
    x = param_list
    [a,b,c] = plt.plot(x, graphs_data['Virus'], '-r', x, graphs_data['SpreadLevel'], '-b', x, graphs_data['Risk'], '-y')
    plt.grid()
    plt.legend([a,b,c], ["Virus","SpreadLevel","Risk"], loc=1)
    plt.title(name + ' classification results on ' + scoring_name)
    plt.xlabel("hyperparameter")
    plt.ylabel("accuracy score")
    plt.show()
  return best_acc

# KNN


**Q2 - accuracies of KNN as function of k param**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Sweep k = 1..100. Both calls fit on the training split (done inside
# run_basic_classification); the first scores on train and records into
# best_hyperparameter_train, the second scores on validation and records into
# best_hyperparameter. Each call also plots accuracy against k.
run_basic_classification('knn', KNeighborsClassifier, range(1,101), train_examples, train_targets, best_hyperparameter_train ,"train", print_flag=True, extra_func=calc_basic_hist)
run_basic_classification('knn', KNeighborsClassifier, range(1,101), validation_examples, validation_targets, best_hyperparameter,"validation", print_flag=True, extra_func=calc_basic_hist)

# Decision Tree

**Q6 - accuracies of Decision Tree as a function of the max-depth param**

In [None]:
from sklearn import tree

def build_DecisionTreeClassifier(param):
  """Return an unfitted entropy-criterion decision tree whose max_depth is `param`."""
  depth_limited_tree = tree.DecisionTreeClassifier(max_depth=param, criterion='entropy')
  return depth_limited_tree

# Sweep max_depth = 2..100. Both calls fit on the training split; the first scores on
# train (recorded in best_hyperparameter_train), the second on validation (recorded in
# best_hyperparameter).
run_basic_classification('decision tree', build_DecisionTreeClassifier, range(2,101), train_examples, train_targets, best_hyperparameter_train, "train", print_flag=True, extra_func=calc_basic_hist)
run_basic_classification('decision tree', build_DecisionTreeClassifier, range(2,101), validation_examples, validation_targets, best_hyperparameter,"validation", print_flag=True, extra_func=calc_basic_hist)

**Q8 - confusion matrix**

In [None]:
# plot_confusion_matrix was removed in scikit-learn 1.2; ConfusionMatrixDisplay.from_estimator
# (available since 1.0) draws the same plot.
from sklearn.metrics import ConfusionMatrixDisplay

# Depth-13 entropy tree for the Virus target, fitted on train and evaluated on validation.
model = tree.DecisionTreeClassifier(criterion='entropy', max_depth=13)
model.fit(train_examples, train_targets['Virus'])
ConfusionMatrixDisplay.from_estimator(model, validation_examples, validation_targets['Virus'])
plt.show()

**Q9 - maximal depth = 7**

In [None]:
from sklearn.tree import plot_tree

# Depth-7 entropy tree for the Risk target, fitted on the training split.
model = tree.DecisionTreeClassifier(criterion='entropy', max_depth=7)
model.fit(train_examples, train_targets['Risk'])
fig = plt.figure(figsize=(100, 40))  # large figsize leaves enough room for everything (no overlap)
# plot_tree expects class names in the estimator's own class order (model.classes_, which is
# sorted), so take them from the fitted model; a hand-written list in another order would
# mislabel the leaves.
plot_tree(model, filled=True, feature_names=att_col, class_names=[str(label) for label in model.classes_], fontsize=14)  # large fontsize so that it stays readable
plt.show()

In [None]:
# List the feature columns and show the two (positions 6 and 10) that are used as the
# feature pair for the decision-region plots below.
print(train_examples.columns)
print(train_examples.columns[6])
print(train_examples.columns[10])

**Q10**

In [19]:
def show_decision_regions(X, y, pair,param):
  """Plot the decision regions of a decision tree trained on a pair of features.

  Args:
    X: feature frame the tree is fitted on; it is indexed by the two names in `pair`.
    y: numeric labels used for fitting and for colouring the scatter points.
    pair: the two feature names; pair[0] is drawn on the x axis, pair[1] on the y axis.
    param: column of `best_hyperparameter` whose 'decision tree' entry gives max_depth.
  """
  first_feature, second_feature = pair[0], pair[1]
  depth = best_hyperparameter[param]['decision tree']
  classifer = build_DecisionTreeClassifier(depth)
  classifer.fit(X, y)

  # Evaluation grid: one unit of margin around the data on each axis, step 0.1.
  grid_step = 0.1
  first_axis = np.arange(X[first_feature].min() - 1, X[first_feature].max() + 1, grid_step)
  second_axis = np.arange(X[second_feature].min() - 1, X[second_feature].max() + 1, grid_step)
  xx, yy = np.meshgrid(first_axis, second_axis)

  f, axarr = plt.subplots(1, 1, sharex='col', sharey='row', figsize=(10, 8))
  grid_points = np.c_[xx.ravel(), yy.ravel()]
  Z = classifer.predict(grid_points).reshape(xx.shape)

  # Filled regions show the predicted class; the points are the actual samples.
  axarr.contourf(xx, yy, Z, alpha=0.4)
  axarr.scatter(X[first_feature], X[second_feature], c=y, s=20, edgecolor='k')
  axarr.set_title('Decision Tree')
  plt.show()

In [None]:
# The two features (columns 6 and 10 of the feature frame) used for the 2-D region plots.
pair = [train_examples.columns[position] for position in (6, 10)]
# Numeric codes for the Risk labels so they can be used as scatter colours.
dic_tmp = {'low': 1, 'medium': 2, 'high': 3}

y_train_risk = train_targets['Risk'].apply(lambda label: dic_tmp[label])
y_validation_risk = validation_targets['Risk'].apply(lambda label: dic_tmp[label])

# One plot per split: the tree is fitted on, and drawn over, the given split's points.
for split_examples, split_risk in ((train_examples, y_train_risk), (validation_examples, y_validation_risk)):
  show_decision_regions(split_examples[pair], split_risk, pair, 'Risk_param')


**Q11 - ROC curve**

In [None]:
from sklearn import metrics

# Binary version of SpreadLevel: +1 for 'high', -1 for everything else.
spread_high_train = train_targets['SpreadLevel'].apply(lambda x : 1 if x=='high' else -1)
spread_high_validation = validation_targets['SpreadLevel'].apply(lambda x : 1 if x=='high' else -1)

# metrics.plot_roc_curve was removed in scikit-learn 1.2; RocCurveDisplay.from_estimator
# (available since 1.0) draws the same curve.
# int(...) guards against the stored k / depth coming back as a float from the results table.
knn = KNeighborsClassifier(int(best_hyperparameter['SpreadLevel_param']['knn']))
knn.fit(train_examples, spread_high_train)
metrics.RocCurveDisplay.from_estimator(knn, validation_examples, spread_high_validation)

dt = build_DecisionTreeClassifier(int(best_hyperparameter['SpreadLevel_param']['decision tree']))
dt.fit(train_examples, spread_high_train)
metrics.RocCurveDisplay.from_estimator(dt, validation_examples, spread_high_validation)

# SVM


**Q13 - accuracies of SVM as a function of C param**

In [None]:
from sklearn.svm import LinearSVC

def build_LinearSVC(param):
  """Return an unfitted LinearSVC with regularisation C=`param` and a fixed random_state of 0."""
  linear_svm = LinearSVC(C=param, random_state=0)
  return linear_svm

# Coarse sweep of C over 75 log-spaced values from 10**0 to 10**3. Both calls fit on the
# training split; the first scores on train, the second on validation.
run_basic_classification('svm', build_LinearSVC, np.logspace(0, 3, num=75), train_examples, train_targets, best_hyperparameter_train, "train", print_flag=True, extra_func=calc_basic_hist)
run_basic_classification('svm', build_LinearSVC, np.logspace(0, 3, num=75), validation_examples, validation_targets, best_hyperparameter,"validation", print_flag=True, extra_func=calc_basic_hist)

In [None]:
# Refine the search: for every target take 75 evenly spaced C values within +/-25 of the
# best C found so far on validation (the lower end is clamped at 0.001).
svm_param = []
for col in targets_col:
  centre = best_hyperparameter.loc['svm'][col + "_param"]
  svm_param.extend(np.linspace(max(centre - 25, 0.001), centre + 25, 75))

run_basic_classification('svm', build_LinearSVC, sorted(svm_param), train_examples, train_targets, best_hyperparameter_train, "train", print_flag=True, extra_func=calc_basic_hist)
run_basic_classification('svm', build_LinearSVC, sorted(svm_param), validation_examples, validation_targets, best_hyperparameter, "validation", print_flag=True, extra_func=calc_basic_hist)


In [None]:
# Best hyperparameters and accuracies as scored on the training split.
best_hyperparameter_train

In [None]:
# Best hyperparameters and accuracies as scored on the validation split.
best_hyperparameter

**Q15**

In [26]:
# Linear SVM configured with the best validation C for the Virus target.
clf = build_LinearSVC(best_hyperparameter['Virus_param']['svm'])

In [None]:
# Fit on the Virus target and inspect the learned weight matrix and class order.
clf.fit(train_examples, train_targets.Virus)
print('w = ',clf.coef_)
print(clf.classes_)
# NOTE(review): row 3 of coef_ is presumably the weight vector for clf.classes_[3] —
# confirm against the class order printed above.
print(clf.coef_[3])
# Horizontal bar chart of the 10 largest (signed) weights in that row, labelled by feature.
pd.Series(clf.coef_[3], index=att_col).nlargest(10).plot(kind='barh')

**Q 17**

In [None]:
# Start from the validation-tuned table, without its two cross-target summary columns.
test_basic_results = best_hyperparameter.drop(columns=['best_param', 'best_param_accurcy'])

# Feature matrix and labels of the test split.
test_examples, test_targets = test[att_col], test[targets_col]

def fill_test_basic_results(name, classifer_contractor):
  """Score the validation-tuned classifier `name` on the test split for every target.

  For each target, a classifier is built with the hyperparameter stored in
  `best_hyperparameter`, fitted on the training split, and its test accuracy is written
  into the '<target>_accurcy' cell of `test_basic_results`.

  Args:
    name: row label of the classifier ('knn', 'decision tree' or 'svm').
    classifer_contractor: callable mapping a hyperparameter value to an unfitted classifier.
  """
  for target in targets_col:
    classifer = classifer_contractor(best_hyperparameter.loc[name, target + '_param'])
    classifer = classifer.fit(train_examples, train_targets[target])
    # Single .loc[row, column] write; the chained .loc[row][column] form may update a
    # temporary copy and leave the table unchanged.
    test_basic_results.loc[name, target + '_accurcy'] = classifer.score(test_examples, test_targets[target])

# Evaluate each classifier family on the test split with its validation-tuned hyperparameters.
fill_test_basic_results('knn', KNeighborsClassifier)
fill_test_basic_results('decision tree', build_DecisionTreeClassifier)
fill_test_basic_results('svm', build_LinearSVC)

# The *_param columns keep the validation-tuned values; the *_accurcy columns now hold test accuracy.
test_basic_results

## Part 5 – Non-Linear SVM

In [29]:
# Training rows labelled covid or cmv, restricted to three PCR features plus the label.
train_is_covid_or_cmv = train.Virus.isin(['covid', 'cmv'])
train_p5 = train.loc[train_is_covid_or_cmv, ['PCR_7', 'PCR_72', 'PCR_89', 'Virus']].copy()
# The label becomes boolean: True for cmv, False for covid.
train_p5['Virus'] = train_p5['Virus'] == 'cmv'

In [30]:
# Validation rows labelled covid or cmv, restricted to three PCR features plus the label.
validation_is_covid_or_cmv = validation.Virus.isin(['covid', 'cmv'])
validation_p5 = validation.loc[validation_is_covid_or_cmv, ['PCR_7', 'PCR_72', 'PCR_89', 'Virus']].copy()
# The label becomes boolean: True for cmv, False for covid.
validation_p5['Virus'] = validation_p5['Virus'] == 'cmv'

**Q20**

In [None]:
# 3-D scatter of the three PCR features, coloured by the boolean cmv/covid label.
# `fig` and `ax` are kept because the following cells re-render the figure from other angles.
fig = plt.figure()
ax = fig.add_subplot(projection='3d')

ax.scatter(train_p5.PCR_7, train_p5.PCR_72, train_p5.PCR_89, c=train_p5.Virus, cmap='RdBu')
ax.set_xlabel('PCR_7')
ax.set_ylabel('PCR_72')
ax.set_zlabel('PCR_89')
# Raw string: "\ " is not a valid escape sequence in a normal string literal (newer Python
# versions warn about it); the rendered title text is unchanged.
plt.title(r'covid \ cmv (PCR: 7, 72, 89)')
plt.show()

In [None]:
# Re-render the 3-D scatter with elevation 90, azimuth 0.
ax.view_init(90,0)
fig

In [None]:
# Re-render the 3-D scatter with elevation 0, azimuth 90.
ax.view_init(0,90)
fig

In [None]:
# Re-render the 3-D scatter with elevation 45, azimuth 45.
ax.view_init(45,45)
fig

In [None]:
# Re-render the 3-D scatter with elevation 120, azimuth 30.
ax.view_init(120,30)
fig

In [None]:
from sklearn.svm import SVC

# Grid search over kernels on the covid/cmv subset: fit on train_p5, score on validation_p5.
# Results are collected in a plain list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and it copied the whole frame on every row.
kernel_records = []
for ker in ['rbf', 'sigmoid']:
  for c in np.linspace(0.05, 15, 150):
    classifer = SVC(kernel=ker, C=c)
    classifer.fit(train_p5[train_p5.columns[:-1]], train_p5.Virus)
    res = classifer.score(validation_p5[validation_p5.columns[:-1]], validation_p5.Virus)
    kernel_records.append({'kernel' : ker, 'C / C+degree': c, 'accurcy':res})

for deg in range(1,4):
    for c in np.linspace(0.05, 15, 50):
      classifer = SVC(kernel='poly', degree=deg, C=c)
      classifer.fit(train_p5[train_p5.columns[:-1]], train_p5.Virus)
      res = classifer.score(validation_p5[validation_p5.columns[:-1]], validation_p5.Virus)
      # For the polynomial kernel the hyperparameter cell is the string "<C>+<degree>".
      kernel_records.append({'kernel' : 'poly','C / C+degree': str(c) +"+"+str(deg), 'accurcy':res})

kernels_test = pd.DataFrame(kernel_records, columns = ['kernel', 'C / C+degree', 'accurcy'])
# Best accuracy first; ties are ordered by the hyperparameter cell.
kernels_test.sort_values(['accurcy', 'C / C+degree'] , ascending=[False, True], inplace=True)
kernels_test.head(10)  # best params

In [None]:
# Full kernel-search table, already sorted by accuracy (best first).
kernels_test

In [None]:
# RBF rows only, largest C first. sort_values returns a new frame, so nothing is sorted
# in place on a filtered slice of kernels_test (which triggers SettingWithCopyWarning).
only_rbf = kernels_test[kernels_test['kernel'] == 'rbf'].sort_values('C / C+degree', ascending=False)
only_rbf

# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     print(only_rbf)

**Q22**

The RBF kernel gives the best results.
The plots below show how the accuracy depends on the hyperparameter C.




In [None]:
# Training accuracy of the RBF SVM as a function of C (fit and score both on train_p5).
c_values = np.linspace(0.05, 15, 150)
# The feature columns are taken from train_p5 itself, since train_p5 is the frame being scored.
train_p5_features = train_p5.columns[:-1]

graphs_data = []
for c in c_values:
  classifer = SVC(kernel='rbf', C=c)
  classifer.fit(train_p5[train_p5_features], train_p5.Virus)
  res = classifer.score(train_p5[train_p5_features], train_p5.Virus)
  graphs_data.append(res)

x = c_values.tolist()
plt.plot(x, graphs_data, '-y')
plt.grid()
plt.title('classification results on train')
plt.xlabel("hyperparameter (c)")
plt.ylabel("accuracy score")
plt.show()

In [None]:
# Validation accuracy of the RBF SVM as a function of C (fit on train_p5, score on validation_p5).
x = np.linspace(0.05, 15, 150).tolist()

graphs_data = []
for c in x:
  classifer = SVC(C=c, kernel='rbf')
  classifer.fit(train_p5[train_p5.columns[:-1]], train_p5.Virus)
  res = classifer.score(validation_p5[validation_p5.columns[:-1]], validation_p5.Virus)
  graphs_data.append(res)

plt.plot(x, graphs_data, '-b')
plt.grid()
plt.title('classification results on validation')
plt.xlabel("hyperparameter (c)")
plt.ylabel("accuracy score")
plt.show()

In [41]:
# One-row table holding the RBF SVM accuracy for each target, initialised to zero.
rbf_accuracy_columns = ['Virus_accurcy','SpreadLevel_accurcy', 'Risk_accurcy']
best_rbf = pd.DataFrame(columns = rbf_accuracy_columns)
best_rbf.loc['rbf'] = [0] * len(rbf_accuracy_columns)

**Q23**

In [None]:
# Take C from the best-scoring RBF row. The overall top row of kernels_test may belong to
# another kernel, and poly rows store a "C+degree" string that SVC cannot use as C.
rbf_rows = kernels_test[kernels_test['kernel'] == 'rbf']
best_rbf_c = float(rbf_rows.sort_values(['accurcy', 'C / C+degree'], ascending=[False, True]).iloc[0]['C / C+degree'])
classifer = SVC(kernel='rbf', C=best_rbf_c)

# TRAIN accuracy per target: refit the same estimator on each target, scoring on the training split.
for target in targets_col:
  classifer.fit(train_examples, train_targets[target])
  best_rbf[target + '_accurcy'] = classifer.score(train_examples, train_targets[target])

best_rbf

In [None]:
# Take C from the best-scoring RBF row. The overall top row of kernels_test may belong to
# another kernel, and poly rows store a "C+degree" string that SVC cannot use as C.
rbf_rows = kernels_test[kernels_test['kernel'] == 'rbf']
best_rbf_c = float(rbf_rows.sort_values(['accurcy', 'C / C+degree'], ascending=[False, True]).iloc[0]['C / C+degree'])
classifer = SVC(kernel='rbf', C=best_rbf_c)

# VALIDATION accuracy per target: fit on the training split, score on the validation split.
for target in targets_col:
  classifer.fit(train_examples, train_targets[target])
  best_rbf[target + '_accurcy'] = classifer.score(validation_examples, validation_targets[target])

best_rbf

# **Experimenting**

In [None]:
# Pool every labelled split into one frame with a fresh 0..n-1 index for cross-validation.
# NOTE(review): test_raw is read from the same URL as train_raw in the loading cell, so
# the training rows presumably appear twice in this pool — confirm.
all_examples_data = pd.concat([train, validation, test], ignore_index=True)
all_examples_data

In [45]:
def run_general_classification(name, classifer_contractor, param_list, df_train, df_test, atts, targets, more_options=True, extra_func=do_nothing):
  """Fit and score one classifier family for every (target, hyperparameter) combination.

  Args:
    name: classifier label passed through to `extra_func`.
    classifer_contractor: callable mapping a hyperparameter value to an unfitted classifier.
    param_list: hyperparameter values to try.
    df_train: frame used for fitting (features `atts`, labels `targets`).
    df_test: frame used for scoring.
    atts: feature column names.
    targets: label column names.
    more_options: accepted but not used by this function.
    extra_func: callback invoked as extra_func(name, target, param, res) after each fit.
  """
  for target in targets:
    for param in param_list:
      fitted = classifer_contractor(param).fit(df_train[atts], df_train[target])
      accuracy = fitted.score(df_test[atts], df_test[target])
      extra_func(name, target, param, accuracy)

In [46]:
# Top-5 ranking per classifier and target, filled by cross-validation.
# Rows are labelled '<classifier>(<rank>)', with rank 1 being the best.
best_hyperparameter2 = pd.DataFrame(columns = ['Virus_param', 'Virus_accurcy', 'SpreadLevel_param', 'SpreadLevel_accurcy', 'Risk_param', 'Risk_accurcy'], dtype=object)

def create_best_hyperparameter2_hist():
  """(Re)initialise every '<classifier>(<rank>)' row of `best_hyperparameter2` to zeros.

  The table is updated in place. Columns are kept as object dtype so that integer
  hyperparameters (k, depth) and float ones (C) can share a column without the
  integers being converted to float.
  """
  zero_row = [0] * len(best_hyperparameter2.columns)
  for classifier_name in ['knn', 'decision tree', 'svm']:
    for rank in range(1, 6):
      best_hyperparameter2.loc[classifier_name+"("+str(rank)+")"] = zero_row
  # Adding rows by enlargement lets pandas infer a numeric dtype; pin it back to object.
  for column in best_hyperparameter2.columns:
    best_hyperparameter2[column] = best_hyperparameter2[column].astype(object)
create_best_hyperparameter2_hist()

In [47]:
def hist_classification_results(name, target, param, res):
  """Insert (`param`, `res`) into the top-5 ranking of `best_hyperparameter2`.

  The ranking for classifier `name` and label `target` lives in rows '<name>(1)' ..
  '<name>(5)'. If `res` beats the accuracy stored at some rank, the entries from that
  rank downwards are shifted one place (the 5th is discarded) and the new result is
  stored at that rank. A result that beats none of the five is ignored.
  """
  param_col = target + "_param"
  acc_col = target + "_accurcy"
  for i in range(1, 6):
    row = name + "(" + str(i) + ")"
    # All reads and writes use a single .loc[row, column] call; the chained form
    # .loc[row][column] = value may update a temporary copy and lose the write.
    if best_hyperparameter2.loc[row, acc_col] < res:
      # Shift ranks i..4 down to i+1..5, starting from the bottom.
      for j in range(5, i, -1):
        lower_row = name + "(" + str(j) + ")"
        upper_row = name + "(" + str(j - 1) + ")"
        best_hyperparameter2.loc[lower_row, param_col] = best_hyperparameter2.loc[upper_row, param_col]
        best_hyperparameter2.loc[lower_row, acc_col] = best_hyperparameter2.loc[upper_row, acc_col]
      best_hyperparameter2.loc[row, param_col] = param
      best_hyperparameter2.loc[row, acc_col] = res
      return

In [48]:
# Running totals per "<name>-<target>-<param>" key: [number of folds seen, summed accuracy].
kfold_tmp = {}

def kfold_fill_hist_results(name, target, param, res):
  """Accumulate one fold's accuracy and publish the mean once 5 folds have been seen.

  When the fifth result for a (name, target, param) combination arrives, the average
  of the five accuracies is handed to `hist_classification_results`.
  """
  key = "-".join(str(part) for part in (name, target, param))
  tally = kfold_tmp.setdefault(key, [0, 0])
  tally[1] += res
  tally[0] += 1
  if tally[0] == 5:
    hist_classification_results(name, target, param, tally[1]/5)

In [49]:
from sklearn.model_selection import KFold

def run_cross_validation(df, atts, targets):
  """Tune knn, decision tree and linear SVM with 5-fold cross-validation over `df`.

  The fold accumulator and the `best_hyperparameter2` ranking are reset first. Every
  fold then fits each classifier family over its hyperparameter grid and reports the
  fold accuracy to `kfold_fill_hist_results`, which averages over the folds and
  updates the ranking.

  Args:
    df: labelled frame containing the `atts` and `targets` columns.
    atts: feature column names.
    targets: label column names.

  Returns:
    The (global) `best_hyperparameter2` table.
  """
  print(atts)
  kfold_tmp.clear()
  create_best_hyperparameter2_hist()
  # n_splits must stay 5: kfold_fill_hist_results averages after exactly 5 results per key.
  cross_group = KFold(n_splits=5, shuffle=True, random_state=1234567)
  for train_index, test_index in cross_group.split(df):
    # KFold.split yields row positions, so select with iloc; loc would only be right
    # for a frame whose index happens to be 0..n-1.
    train_set = df.iloc[train_index]
    test_set = df.iloc[test_index]
    run_general_classification("knn", KNeighborsClassifier, range(3,30), train_set, test_set, atts, targets, more_options=True, extra_func=kfold_fill_hist_results)
    run_general_classification("decision tree", build_DecisionTreeClassifier, range(1,50), train_set, test_set, atts, targets, more_options=True, extra_func=kfold_fill_hist_results)
    # Full SVM search ranges (they take a couple of minutes to run):
    # run_general_classification("svm", build_LinearSVC, np.linspace(0.0001, 4, 40), train_set, test_set, atts, targets, more_options=True, extra_func=kfold_fill_hist_results)
    # run_general_classification("svm", build_LinearSVC, np.linspace(25, 65, 80), train_set, test_set, atts, targets, more_options=True, extra_func=kfold_fill_hist_results)
    # To keep the run fast, only the sub-ranges that scored best in the full search are used.
    run_general_classification("svm", build_LinearSVC, np.linspace(2, 4, 10), train_set, test_set, atts, targets, more_options=True, extra_func=kfold_fill_hist_results)
    run_general_classification("svm", build_LinearSVC, np.linspace(44, 46, 10), train_set, test_set, atts, targets, more_options=True, extra_func=kfold_fill_hist_results)
  return best_hyperparameter2

In [None]:
# Cross-validate on the pooled labelled data; the ranking is written into best_hyperparameter2.
run_cross_validation(df=all_examples_data, atts=att_col, targets=targets_col)
best_hyperparameter2

**More experiments that helped us understand the task but were not used in our final report**

In [51]:
# def highlighting_important_features(dataset, alpha):
 
#   most_important_features = ['BMI', 'ConversatiosPerDay', 'DisciplineScore',  'HappinessScore', 'HouseholdExpenseOnPresents','HouseholdExpenseOnSocialGames',
#                              'MedicalCarePerYear', 'PCR_10', 'PCR_17', 'PCR_19', 'SocialMediaPerDay', 'PCR_8', 'PCR_72', 
#                              'PCR_10', 'PCR_95', 'PCR_76', 'PCR_7', 'PCR_9', 'PCR_89', 'PCR_19', 'PCR_93']
#   for col in most_important_features:
#     dataset[col] = dataset[col]*alpha
  
#   return dataset


# def tuning_features(dataset):  
#   dataset['AgeGroup'] = dataset['AgeGroup']*0.235
#   dataset['BMI'] = dataset['BMI']*2.45
#   dataset['HouseholdExpenseOnSocialGames'] = dataset['HouseholdExpenseOnSocialGames']*2.9
#   dataset['PCR_89'] = dataset['PCR_89']*3
  
#   return dataset

In [52]:
# best_res = 0
# best_combination = ""

# for att in ["AgeGroup", "BMI", "HouseholdExpenseOnSocialGames", 'PCR_89']:
#   for alpha in [0.5, 1.5, 2.5]:
#     train_copy = train.copy()
#     validation_copy = validation.copy()

#     validation_targets = validation_copy[targets_col]
#     train_targets = train[targets_col]

#     train_copy[att] = train[att]*alpha
#     validation_copy[att] = validation[att]*alpha

#     validation_examples = validation_copy[att_col]
#     train_examples = train_copy[att_col]

#     for tar in targets_col:
#       classifer = KNeighborsClassifier(12)
#       classifer = classifer.fit(train_examples, train_targets[tar])
#       res = classifer.score(validation_examples, validation_targets[tar])
#       print(str(att) + ": " + str(alpha) + ": " + tar +": res: " + str(res))


In [53]:
# check_att = ['ID', 'AgeGroup', 'BMI', 'BloodType',
#        'ConversatiosPerDay', 'x_location', 'y_location',
#        'MonthOfPCRTest', 'DisciplineScore', 'HappinessScore',
#        'HouseholdExpenseOnPresents', 'HouseholdExpenseOnSocialGames',
#        'HouseholdExpenseParkingTicketsPerYear', 'TicketsPerYearGroup',
#        'MedicalCarePerYear', 'PCR_10', 'PCR_17', 'PCR_19', 'PCR_32', 'PCR_45',
#        'PCR_46', 'PCR_7', 'PCR_72', 'PCR_76', 'PCR_8', 'PCR_89', 'PCR_9',
#        'PCR_93', 'PCR_95',
#        'Is_having_Chills', 'Is_having_Congestion_or_runny nose',
#        'Is_having_Low_appetite', 'Is_having_Diarrhea', 'Is_having_Cough',
#        'Is_having_Fever', 'Is_having_Skin_redness',
#        'Is_having_Shortness_of_breath', 'Is_having_Fatigue',
#        'Is_having_New_loss_of_taste_or_smell', 'Is_having_Nausea_or_vomiting',
#        'Is_having_Sore_throat', 'Is_having_Headache',
#        'Is_having_Muscle_or_body_aches', 'Sex', 'SocialMediaPerDay',
#        'StudingPerDay']
# all_raw_data = pd.concat([train_raw, validation_raw, test_raw], ignore_index=True)
# tmp = check_att.copy()
# tmp.extend(targets_col)
# all_raw_data = all_raw_data[tmp].copy()
# all_raw_data

In [54]:
# for col in check_att:
#   if col not in att_col:
#     tmp = att_col.copy()
#     tmp.append(col)
#     print("checking about adding " + str(col))
#     res = run_cross_validation(df=all_raw_data, atts=tmp, targets=targets_col)
#     print("knn: virus " + str(res['Virus_accurcy']['knn(1)']) + " , spreadlevel " + str(res['SpreadLevel_accurcy']['knn(1)']) + " , risk " + str(res['Risk_accurcy']['knn(1)']))
#     print("decision tree: virus " + str(res['Virus_accurcy']['decision tree(1)']) + " , spreadlevel " + str(res['SpreadLevel_accurcy']['decision tree(1)']) + " , risk " + str(res['Risk_accurcy']['decision tree(1)']))
#     print("linear svm: virus " + str(res['Virus_accurcy']['linear svm(1)']) + " , spreadlevel " + str(res['SpreadLevel_accurcy']['linear svm(1)']) + " , risk " + str(res['Risk_accurcy']['linear svm(1)']))
#     print(res)

In [55]:
# for att in att_col:
#   tmp = [att]
#   print("checking score only for " + str(att))
#   res = run_cross_validation(df=all_raw_data, atts=tmp, targets=targets_col)
#   virus_score = res['Virus_accurcy']['knn(1)'] + res['Virus_accurcy']['decision tree(1)'] + res['Virus_accurcy']['linear svm(1)']
#   spread_score = res['SpreadLevel_accurcy']['knn(1)'] + res['SpreadLevel_accurcy']['decision tree(1)'] + res['SpreadLevel_accurcy']['linear svm(1)']
#   risk_score = res['Risk_accurcy']['knn(1)'] + res['Risk_accurcy']['decision tree(1)'] + res['Risk_accurcy']['linear svm(1)']
#   avg_score = (virus_score + spread_score + risk_score)/3
#   each_att_targets_score.loc[att] = [virus_score, spread_score, risk_score, avg_score]

#   knn_score = res['Virus_accurcy']['knn(1)'] + res['SpreadLevel_accurcy']['knn(1)'] + res['Risk_accurcy']['knn(1)']
#   dt_score = res['Virus_accurcy']['decision tree(1)'] + res['SpreadLevel_accurcy']['decision tree(1)'] + res['Risk_accurcy']['decision tree(1)']
#   svm_score = res['Virus_accurcy']['linear svm(1)'] + res['SpreadLevel_accurcy']['linear svm(1)'] + res['Risk_accurcy']['linear svm(1)']
#   avg_score = (knn_score + dt_score + svm_score)/3
#   each_att_classifer_score.loc[att] = [knn_score, dt_score, svm_score, avg_score]

In [56]:
# for att in att_col:
#   best_alpha = 0
#   best_acc = 0
#   for alpha in np.linspace(0.1, 3.5, 20):
#     train_copy = train.copy()
#     validation_copy = validation.copy()
#     train_copy[att] = train_copy[att]*alpha
#     validation_copy[att] = validation_copy[att]*alpha

#     validation_examples = validation_copy[att_col]
#     validation_targets = validation_copy[targets_col]

#     train_examples = train_copy[att_col]
#     train_targets = train_copy[targets_col]

#     print("att: " + att + ", alpha: " + str(alpha))
#     res = run_basic_classification('knn', KNeighborsClassifier, range(4,20))
#     if res > best_acc:
#       best_acc = res
#       best_alpha = alpha
#   print("@@@ best alpha for " + att + " is: ~" + str(best_alpha) + " with avg accurcy of " + str(best_acc))


# Part 6 - Bonuses

In [None]:
# Final training pool: every labelled split, restricted to model features and labels,
# without incomplete rows.
final_train = pd.concat([train, validation, test])[all_cols]
final_train = final_train.dropna()

# data_transformation modifies a column slice in place, which triggers pandas'
# chained-assignment warning; the warning is disabled only around this call.
pd.options.mode.chained_assignment = None  # default='warn'
test_hw2 = data_transformation(virus_test_raw.copy())
pd.options.mode.chained_assignment = 'warn'
test_hw2

In [58]:
# The output frame shares test_hw2's index so the ID column lines up row by row with the
# predictions, even if data_transformation dropped rows and left gaps in the index.
bonusDF = pd.DataFrame(index=test_hw2.index, columns = ['ID', 'Virus', 'SpreadLevel', 'Risk'])
bonusDF['ID'] = test_hw2['ID']

# One decision tree per target, each using the rank-1 depth found by cross-validation.
dt_virus_classifier = build_DecisionTreeClassifier(best_hyperparameter2['Virus_param']["decision tree(1)"])
dt_virus_classifier.fit(final_train[att_col], final_train['Virus'])
bonusDF['Virus'] = dt_virus_classifier.predict(test_hw2[att_col])

dt_spreadLevel_classifier = build_DecisionTreeClassifier(best_hyperparameter2['SpreadLevel_param']["decision tree(1)"])
dt_spreadLevel_classifier.fit(final_train[att_col], final_train['SpreadLevel'])
bonusDF['SpreadLevel'] = dt_spreadLevel_classifier.predict(test_hw2[att_col])

dt_risk_classifier = build_DecisionTreeClassifier(best_hyperparameter2['Risk_param']["decision tree(1)"])
dt_risk_classifier.fit(final_train[att_col], final_train['Risk'])
# Risk is predicted with the Risk classifier (not the SpreadLevel one).
bonusDF['Risk'] = dt_risk_classifier.predict(test_hw2[att_col])

bonusDF.to_csv("pred_custom.csv", index=False)
files.download("pred_custom.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Part 7

**create output files**

In [None]:
# Cross-validated top-5 ranking; the rank-1 row of each classifier supplies the
# hyperparameters used for the final prediction files.
best_hyperparameter2
# from here we choose the best param for each classifier and target

In [None]:
# Working copy of the unlabelled data, so the raw frame stays unmodified.
virus_test = virus_test_raw.copy()
virus_test

In [None]:
# Final training pool: every labelled split, restricted to model features and labels,
# without incomplete rows.
final_train = pd.concat([train, validation, test])[all_cols]
final_train = final_train.dropna()
final_train

In [62]:
# transformation_helper

In [None]:
# data_transformation modifies a column slice in place, which triggers pandas'
# chained-assignment warning; the warning is disabled only around this call.
pd.options.mode.chained_assignment = None  # default='warn'
test_hw2 = data_transformation(virus_test.copy())
pd.options.mode.chained_assignment = 'warn'
test_hw2

In [64]:
def predict_per_target(virus_classifier,spreadLevel_classifier, risk_classifier , df, id):
  """Predict all three targets for `df` and return them together with the row IDs.

  Args:
    virus_classifier: fitted classifier for the Virus target.
    spreadLevel_classifier: fitted classifier for the SpreadLevel target.
    risk_classifier: fitted classifier for the Risk target.
    df: preprocessed frame containing the `att_col` feature columns.
    id: IDs for the rows of `df`; either a Series indexed like `df` (extra labels are
      ignored) or a plain sequence with one entry per row of `df`.

  Returns:
    A frame with columns ID, Virus, SpreadLevel, Risk and a fresh 0..n-1 index.
  """
  # Build the result on df's own index so that a Series of IDs is matched to the right
  # rows; aligning it against a fresh 0..n-1 index would mis-assign IDs whenever df's
  # index has gaps (e.g. after rows were dropped).
  final = pd.DataFrame(index=df.index, columns = ['ID', 'Virus', 'SpreadLevel', 'Risk'])
  features = df[att_col]
  final['Virus'] = virus_classifier.predict(features)
  final['SpreadLevel'] = spreadLevel_classifier.predict(features)
  final['Risk'] = risk_classifier.predict(features)
  final['ID'] = id
  return final.reset_index(drop=True)

In [65]:
def create_file(classifier_ctor, classifier_name):
  """Train one classifier per target on `final_train` and predict the unlabelled data.

  The hyperparameter for each target is the rank-1 entry of `best_hyperparameter2`
  (the cross-validated table, not `best_hyperparameter`) for `classifier_name`.

  Args:
    classifier_ctor: callable mapping a hyperparameter value to an unfitted classifier.
    classifier_name: row prefix in `best_hyperparameter2` ('knn', 'decision tree', 'svm').

  Returns:
    The prediction frame produced by `predict_per_target` for `test_hw2`.
  """
  best_row = classifier_name + "(1)"
  fitted_models = []
  for target in ('Virus', 'SpreadLevel', 'Risk'):
    model = classifier_ctor(best_hyperparameter2[target + '_param'][best_row])
    model.fit(final_train[att_col], final_train[target])
    fitted_models.append(model)
  virus_model, spread_model, risk_model = fitted_models
  return predict_per_target(virus_model, spread_model, risk_model, test_hw2, virus_test['ID'])


In [66]:
# One prediction frame per classifier family, each using its cross-validated rank-1 hyperparameters.
df_knn = create_file(KNeighborsClassifier, 'knn')
df_dt = create_file(build_DecisionTreeClassifier, 'decision tree')
df_svm = create_file(build_LinearSVC, 'svm')



In [67]:
def download_file(df, name):
  """Write `df` to the CSV file `name` (without the index) and start a Colab download of it."""
  df.to_csv(path_or_buf=name, index=False)
  files.download(name)

In [68]:
# Export the three prediction files: decision tree, knn, svm.
for prediction_frame, csv_name in (
    (df_dt, 'pred_decision_tree.csv'),
    (df_knn, 'pred_knn.csv'),
    (df_svm, 'pred_svm.csv'),
):
  download_file(prediction_frame, csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [69]:
# Marker printed when the notebook has run through to the end.
print("DONE")

DONE
