In [43]:
# import libraries

import numpy as np 
import pandas as pd
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier
import random

from google.colab import files
from scipy import sparse

from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold

from sklearn.feature_selection import SelectKBest, f_regression, chi2
from numpy import mean
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [44]:

# Remote locations of the labelled training file and the unlabelled test file.
train_data_url = 'https://raw.githubusercontent.com/ridwant/DataMinig/main/hw2train.txt'
test_data_url = 'https://raw.githubusercontent.com/ridwant/DataMinig/main/1664296410_921989_test.txt'

# Training rows carry a label column and an attribute column; the 0/1 label is
# recoded to -1/+1 (any other label value becomes NaN through ``map``).
train_df = (
  pd.read_table(
    train_data_url,
    header=None,
    skip_blank_lines=False,
    names=['active', 'attribute'],
  )
  .assign(active=lambda frame: frame['active'].map({1: 1, 0: -1}))
)

# Test rows carry only the attribute column; blank lines are kept so the row
# order/count of the file is preserved.
test_df = pd.read_table(
  test_data_url,
  header=None,
  skip_blank_lines=False,
  names=['attribute'],
)

In [45]:
def create_sparse_matrix(samples, n_features=None):
  """Build a sparse indicator matrix from whitespace-separated column indices.

  Each element of ``samples`` describes one row: a string of integers
  separated by whitespace, where every integer is a column index that gets a
  1 in that row. An index repeated within one sample is summed by scipy when
  the matrix is assembled, so that entry holds the repeat count.

  Args:
    samples: iterable of row descriptions. Elements that are not strings
      (for example the NaN pandas yields for a blank line kept with
      ``skip_blank_lines=False``) are treated as rows with no active columns.
    n_features: optional number of columns of the result. When omitted the
      width is inferred as ``largest index + 1``, which means two data sets
      built separately can end up with different widths; pass the training
      width here to keep a second matrix compatible with it.

  Returns:
    A ``scipy.sparse.csc_matrix`` with exactly one row per element of
    ``samples`` (trailing empty rows included).

  Raises:
    ValueError: if a token cannot be parsed as an integer, or (from scipy)
      if a column index does not fit inside ``n_features``.
  """
  row_idxs = []
  col_idxs = []
  n_rows = 0
  for idx, sample in enumerate(samples):
    # Track the row count explicitly: inferring it from the largest row index
    # would silently drop trailing rows that have no active column.
    n_rows = idx + 1
    if not isinstance(sample, str):
      continue
    for token in sample.split():
      col_idxs.append(int(token))
      row_idxs.append(idx)

  if n_features is None:
    n_features = max(col_idxs) + 1 if col_idxs else 0

  vals = [1] * len(col_idxs)
  return sparse.csc_matrix((vals, (row_idxs, col_idxs)), shape=(n_rows, n_features))

In [46]:
class AdaBoostClassifier:
  """Hand-rolled AdaBoost for two classes encoded as -1 and +1.

  Each round fits a weak learner on the current sample weights, measures its
  weighted error, derives a vote weight ``alpha`` from that error and then
  multiplies the weights of the misclassified samples by ``exp(alpha)``.
  Prediction is the sign of the alpha-weighted sum of the learners' outputs,
  which is why the labels must be -1/+1.

  Attributes (all rebuilt by every call to ``fit``):
    alpha: list with the vote weight of each fitted weak learner.
    weak_estimator: list with the fitted weak learners, in fitting order.
    errors: list with the weighted training error of each weak learner.
    n_estimators: number of boosting rounds to run.
  """

  # The weighted error is clipped to [_EPS, 1 - _EPS] before the log-odds are
  # taken, so a perfect (or perfectly wrong) learner gets a large finite alpha
  # instead of +/-inf.
  _EPS = 1e-10

  def __init__(self, n_estimators=20):
    self.alpha = []
    self.weak_estimator = []
    self.n_estimators = n_estimators
    self.errors = []

  def misclassification_rate(self, misclassified_labels, weights):
    """Return the weighted share of misclassified samples.

    Args:
      misclassified_labels: 0/1 array, 1 where the prediction was wrong.
      weights: current sample weights (they need not sum to one).
    """
    weights = np.asarray(weights)
    misclassified_labels = np.asarray(misclassified_labels)
    return np.sum(weights * misclassified_labels) / np.sum(weights)

  def get_alpha(self, err):
    """Return the vote weight ``log((1 - err) / err)`` for a weighted error.

    ``err`` is clipped away from 0 and 1 first so the result is always finite.
    """
    err = np.clip(err, self._EPS, 1.0 - self._EPS)
    return np.log((1.0 - err) / err)

  def update_weights(self, misclassified_labels, weights, alpha):
    """Scale the weights of misclassified samples by ``exp(alpha)``.

    Correctly classified samples keep their weight. The result is not
    renormalised; ``misclassification_rate`` divides by the weight sum.
    """
    return weights * np.exp(alpha * misclassified_labels)

  def _make_estimator(self, alter_base):
    """Create an unfitted weak learner: Bernoulli NB if ``alter_base`` else a depth-1 tree."""
    if alter_base:
      return BernoulliNB()
    return DecisionTreeClassifier(max_depth=1, random_state=123)

  def fit(self, X, y, alter_base=False):
    """Run ``n_estimators`` boosting rounds on ``X`` / ``y``.

    Args:
      X: feature matrix with a ``shape`` attribute, accepted by the weak learner.
      y: labels in {-1, +1}; any array-like (pandas Series, list, ndarray).
      alter_base: when True the weak learner is ``BernoulliNB`` instead of a
        depth-1 ``DecisionTreeClassifier``.

    Returns:
      self, so calls can be chained.
    """
    # Start from a clean slate: without this a second fit() on the same object
    # would keep voting with the learners of the previous fit.
    self.alpha = []
    self.weak_estimator = []
    self.errors = []

    y_true = np.asarray(y)
    n_samples = X.shape[0]
    weights_i = np.ones(n_samples) / n_samples

    for _ in range(self.n_estimators):
      estimator = self._make_estimator(alter_base)
      estimator.fit(X, y_true, sample_weight=weights_i)
      self.weak_estimator.append(estimator)

      y_pred = estimator.predict(X)
      misclassified_labels = np.not_equal(y_true, y_pred).astype(int)

      error_i = self.misclassification_rate(misclassified_labels, weights_i)
      self.errors.append(error_i)

      alpha_i = self.get_alpha(error_i)
      self.alpha.append(alpha_i)

      weights_i = self.update_weights(misclassified_labels, weights_i, alpha_i)

    return self

  def predict(self, X):
    """Predict -1/+1 labels as the sign of the alpha-weighted vote.

    A vote that sums to exactly zero is assigned to the negative class, so the
    result only ever contains -1.0 and 1.0.

    Raises:
      ValueError: if the classifier has not been fitted yet.
    """
    if not self.weak_estimator:
      raise ValueError("AdaBoostClassifier is not fitted yet; call fit() first.")

    weighted_votes = [
      alpha_i * estimator.predict(X)
      for alpha_i, estimator in zip(self.alpha, self.weak_estimator)
    ]
    scores = np.sum(weighted_votes, axis=0)
    return np.where(scores > 0, 1.0, -1.0)

In [47]:
def stratified_kFold_cross_validation_with_adaboost(X, Y, fold=5, n_estimators=20, alter_base = False):
  """Score the hand-written AdaBoost with shuffled stratified k-fold CV.

  Args:
    X: feature matrix that supports row selection with an integer array
      (``X[idx]``), e.g. a scipy sparse matrix or an ndarray.
    Y: labels in {-1, +1}; any array-like. Rows are matched to ``X`` by
      position, not by pandas index label.
    fold: number of folds.
    n_estimators: boosting rounds per fold.
    alter_base: forwarded to ``AdaBoostClassifier.fit`` to pick the weak learner.

  Returns:
    Tuple ``(recall, precision, f1)`` averaged over the folds.
  """
  # Re-wrap the labels with a fresh 0..n-1 index so the fold indices, which are
  # positions, select the intended rows whatever index the caller's Series had.
  labels = pd.Series(np.asarray(Y))
  skf = StratifiedKFold(n_splits=fold, shuffle=True, random_state=1)

  recalls = []
  precisions = []
  f1_scores = []
  for train, test in skf.split(X, labels):
    # A new booster per fold: reusing one instance lets learners fitted on an
    # earlier fold (whose training part contains this fold's test rows) vote.
    ab = AdaBoostClassifier(n_estimators=n_estimators)
    ab.fit(X[train], labels.iloc[train], alter_base=alter_base)

    # Collapse the vote to strict -1/+1 so a tied vote (sign 0) cannot reach
    # the binary metrics as a third label.
    prediction = np.where(np.asarray(ab.predict(X[test])) > 0, 1, -1)
    y_test = labels.iloc[test]

    recalls.append(recall_score(y_test, prediction))
    precisions.append(precision_score(y_test, prediction))
    f1_scores.append(f1_score(y_test, prediction))

  return sum(recalls) / fold, sum(precisions) / fold, sum(f1_scores) / fold


def cross_validation_with_decision_tree(X, y, seed=1, n_estimators=79):
  """Sweep the number of chi2-selected features for AdaBoost with tree stumps.

  For every ``k`` in 190..299 the under-sampled data is reduced to its ``k``
  best features, scored with
  ``stratified_kFold_cross_validation_with_adaboost`` and the result is
  printed; finally recall, precision and F1 are plotted against ``k``.

  Args:
    X: feature matrix of the full training set.
    y: labels of the full training set.
    seed: random state of the under-sampler.
    n_estimators: boosting rounds used for every ``k``.

  NOTE(review): the feature selector is fitted on all under-sampled rows
  before the folds are drawn, so the held-out rows of each fold have already
  influenced the selection; the reported scores are likely optimistic.
  """
  # The sampler is seeded, so it returns the same subset every time; run it
  # once instead of once per k.
  under = RandomUnderSampler(random_state=seed)
  X_resampled, y_resampled = under.fit_resample(X, y)

  prec_arr = []
  rec_arr = []
  f1_arr = []
  best_k = []
  for k in range(190, 300):
    fs = SelectKBest(score_func=chi2, k=k)
    X_selected = fs.fit_transform(X_resampled, y_resampled)
    rec, prec, f1 = stratified_kFold_cross_validation_with_adaboost(X_selected, y_resampled, n_estimators=n_estimators)
    best_k.append(k)
    prec_arr.append(prec)
    rec_arr.append(rec)
    f1_arr.append(f1)
    print('For %d-Best Feature, Recall-Score: %.3f Precision-Score: %.3f F1-Score: %.3f' % (k, rec, prec, f1))

  plt.plot(best_k, rec_arr, label = "recall")
  plt.plot(best_k, prec_arr, label = "precision")
  plt.plot(best_k, f1_arr, label = "f1-score")
  # 'Score' belongs on the y-axis; it used to be set as an x-label and was
  # immediately overwritten by the next call.
  plt.ylabel('Score')
  plt.xlabel('K-best Feature Number')
  plt.title('Experiment With AdaBoost (Base = Decision Tree)')
  plt.legend()
  plt.show()


def cross_validation_with_decision_tree_n_estimators(X, y, seed=1):
  """Sweep the number of boosting rounds for AdaBoost with tree stumps.

  The data is under-sampled and reduced to its 255 best chi2 features once;
  then for every ``n`` in 3..49 it is scored with
  ``stratified_kFold_cross_validation_with_adaboost`` and the result is
  printed. Finally recall, precision and F1 are plotted against ``n``.

  Args:
    X: feature matrix of the full training set.
    y: labels of the full training set.
    seed: random state of the under-sampler.

  NOTE(review): the feature selector is fitted on all under-sampled rows
  before the folds are drawn, so the reported scores are likely optimistic.
  """
  # Neither the seeded under-sampling nor the fixed-k selection depends on n,
  # so both are done once outside the sweep.
  under = RandomUnderSampler(random_state=seed)
  X_resampled, y_resampled = under.fit_resample(X, y)
  fs = SelectKBest(score_func=chi2, k=255)
  X_selected = fs.fit_transform(X_resampled, y_resampled)

  prec_arr = []
  rec_arr = []
  f1_arr = []
  best_n = []
  for n in range(3, 50):
    rec, prec, f1 = stratified_kFold_cross_validation_with_adaboost(X_selected, y_resampled, n_estimators=n)
    best_n.append(n)
    prec_arr.append(prec)
    rec_arr.append(rec)
    f1_arr.append(f1)
    print('For %d-Number of Estimators, Recall-Score: %.3f Precision-Score: %.3f F1-Score: %.3f' % (n, rec, prec, f1))

  plt.plot(best_n, rec_arr, label = "recall")
  plt.plot(best_n, prec_arr, label = "precision")
  plt.plot(best_n, f1_arr, label = "f1-score")
  # 'Score' belongs on the y-axis; it used to be set as an x-label and was
  # immediately overwritten by the next call.
  plt.ylabel('Score')
  plt.xlabel('Number of Estimators')
  plt.title('Experiment With AdaBoost (Base = Decision Tree)')
  plt.legend()
  plt.show()


def cross_validation_with_bernouli(X, y, seed=1):
  """Sweep the number of boosting rounds for AdaBoost with Bernoulli NB.

  The data is under-sampled and reduced to its 255 best chi2 features once;
  then for every ``n`` in 1..29 it is scored with
  ``stratified_kFold_cross_validation_with_adaboost`` (``alter_base=True``)
  and the result is printed. Finally recall, precision and F1 are plotted
  against ``n``.

  Args:
    X: feature matrix of the full training set.
    y: labels of the full training set.
    seed: random state of the under-sampler.

  NOTE(review): the feature selector is fitted on all under-sampled rows
  before the folds are drawn, so the reported scores are likely optimistic.
  """
  # Neither the seeded under-sampling nor the fixed-k selection depends on n,
  # so both are done once outside the sweep.
  under = RandomUnderSampler(random_state=seed)
  X_resampled, y_resampled = under.fit_resample(X, y)
  fs = SelectKBest(score_func=chi2, k=255)
  X_selected = fs.fit_transform(X_resampled, y_resampled)

  prec_arr = []
  rec_arr = []
  f1_arr = []
  best_n = []
  for n in range(1, 30):
    rec, prec, f1 = stratified_kFold_cross_validation_with_adaboost(X_selected, y_resampled, n_estimators=n, alter_base=True)
    best_n.append(n)
    prec_arr.append(prec)
    rec_arr.append(rec)
    f1_arr.append(f1)
    print('For %d-Number of Estimators, Recall-Score: %.3f Precision-Score: %.3f F1-Score: %.3f' % (n, rec, prec, f1))

  plt.plot(best_n, rec_arr, label = "recall")
  plt.plot(best_n, prec_arr, label = "precision")
  plt.plot(best_n, f1_arr, label = "f1-score")
  # 'Score' belongs on the y-axis; it used to be set as an x-label and was
  # immediately overwritten by the next call.
  plt.ylabel('Score')
  plt.xlabel('Number of Estimators')
  plt.title('Experiment With AdaBoost (Base = Bernoulli Naive Bayes)')
  plt.legend()
  plt.show()

In [48]:
def exp_1():
  """Experiment 1: run ``cross_validation_with_decision_tree`` on the training frame."""
  features = create_sparse_matrix(train_df['attribute'])
  labels = train_df['active']
  cross_validation_with_decision_tree(features, labels)

# exp_1()

In [49]:
def exp_2():
  """Experiment 2: run ``cross_validation_with_decision_tree_n_estimators`` on the training frame."""
  features = create_sparse_matrix(train_df['attribute'])
  labels = train_df['active']
  cross_validation_with_decision_tree_n_estimators(features, labels)

# exp_2()

In [50]:
def exp_3():
  """Experiment 3: run ``cross_validation_with_bernouli`` on the training frame."""
  features = create_sparse_matrix(train_df['attribute'])
  labels = train_df['active']
  cross_validation_with_bernouli(features, labels)

# exp_3()

In [None]:
# Final Prediction on Test Data
#
# Pipeline: under-sample the training set, keep the 255 best chi2 features,
# fit AdaBoost with 14 Bernoulli-NB learners, then label the test rows and
# write them as 0/1, one per line.

X = create_sparse_matrix(train_df['attribute'])
Y = train_df['active']

under =  RandomUnderSampler(random_state=1)
X_resampled, y_resampled = under.fit_resample(X, Y)

fs = SelectKBest(score_func=chi2, k=255)
X_selected = fs.fit_transform(X_resampled, y_resampled)

ab = AdaBoostClassifier(n_estimators=14)
ab.fit(X_selected, y_resampled, alter_base=True)

X_test = create_sparse_matrix(test_df['attribute'])
# The test matrix is built on its own, so its inferred shape need not match the
# training matrix. Force one row per test record and the training column count
# (in place): missing rows/columns are zero-padded, and columns the selector
# never saw are dropped, so fs.transform receives the width it was fitted on.
X_test.resize((len(test_df), X.shape[1]))
X_test_reduced = fs.transform(X_test)

predictions = ab.predict(X_test_reduced)
# Positive vote -> 1 (active), anything else -> 0. Thresholding instead of
# mapping {1: 1, -1: 0} keeps a tied vote (sign 0) from turning into NaN.
df = pd.DataFrame((np.asarray(predictions) > 0).astype(int))
df.to_csv('final_prediction.csv', index=False, header=False)
files.download("final_prediction.csv")