In [1]:
import numpy as np
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier 
from sklearn.neural_network import MLPClassifier

def preprocess(filename):
    f = open(filename, 'r')
    data_table = {}
    data_table["data"] = [];
    data_table["target"] = [];
    for line in f.readlines():
        data = line.strip().split(",")
        attribute_values = data[1:-1]
        class_values = data[-1]
        data_table["data"].append(attribute_values)
        data_table["target"].append(class_values)
    
    data_table["data"] = np.array(data_table["data"]).astype(np.float)
    return data_table

class StackingClassifier():

    def __init__(self, classifiers, metaclassifier):
        self.classifiers = classifiers
        self.metaclassifier = metaclassifier

    def fit(self, X, y):
        for clf in self.classifiers:
            clf.fit(X, y)
        X_meta = self._predict_base(X)
        self.metaclassifier.fit(X_meta, y)
    
    def _predict_base(self, X):
        yhats = []
        for clf in self.classifiers:
            yhat = clf.predict_proba(X)
            yhats.append(yhat)
        yhats = np.concatenate(yhats, axis=1)
        #print(yhats.shape)
        assert yhats.shape[0] == X.shape[0]
        return yhats
    
    def predict(self, X):
        X_meta = self._predict_base(X)
        yhat = self.metaclassifier.predict(X_meta)
        return yhat
    def score(self, X, y):
        yhat = self.predict(X)
        return accuracy_score(y, yhat)

In [2]:
data = []

top10 = preprocess("../2019S1-proj2-data/train-top10.csv");
top10val = preprocess("../2019S1-proj2-data/dev-top10.csv");
X10 = top10["data"]
y10 = top10["target"]
X10val = top10val["data"]
y10val = top10val["target"]

top50 = preprocess("../2019S1-proj2-data/train-top50.csv");
top50val = preprocess("../2019S1-proj2-data/dev-top50.csv");
X50 = top50["data"]
y50 = top50["target"]
X50val = top50val["data"]
y50val = top50val["target"]

top100 = preprocess("../2019S1-proj2-data/train-top100.csv");
top100val = preprocess("../2019S1-proj2-data/dev-top100.csv");
X100 = top100["data"]
y100 = top100["target"]
X100val = top100val["data"]
y100val = top100val["target"]

data.append((X10,y10,X10val,y10val));
data.append((X50,y50,X50val,y50val));
data.append((X100,y100,X100val,y100val));

top10proc = preprocess("../preprocessed/top10train.csv");
top10valproc = preprocess("../preprocessed/top10dev.csv");
X10proc = top10proc["data"]
y10proc = top10proc["target"]
X10valproc = top10valproc["data"]
y10valproc = top10valproc["target"]

top50proc = preprocess("../preprocessed/top50train.csv");
top50valproc = preprocess("../preprocessed/top50dev.csv");
X50proc = top50proc["data"]
y50proc = top50proc["target"]
X50valproc = top50valproc["data"]
y50valproc = top50valproc["target"]

top100proc = preprocess("../preprocessed/top100train.csv");
top100valproc = preprocess("../preprocessed/top100dev.csv");
X100proc = top100proc["data"]
y100proc = top100proc["target"]
X100valproc = top100valproc["data"]
y100valproc = top100valproc["target"]

data.append((X10proc,y10proc,X10valproc,y10valproc));
data.append((X50proc,y50proc,X50valproc,y50valproc));
data.append((X100proc,y100proc,X100valproc,y100valproc));

dataTitles = ['Top10',
              'Top50',
              'Top100',
              'Top10proc',
              'Top50proc',
              'Top100proc']

In [42]:
"""
KNN Models (model that will took considerably long time to run ~30 min)
"""
models = [KNeighborsClassifier(n_neighbors=1),
          KNeighborsClassifier(n_neighbors=5)]
titles = ['1-Nearest Neighbour',
          '5-Nearest Neighbour']

for ((X,y,Xval,yval),dataTitle) in zip(data,dataTitles):
    # timer on
    start = time.time()
    print("For",dataTitle);

    title_training_acc = {}
    title_validation_acc = {}
    for title, model in zip(titles, models):
        model.fit(X, y)
        title_training_acc[title] = model.score(X, y)
        title_validation_acc[title] = model.score(Xval, yval)
    
    end = time.time()
    print('Time elapsed:',end - start);

    for title in titles:
        print(title, ': Training Acc', title_training_acc[title], '; Validation Acc', title_validation_acc[title])
    
    print("");

For Top10
Time elapsed: 1346.61643242836
1-Nearest Neighbour : Training Acc 0.282651600170272 ; Validation Acc 0.2879997856147497
5-Nearest Neighbour : Training Acc 0.2916973027359622 ; Validation Acc 0.2953960767499196

For Top50
Time elapsed: 7286.453837156296
1-Nearest Neighbour : Training Acc 0.31150110289849464 ; Validation Acc 0.29097438096258976
5-Nearest Neighbour : Training Acc 0.32045973453039744 ; Validation Acc 0.30088969878872335

For Top100
Time elapsed: 13718.756433725357
1-Nearest Neighbour : Training Acc 0.32886691691498005 ; Validation Acc 0.2965751956265409
5-Nearest Neighbour : Training Acc 0.3384834178243876 ; Validation Acc 0.3061689355772323



In [None]:
"""
Decision Tree
"""

models = [DecisionTreeClassifier(max_depth=1),
          DecisionTreeClassifier(max_depth=5),
          DecisionTreeClassifier(max_depth=10),
          DecisionTreeClassifier(max_depth=None)]

titles = ['1-R',
          'Decision Tree depth 5',
          'Decision Tree depth 10',
          'Decision Tree']

for ((X,y,Xval,yval),dataTitle) in zip(data,dataTitles):
    # timer on
    start = time.time()
    print("For",dataTitle);

    title_training_acc = {}
    title_validation_acc = {}
    for title, model in zip(titles, models):
        model.fit(X, y)
        title_training_acc[title] = model.score(X, y)
        title_validation_acc[title] = model.score(Xval, yval)
    
    end = time.time()
    print('Time elapsed:',end - start);

    for title in titles:
        print(title, ': Training Acc', title_training_acc[title], '; Validation Acc', title_validation_acc[title])
    
    print("");

# title_crossvalidation_acc = {}
# for title, model in zip(titles, models):
#     title_crossvalidation_acc[title] = np.mean(cross_val_score(model, X, y, cv=10))

For Top10
Time elapsed: 2.6967573165893555
1-R : Training Acc 0.2581653186796177 ; Validation Acc 0.2625147389859578
Decision Tree depth 5 : Training Acc 0.2767017530281336 ; Validation Acc 0.283738878765141
Decision Tree depth 10 : Training Acc 0.2840834333036647 ; Validation Acc 0.28781219852074175
Decision Tree : Training Acc 0.2932839286405325 ; Validation Acc 0.2948333154678958

For Top50
Time elapsed: 31.114784002304077
1-R : Training Acc 0.2581653186796177 ; Validation Acc 0.2625147389859578
Decision Tree depth 5 : Training Acc 0.27672110212453077 ; Validation Acc 0.283738878765141
Decision Tree depth 10 : Training Acc 0.2842382260748423 ; Validation Acc 0.287597813270447
Decision Tree : Training Acc 0.33005688634340774 ; Validation Acc 0.30056812091328117

For Top100
Time elapsed: 84.6716206073761
1-R : Training Acc 0.2581653186796177 ; Validation Acc 0.2625147389859578
Decision Tree depth 5 : Training Acc 0.276740451220928 ; Validation Acc 0.283738878765141
Decision Tree depth

In [3]:
"""
Naive Bayes
"""
models = [GaussianNB(),
          MultinomialNB(alpha=1.0),
          MultinomialNB(alpha=0.5),
          MultinomialNB(alpha=0),
          BernoulliNB()]

titles = ['Gaussian Naive Bayes',
          'Multinomial Naive Bayes laplace smoothing',
          'Multinomial Naive Bayes 0.5 smoothing',
          'Multinomial Naive Bayes no smoothing',
          'Binomial Naive Bayes']

for ((X,y,Xval,yval),dataTitle) in zip(data,dataTitles):
    # timer on
    start = time.time()
    print("For",dataTitle);

    title_training_acc = {}
    title_validation_acc = {}
    for title, model in zip(titles, models):
        model.fit(X, y)
        title_training_acc[title] = model.score(X, y)
        title_validation_acc[title] = model.score(Xval, yval)
    
    end = time.time()
    print('Time elapsed:',end - start);

    for title in titles:
        print(title, ': Training Acc', title_training_acc[title], '; Validation Acc', title_validation_acc[title])
    
    print("");
        
# title_crossvalidation_acc = {}
# for title, model in zip(titles, models):
#     title_crossvalidation_acc[title] = np.mean(cross_val_score(model, X, y, cv=10))

For Top10


  'setting alpha = %.1e' % _ALPHA_MIN)


Time elapsed: 2.6655423641204834
Gaussian Naive Bayes : Training Acc 0.29154250996478465 ; Validation Acc 0.29480651731160895
Multinomial Naive Bayes laplace smoothing : Training Acc 0.29132966990441544 ; Validation Acc 0.29491370993675636
Multinomial Naive Bayes 0.5 smoothing : Training Acc 0.29132966990441544 ; Validation Acc 0.29491370993675636
Multinomial Naive Bayes no smoothing : Training Acc 0.29132966990441544 ; Validation Acc 0.29491370993675636
Binomial Naive Bayes : Training Acc 0.29136836809720984 ; Validation Acc 0.2948333154678958

For Top50


  'setting alpha = %.1e' % _ALPHA_MIN)


Time elapsed: 5.644241809844971
Gaussian Naive Bayes : Training Acc 0.3197728416082969 ; Validation Acc 0.29842426841033337
Multinomial Naive Bayes laplace smoothing : Training Acc 0.3195793506443249 ; Validation Acc 0.30134526744559975
Multinomial Naive Bayes 0.5 smoothing : Training Acc 0.3195890251925235 ; Validation Acc 0.30137206560188656
Multinomial Naive Bayes no smoothing : Training Acc 0.3195890251925235 ; Validation Acc 0.30134526744559975
Binomial Naive Bayes : Training Acc 0.3206919236871638 ; Validation Acc 0.3007021116947154

For Top100


  'setting alpha = %.1e' % _ALPHA_MIN)


Time elapsed: 9.311599254608154
Gaussian Naive Bayes : Training Acc 0.3355520297202121 ; Validation Acc 0.3014524600707471
Multinomial Naive Bayes laplace smoothing : Training Acc 0.33679037188963273 ; Validation Acc 0.3078304212670168
Multinomial Naive Bayes 0.5 smoothing : Training Acc 0.33679037188963273 ; Validation Acc 0.3078840175795905
Multinomial Naive Bayes no smoothing : Training Acc 0.3367806973414342 ; Validation Acc 0.3078840175795905
Binomial Naive Bayes : Training Acc 0.33762238303471226 ; Validation Acc 0.3070264765784114

For Top10proc


  'setting alpha = %.1e' % _ALPHA_MIN)


Time elapsed: 2.4691522121429443
Gaussian Naive Bayes : Training Acc 0.28697612321504584 ; Validation Acc 0.28746382248901275
Multinomial Naive Bayes laplace smoothing : Training Acc 0.3045354281955033 ; Validation Acc 0.3154946939650552
Multinomial Naive Bayes 0.5 smoothing : Training Acc 0.3045354281955033 ; Validation Acc 0.3154946939650552
Multinomial Naive Bayes no smoothing : Training Acc 0.3045451027437019 ; Validation Acc 0.31552149212134206
Binomial Naive Bayes : Training Acc 0.3044290081653187 ; Validation Acc 0.31525351055847356

For Top50proc


  'setting alpha = %.1e' % _ALPHA_MIN)


Time elapsed: 5.637939214706421
Gaussian Naive Bayes : Training Acc 0.31868929221005377 ; Validation Acc 0.2922338943080716
Multinomial Naive Bayes laplace smoothing : Training Acc 0.3255291977864634 ; Validation Acc 0.31691499624825814
Multinomial Naive Bayes 0.5 smoothing : Training Acc 0.3255775705274564 ; Validation Acc 0.31691499624825814
Multinomial Naive Bayes no smoothing : Training Acc 0.3255678959792578 ; Validation Acc 0.31691499624825814
Binomial Naive Bayes : Training Acc 0.3253744050152858 ; Validation Acc 0.31629863865366065

For Top100proc


  'setting alpha = %.1e' % _ALPHA_MIN)


Time elapsed: 38.083194732666016
Gaussian Naive Bayes : Training Acc 0.386981927943965 ; Validation Acc 0.28470361239146746
Multinomial Naive Bayes laplace smoothing : Training Acc 0.40314809798382417 ; Validation Acc 0.328009432951013
Multinomial Naive Bayes 0.5 smoothing : Training Acc 0.4035544290081653 ; Validation Acc 0.327821845857005
Multinomial Naive Bayes no smoothing : Training Acc 0.4042219728338687 ; Validation Acc 0.3278486440132919
Binomial Naive Bayes : Training Acc 0.4144092720869935 ; Validation Acc 0.3232393611319541



In [15]:
multiNBbagging = BaggingClassifier(base_estimator= MultinomialNB(), max_samples = 0.35, max_features = 0.9)
adaMNB = AdaBoostClassifier(base_estimator=MultinomialNB(), n_estimators=100)

models = [multiNBbagging, adaMNB]

titles = ['Multinomial Naive Bayes laplace smoothing', 'ADA MNB']


for ((X,y,Xval,yval),dataTitle) in zip(data,dataTitles):
    # timer on
    start = time.time()
    print("For",dataTitle);
    
    title_training_acc = {}
    title_validation_acc = {}
    for title, model in zip(titles, models):
        model.fit(X, y)
        title_training_acc[title] = model.score(X, y)
        title_validation_acc[title] = model.score(Xval, yval)
    
    end = time.time()
    print('Time elapsed:',end - start);

    for title in titles:
        print(title, ': Training Acc', title_training_acc[title], '; Validation Acc', title_validation_acc[title])
    
    print("");


For Top10
Time elapsed: 37.535595417022705
Multinomial Naive Bayes laplace smoothing : Training Acc 0.29007197863859757 ; Validation Acc 0.2930110408403902
ADA MNB : Training Acc 0.29149413722379164 ; Validation Acc 0.29464572837388786

For Top50
Time elapsed: 49.62624979019165
Multinomial Naive Bayes laplace smoothing : Training Acc 0.3198599125420843 ; Validation Acc 0.2985582591917676
ADA MNB : Training Acc 0.32021787082543246 ; Validation Acc 0.3001929467252653

For Top100
Time elapsed: 68.94897079467773
Multinomial Naive Bayes laplace smoothing : Training Acc 0.32501644673193764 ; Validation Acc 0.3038910922928503
ADA MNB : Training Acc 0.3350973259548779 ; Validation Acc 0.3061689355772323

For Top10proc
Time elapsed: 37.345722913742065
Multinomial Naive Bayes laplace smoothing : Training Acc 0.28488642080414844 ; Validation Acc 0.2939489763104298
ADA MNB : Training Acc 0.2907008242715065 ; Validation Acc 0.2935202058098403

For Top50proc
Time elapsed: 49.13231420516968
Multinomi

In [22]:
"""
Stacking Classifier
"""

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier 

multiNBbagging = BaggingClassifier(base_estimator= MultinomialNB(), max_samples = 0.5, max_features = 1.0)
adaMNB = AdaBoostClassifier(base_estimator=MultinomialNB())

classifiers = [RandomForestClassifier(),
               multiNBbagging]

meta_classifier = LogisticRegression(solver="saga")

RFMNBstacker = StackingClassifier(classifiers, meta_classifier)

classifiers = [adaMNB,
               multiNBbagging]

meta_classifier = LogisticRegression(solver="saga")

BMNBAMNBstacker = StackingClassifier(classifiers, meta_classifier)

models = [RFMNBstacker,
          BMNBAMNBstacker]

titles = ['RF & B MNB Stacker',
          'B MNB & A MNB Stacker']


for ((X,y,Xval,yval),dataTitle) in zip(data,dataTitles):
    # timer on
    start = time.time()
    print("For",dataTitle);

    title_training_acc = {}
    title_validation_acc = {}
    for title, model in zip(titles, models):
        model.fit(X, y)
        title_training_acc[title] = model.score(X, y)
        title_validation_acc[title] = model.score(Xval, yval)
    
    end = time.time()
    print('Time elapsed:',end - start);

    for title in titles:
        print(title, ': Training Acc', title_training_acc[title], '; Validation Acc', title_validation_acc[title])
    
    print("");


For Top10




Time elapsed: 34.51904821395874
RF & B MNB Stacker : Training Acc 0.29318718315854647 ; Validation Acc 0.2945921320613142
B MNB & A MNB Stacker : Training Acc 0.29129097171162105 ; Validation Acc 0.29486011362418263

For Top50




Time elapsed: 70.84763383865356
RF & B MNB Stacker : Training Acc 0.32977632444564836 ; Validation Acc 0.29858505734805446
B MNB & A MNB Stacker : Training Acc 0.3173638791068457 ; Validation Acc 0.29858505734805446

For Top100




Time elapsed: 127.01181626319885
RF & B MNB Stacker : Training Acc 0.3497542664757556 ; Validation Acc 0.3037035051988423
B MNB & A MNB Stacker : Training Acc 0.33639371541349017 ; Validation Acc 0.303757101511416

For Top10proc




Time elapsed: 34.05537700653076
RF & B MNB Stacker : Training Acc 0.2929646685499787 ; Validation Acc 0.3020420195090578
B MNB & A MNB Stacker : Training Acc 0.2921713555976936 ; Validation Acc 0.29772751634687533

For Top50proc




Time elapsed: 68.29262471199036
RF & B MNB Stacker : Training Acc 0.32822839673387255 ; Validation Acc 0.2986922499732018
B MNB & A MNB Stacker : Training Acc 0.3247939321233698 ; Validation Acc 0.29799549790974383

For Top100proc




Time elapsed: 239.94615721702576
RF & B MNB Stacker : Training Acc 0.3849986455632522 ; Validation Acc 0.308500375174188
B MNB & A MNB Stacker : Training Acc 0.3685035408846407 ; Validation Acc 0.30943831064422767



In [37]:
"""
Preprocess the test data
"""

top100testproc = preprocess("../preprocessed/top100test.csv");
X100testproc = top100testproc["data"]
y100testproc = top100testproc["target"]
top10testproc = preprocess("../preprocessed/top10test.csv");
X10testproc = top10testproc["data"]
y10testproc = top10testproc["target"]
top50testproc = preprocess("../preprocessed/top50test.csv");
X50testproc = top50testproc["data"]
y50testproc = top50testproc["target"]


In [None]:
X100proc
X100valproc
y100proc
y100valproc
ultimateX = []
ultimatey = []

for i in X100proc:
    ultimateX.append(i)
for i in X100valproc:
    ultimateX.append(i)
for i in y100proc:
    ultimatey.append(i)
for i in y100valproc:
    ultimatey.append(i)

In [40]:
"""
Choose the best model to use for submission
"""

multiNBbagging = BaggingClassifier(base_estimator= MultinomialNB(), max_samples = 0.35, max_features = 0.9, warm_start=True)

# classifiers = [RandomForestClassifier(),
#                multiNBbagging]

# meta_classifier = LogisticRegression(solver="saga")

# classifiers = [adaMNB,
#                multiNBbagging]

# meta_classifier = LogisticRegression(solver="saga")


# clf.fit(data_transformed, digits.target)

# model = LogisticRegression(penalty='elasticnet', solver='saga', multi_class='multinomial', max_iter=100, warm_start=True)

model = multiNBbagging

model.fit(X100proc,y100proc)

model.score(X100valproc, y100valproc)

0.33052845964197664

In [None]:
"""
Make the submission csv based on the best model
"""

header = ['Id','Class']

predictionArray = model.predict(X100testproc)
len(predictionArray)
with open('../submission.csv',"w+") as file:
    line = "{},{}".format(header[0],header[1])
    file.write(line)
    file.write('\n')
    

    index = 1
    for prediction in predictionArray:
        line = "3{},{}".format(index, prediction)
        file.write(line)
        file.write('\n')
        index += 1
