In [1]:
import pandas as pd
import numpy as np
import sys
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from util import preprocess, convert_class
import pickle
from sklearn.feature_selection import SelectPercentile

[nltk_data] Downloading package wordnet to /home/rajat499/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rajat499/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
if __name__ == "__main__":
    
    filename = sys.argv[1]
    model_path = sys.argv[2]
    
    train = pd.read_csv(filename)
    train = preprocess(train, 'Subject')
    train = preprocess(train, "Content")
    df = convert_class(train)
    
    vectorizer = TfidfVectorizer().fit(df["Subject"] + " " + df["Content"])
    X_train = vectorizer.transform(df["Subject"]+" "+df["Content"])
    
    vect_path = "vectorizer.pkl"  
    with open(vect_path, 'wb') as file:  
        pickle.dump(vectorizer, file)
        
    clf = BaggingClassifier(base_estimator=SGDClassifier(), random_state=3, n_estimators=12, n_jobs=-3)
    clf = clf.fit(X_train, df.Class)
    
    pred = clf.predict(X_train)
    mat = confusion_matrix(pred, df.Class)
    total = 0
    for i in range(mat.shape[0]):
        total += mat[i][i]/sum(mat[i])

    print("Micro Accuraccy: ", total/mat.shape[0]) 
    print("Macro Accuracy: ", np.mean(pred == df.Class))
    
    with open(model_path, 'wb') as file:  
        pickle.dump(clf, file)

In [9]:
train = pd.read_csv("train.csv")
train = preprocess(train, 'Subject')
train = preprocess(train, "Content")
df = convert_class(train)

val = pd.read_csv("val.csv")
val = preprocess(val, "Subject")
val = preprocess(val, "Content")
val = convert_class(val)

vectorizer = TfidfVectorizer(ngram_range=(1, 3)).fit(df["Subject"] + " " + df["Content"])
X_train = vectorizer.transform(df["Subject"]+" "+df["Content"])

acc = {}
for p in range(1, 101, 1):
    select = SelectPercentile(percentile=p)
    select.fit(X_train, df.Class)
    tr = select.transform(X_train)

    clf = BaggingClassifier(base_estimator=SGDClassifier(), random_state=3, n_estimators=12, n_jobs=-3)
    clf = clf.fit(tr, df.Class)

    X_val = select.transform(vectorizer.transform(val["Subject"]+" "+val["Content"]))

    pred = clf.predict(X_val)
    mat = confusion_matrix(pred, val.Class)
    total = 0
    for i in range(mat.shape[0]):
        total += mat[i][i]/sum(mat[i])

    print(p, "Micro Accuraccy: ", total/mat.shape[0], "Macro Accuracy: ", np.mean(pred == val.Class))
    acc[p] = (total/mat.shape[0] + np.mean(pred == val.Class))/2

1 Micro Accuraccy:  0.8746211250457631 Macro Accuracy:  0.8097345132743363
2 Micro Accuraccy:  0.8152716480395815 Macro Accuracy:  0.8230088495575221
3 Micro Accuraccy:  0.8368728041390907 Macro Accuracy:  0.8362831858407079
4 Micro Accuraccy:  0.8601392071980306 Macro Accuracy:  0.8451327433628318
5 Micro Accuraccy:  0.8620629370629371 Macro Accuracy:  0.8407079646017699
6 Micro Accuraccy:  0.8596505758889197 Macro Accuracy:  0.8407079646017699
7 Micro Accuraccy:  0.8444533998398168 Macro Accuracy:  0.831858407079646
8 Micro Accuraccy:  0.8429364907268354 Macro Accuracy:  0.827433628318584
9 Micro Accuraccy:  0.8542785398122478 Macro Accuracy:  0.831858407079646
10 Micro Accuraccy:  0.8466895658062229 Macro Accuracy:  0.831858407079646
11 Micro Accuraccy:  0.8422151618872931 Macro Accuracy:  0.827433628318584
12 Micro Accuraccy:  0.8457209457209457 Macro Accuracy:  0.827433628318584
13 Micro Accuraccy:  0.8354919556956908 Macro Accuracy:  0.8097345132743363
14 Micro Accuraccy:  0.8403

In [10]:
sorted(acc.items(), key=lambda x: x[1])

[(42, 0.8038899575012854),
 (56, 0.805176854632728),
 (60, 0.8110451029167596),
 (62, 0.8124384012178028),
 (64, 0.8129906203400318),
 (61, 0.8139906505148974),
 (41, 0.8141779442460626),
 (54, 0.8152922722802288),
 (44, 0.815825816982645),
 (45, 0.815825816982645),
 (68, 0.8185364301604452),
 (55, 0.8186224764837364),
 (52, 0.8190990119013393),
 (2, 0.8191402487985517),
 (53, 0.8192266678700635),
 (46, 0.8201738484988181),
 (43, 0.8214175985668455),
 (48, 0.8214175985668455),
 (49, 0.8214175985668455),
 (59, 0.8214175985668455),
 (66, 0.8214175985668455),
 (71, 0.8214175985668455),
 (69, 0.8220341968808121),
 (15, 0.822386159827347),
 (13, 0.8226132344850136),
 (30, 0.8232279484946994),
 (47, 0.8242825841634505),
 (50, 0.8243670003145434),
 (51, 0.8243670003145434),
 (63, 0.8243670003145434),
 (65, 0.8243670003145434),
 (73, 0.8243670003145434),
 (74, 0.8243670003145434),
 (75, 0.8243670003145434),
 (76, 0.8243670003145434),
 (77, 0.8243670003145434),
 (78, 0.8243670003145434),
 (79, 