In [1]:
import numpy as np
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
with open('/train.txt','r') as medicalrecord:
    allAbstracts = medicalrecord.readlines()
    
with open('/test.txt','r') as testrecord:
    tAbstracts = testrecord.readlines()

In [3]:
cls = []
records = []

for x, value in np.ndenumerate(allAbstracts):
    cls.append(value[0][0])
    records.append(value[2:])

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(records, cls, test_size=0.3, random_state=10)


def build_analyzer(sentence):
    sentence = sentence.lower()
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+-?[0-9&/\%@]*\w+(?:[-]\w+)*')
    tokens = tokenizer.tokenize(sentence)
    tagged = nltk.pos_tag(tokens)
    posag = [s for s in tagged if s[1] != 'IN' if s[1] != 'CC' if s[1] != 'DT']
    filtered_words = []
    for tagged_word in posag:
        d = tagged_word[0]
        if not d in stopwords.words('english'):
            if len(d) >= 4:
                filtered_words.append(d)
    return filtered_words

In [5]:
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfTransformer
tokenizer = RegexpTokenizer(r'[a-zA-Z]+-?[0-9&/\%@]*\w+(?:[-]\w+)*')

In [6]:
from sklearn.linear_model import SGDClassifier
X_train_counts = CountVectorizer(analyzer=u'word', tokenizer = tokenizer.tokenize, min_df = 0.00015, max_df = 0.5,
                                 ngram_range=(1, 4),stop_words='english').fit(X_train)

In [7]:
myPreprocessed = X_train_counts.transform(X_train)

In [171]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(myPreprocessed)
X_train_tf = tf_transformer.transform(myPreprocessed)

In [172]:
from sklearn.linear_model import SGDClassifier
sd = SGDClassifier(loss='hinge', penalty='l2',alpha=0.00061, random_state=42, max_iter=10)

In [173]:
sd.partial_fit(X_train_tf, np.array(y_train))

SGDClassifier(alpha=0.00061, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=10, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [174]:
# for x, value in enumerate(X_test):
review_counts= X_train_counts.transform(X_test)
review_transformed = tf_transformer.transform(review_counts)

In [175]:
predicted = sd.predict(review_transformed)
print(np.mean(predicted == y_test))
print(classification_report(y_test, predicted))

0.612650046168
             precision    recall  f1-score   support

          1       0.66      0.87      0.75       950
          2       0.56      0.43      0.49       445
          3       0.54      0.42      0.48       542
          4       0.64      0.82      0.72       930
          5       0.57      0.44      0.49      1465

avg / total       0.60      0.61      0.60      4332



In [36]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
import time
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

svd = TruncatedSVD(n_components=1000, n_iter=10, random_state=42)

vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,
                             min_df=2, stop_words='english',
                             use_idf=True)



# Build the tfidf vectorizer from the training data ("fit"), and apply it 
# ("transform").
X_train_tfidf = vectorizer.fit_transform(X_train)

print("  Actual number of tfidf features: %d" % X_train_tfidf.get_shape()[1])

print("\nPerforming dimensionality reduction using LSA")
t0 = time.time()
lsa = make_pipeline(svd, Normalizer(copy=False))

# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

print("  done in %.3fsec" % (time.time() - t0))

explained_variance = svd.explained_variance_ratio_.sum()
print("  Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

X_test_tfidf = vectorizer.transform(X_test)
X_test_lsa = lsa.transform(X_test_tfidf)


###############################################################################
#  Run classification of the test articles
###############################################################################

print("\nClassifying tfidf vectors...")

# Time this step.
t0 = time.time()


svg_tfidf = SGDClassifier(loss='hinge', penalty='l2',alpha=0.0007, random_state=42, max_iter=15)
svg_tfidf.fit(X_train_tfidf, y_train)

# Classify the test vectors.
p = svg_tfidf.predict(X_test_tfidf)

# Measure accuracy
numRight = 0;
for i in range(0,len(p)):
    if p[i] == y_test[i]:
        numRight += 1

print("  (%d / %d) correct - %.2f%%" % (numRight, len(y_test), float(numRight) / float(len(y_test)) * 100.0))

# Calculate the elapsed time (in seconds)
elapsed = (time.time() - t0)
print("  done in %.3fsec" % elapsed)


print("\nClassifying LSA vectors...")

# Time this step.
t0 = time.time()

svg_lsa = SGDClassifier(loss='hinge', penalty='l2',alpha=0.0007, random_state=42, max_iter=15)
svg_lsa.fit(X_train_lsa, y_train)

# Classify the test vectors.
p = svg_lsa.predict(X_test_lsa)

# Measure accuracy
numRight = 0;
for i in range(0,len(p)):
    if p[i] == y_test[i]:
        numRight += 1
        
print(classification_report(y_test, p))

print("  (%d / %d) correct - %.2f%%" % (numRight, len(y_test), float(numRight) / float(len(y_test)) * 100.0))

# Calculate the elapsed time (in seconds)
elapsed = (time.time() - t0)    
print("    done in %.3fsec" % elapsed)

  Actual number of tfidf features: 10000

Performing dimensionality reduction using LSA
  done in 40.460sec
  Explained variance of the SVD step: 56%

Classifying tfidf vectors...
  (2666 / 4332) correct - 61.54%
  done in 0.336sec

Classifying LSA vectors...
             precision    recall  f1-score   support

          1       0.71      0.84      0.77       950
          2       0.61      0.53      0.56       445
          3       0.58      0.45      0.51       542
          4       0.68      0.79      0.73       930
          5       0.58      0.52      0.55      1465

avg / total       0.63      0.64      0.63      4332

  (2778 / 4332) correct - 64.13%
    done in 1.480sec


In [None]:
testrecords = []

for x, value in np.ndenumerate(tAbstracts):
    testrecords.append(value[0:])
    
sd.fit(X_train_tf, np.array(cls))

In [47]:
d = []
for x, value in enumerate(testrecords):
    review_counts= X_train_counts.transform([value])
    review_transformed = tf_transformer.transform(review_counts)
    d.append(sd.predict(review_transformed)[0])

In [48]:
np.savetxt('output.dat', d, delimiter=" ", fmt="%s")