In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
dfDrugs = pd.read_csv('data/drugsComTest.csv')
dfDrugs.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10,28-Feb-12,22
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8,17-May-09,17
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9,29-Sep-17,3
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9,5-Mar-17,35
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9,22-Oct-15,4


In [3]:
tfidf = TfidfVectorizer()
matrix = tfidf.fit_transform(dfDrugs['review'])

In [4]:
tfidf.vocabulary_

{'039': 25,
 've': 32006,
 'tried': 30813,
 'few': 12590,
 'antidepressants': 3679,
 'over': 21666,
 'the': 29934,
 'years': 33366,
 'citalopram': 7081,
 'fluoxetine': 13043,
 'amitriptyline': 3373,
 'but': 6081,
 'none': 20696,
 'of': 21181,
 'those': 30049,
 'helped': 14815,
 'with': 33013,
 'my': 20065,
 'depression': 9439,
 'insomnia': 16330,
 'amp': 3429,
 'anxiety': 3738,
 'doctor': 10377,
 'suggested': 28919,
 'and': 3505,
 'changed': 6711,
 'me': 18821,
 'onto': 21325,
 '45mg': 1639,
 'mirtazapine': 19502,
 'this': 30029,
 'medicine': 18911,
 'has': 14604,
 'saved': 26333,
 'life': 17863,
 'thankfully': 29919,
 'have': 14633,
 'had': 14413,
 'no': 20665,
 'side': 27146,
 'effects': 11068,
 'especially': 11714,
 'most': 19843,
 'common': 7679,
 'weight': 32716,
 'gain': 13552,
 'actually': 2645,
 'lost': 18262,
 'alot': 3221,
 'still': 28487,
 'suicidal': 28925,
 'thoughts': 30059,
 'son': 27796,
 'crohn': 8613,
 'disease': 10087,
 'done': 10429,
 'very': 32110,
 'well': 32746,


In [8]:
matrix.shape

(53766, 33626)

In [9]:
len(tfidf.vocabulary_)

33626

In [10]:
row = 10
col = tfidf.vocabulary_['illegal']

print('Message: "%s"' % dfDrugs.loc[row, 'review'])
print('TF-IDF score: %f' % matrix[row, col])

Message: ""Holy Hell is exactly how I feel. I had been taking Brisdelle for 1.5 years. The hot flashes did indeed subside - however, the side affects of this medicine coupled with the fact Noven was acquired by YET another pharmaceutical company - YOU CAN&#039;T PLACE A REP IN THE AREA, DISTRIBUTE YOUR DRUGS, AND THEN FIRE HER-AND NOT REPLACE THEREFORE there is NO medicine or support here. You dumped this drug in the Dr&#039;s hands and walked away. After calling Sebula - you act like you don&#039;t even care. You have made it impossible to obtain this. I happen to think this is illegal.  I just decided to wean myself off this and Premarin. It has been nothing short of a nightmare. If you don&#039;t need this drug- DON&#039;T START. Seriously.""
TF-IDF score: 0.149015


## Interpretation 
if a term has a high TF-IDF score, its presence across a set of documents (e.g. SMS messages) is low, while its number of occurrences in a given document (e.g. a candidate SMS message under evaluation) is high. If a term has a low TF-IDF score, this is an indicator that it doesn't appear very frequently in a given document, occurs very frequently across the set of documents, or both. We can exploit this information to find terms that can distinguish a certain set of documents (e.g. spam) from a larger set of documents 

In [11]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier

In [12]:
sample = dfDrugs.sample(frac=0.3, random_state=0)  # Randomly subsample a quarter of the available data

X = sample['review']
y = sample['usefulCount']

In [None]:
KNeighborsClassifier().get_params()

In [None]:
pipeline = make_pipeline(
    TfidfVectorizer(stop_words='english'),
    KNeighborsClassifier()
)

# Build models for different values of n_neighbors (k), distance metric and weight scheme
parameters = {
    'kneighborsclassifier__n_neighbors': [2, 5],
    'kneighborsclassifier__metric': ['manhattan', 'euclidean'],
    'kneighborsclassifier__weights': [ 'distance']
}

# Use inner CV to select the best model
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)  # K = 5

clf = GridSearchCV(pipeline, parameters, cv=inner_cv, n_jobs=-1)  # n_jobs=-1 uses all available CPUs = faster
clf.fit(X, y)

# Use outer CV to evaluate the error of the best model
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)  # K = 10, doesn't have to be the same
y_pred = cross_val_predict(clf, X, y, cv=outer_cv)

print(classification_report(y, y_pred))  # Print the classification report