In [41]:
#Best Library for Regression

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import os

%matplotlib inline

plt.rcParams['figure.figsize'] = [15, 8]
plt.style.use("fivethirtyeight")

import warnings
warnings.simplefilter('ignore')

pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, KFold

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, train_test_split, TimeSeriesSplit
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import log_loss, accuracy_score
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


# Regression
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, Lasso
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LogisticRegression

import xgboost as xgb
import lightgbm as lgb
import catboost as cat

In [42]:
import keras
import nltk
import pandas as pd
import numpy as np
import re
import codecs

In [43]:
def metric(y,y0):
    assert len(y)==len(y0)
    return 100-np.sqrt(np.mean(np.power(np.log1p(y)-np.log1p(y0),2)))

def metric_lgb(y_pred,data):
    y_true = data.get_label()
    result = metric(y_true,y_pred)
    
    return '100-rmse', result,True

In [44]:
train_1 = pd.read_csv("train.csv")
test_1 = pd.read_csv("test.csv")

# print("Train Shape : {}\nTest Shape: {}\nSub Shape: {}".format(train_1.shape, test_1.shape))

In [45]:
train = train_1.copy()
test = test_1.copy()

In [46]:
train.nunique()

patient_id                    32165
name_of_drug                   2220
use_case_for_drug               636
review_by_patient             30121
effectiveness_rating             10
drug_approved_by_UIC           3537
number_of_times_prescribed      303
base_score                     1383
dtype: int64

In [47]:
del train['patient_id']

In [48]:
train.head(3)

Unnamed: 0,name_of_drug,use_case_for_drug,review_by_patient,effectiveness_rating,drug_approved_by_UIC,number_of_times_prescribed,base_score
0,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,8.022969
1,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,7.858458
2,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,6.341969


In [49]:
# count=0
# li=[]
# for i in range(0,len(train['base_score'])+1):
#     if train['number_of_times_prescribed'][i]>50:
# #         print(f"{train['base_score'][i]}:::{train['review_by_patient'][i]}:::{train['number_of_times_prescribed'][i]}")
#         print(f"{train['base_score'][i]}")
#         print("===========================================")
#         if(train['effectiveness_rating'][i]<=8):
#               count+=1

In [50]:
# feartures = train.columns.to_list()
# feartures.remove('review_by_patient')
# for i in feartures:
#     plt.scatter(train[i],train['base_score'])
#     plt.xlabel(xlabel=i)
#     plt.show()

## Standardize

In [51]:
def standardize_test(df,text_field):
    df[text_field] = df[text_field].str.replace(r"http\S+", "")
    df[text_field] = df[text_field].str.replace(r"http", "")
    df[text_field] = df[text_field].str.replace(r"@\S+", "")
    df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")
    df[text_field] = df[text_field].str.replace(r"@", "at")
    df[text_field] = df[text_field].str.lower()
    return df[text_field]
train['review_by_patient'] = standardize_test(train,'review_by_patient')
# train.head()

In [52]:
tfidf = TfidfVectorizer()
x= tfidf.fit_transform(train['review_by_patient'])

## Stemming
### Stemmer-> reduce word froms to proper class 
<p>Politicion,policy->politics</p>

In [60]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
train['stemmed'] = train.review_by_patient.map(lambda x: ' '.join([stemmer.stem(y) for y in x.split(' ')]))
train.head()

Unnamed: 0,name_of_drug,use_case_for_drug,review_by_patient,effectiveness_rating,drug_approved_by_UIC,number_of_times_prescribed,base_score,stemmed
0,Valsartan,Left Ventricular Dysfunction,"""it has no side effect, i take it in combinati...",9,20-May-12,27,8.022969,"""it has no side effect, i take it in combin of..."
1,Guanfacine,ADHD,"""my son is halfway through his fourth week of ...",8,27-Apr-10,192,7.858458,"""mi son is halfway through his fourth week of ..."
2,Lybrel,Birth Control,"""i used to take another oral contraceptive, wh...",5,14-Dec-09,17,6.341969,"""i use to take anoth oral contraceptive, which..."
3,Buprenorphine / naloxone,Opiate Dependence,"""suboxone has completely turned my life around...",9,27-Nov-16,37,6.590176,"""suboxon has complet turn my life around i f..."
4,Cialis,Benign Prostatic Hyperplasia,"""2nd day on 5mg started to work with rock hard...",2,28-Nov-15,43,6.144782,"""2nd day on 5mg start to work with rock hard e..."


#### Scikit-learn provides two methods to get to our end result (a tf-idf weight matrix). One is a two-part process of using the CountVectorizer class to count how many times each term shows up in each document, followed by the TfidfTransformer class generating the weight matrix. The other does both steps in a single TfidfVectorizer class.

In [66]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

cvec = CountVectorizer(stop_words='english', min_df=0.0025, max_df=.5, ngram_range=(1,2))
cvec

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.5, max_features=None, min_df=0.0025,
                ngram_range=(1, 2), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [67]:
# Calculate all the n-grams found in all documents
from itertools import islice
cvec.fit(train.stemmed)
list(islice(cvec.vocabulary_.items(), 10))

[('effect', 812),
 ('combin', 547),
 ('mg', 1435),
 ('oil', 1597),
 ('mi', 1439),
 ('son', 2055),
 ('fourth', 984),
 ('week', 2433),
 ('becam', 379),
 ('concern', 561)]

In [68]:
# Check how many total n-grams we have
len(cvec.vocabulary_)

2566

In [69]:
cvec_counts = cvec.transform(train.stemmed)
print('sparse matrix shape:', cvec_counts.shape)
print('nonzero count:', cvec_counts.nnz)
print('sparsity: %.2f%%' % (100.0 * cvec_counts.nnz / (cvec_counts.shape[0] * cvec_counts.shape[1])))

sparse matrix shape: (32165, 2566)
nonzero count: 1096806
sparsity: 1.33%


In [71]:
#Let’s look at the top 20 most common terms
occ = np.asarray(cvec_counts.sum(axis=0)).ravel().tolist()
counts_df = pd.DataFrame({'term': cvec.get_feature_names(), 'occurrences': occ})
counts_df.sort_values(by='occurrences', ascending=False).head(10)

Unnamed: 0,term,occurrences
635,day,18751
1477,month,13250
2534,year,12352
812,effect,11948
2490,work,11824
2082,start,11491
2433,week,11264
1663,pain,10692
921,feel,10599
2228,time,9942


In [72]:
#Now that we’ve got term counts for each document we can use the TfidfTransformer to calculate the weights for each term in each document
transformer = TfidfTransformer()
transformed_weights = transformer.fit_transform(cvec_counts)
transformed_weights

<32165x2566 sparse matrix of type '<class 'numpy.float64'>'
	with 1096806 stored elements in Compressed Sparse Row format>

In [74]:
#And we can take a look at the top 20 terms by average tf-idf weight:
weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': cvec.get_feature_names(), 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(10)

Unnamed: 0,term,weight
635,day,0.035365
2490,work,0.029706
1663,pain,0.028515
812,effect,0.028091
1477,month,0.027941
2534,year,0.02676
2433,week,0.025064
921,feel,0.024512
2082,start,0.024467
2362,ve,0.022624
