<a href="https://colab.research.google.com/github/praneethrampur/NEWSBLOG/blob/master/DrugClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Reading the dataset**

In [0]:
import pandas as pd
df=pd.read_csv('drugclassify.csv')

In [0]:
df.drop(columns='unique_hash',inplace=True)

In [0]:
y=df['sentiment']
df.drop(columns=['sentiment','drug'],inplace=True)

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

**Preprocessing**:
1.Removing all regular expressions
2.Tokenization
3.Stop word Removal
4.Stemmatization/Lemmatization


In [0]:
pd.set_option('display.width', 1000)
rx_pat = r"(\\r)|(\\n)|(\\t)|(\\f)|(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(&#039;@)|(\d\s)|(\d)|(\/)"
rx_pat_wSpace = r"(\-)|(\\)|(\s{2,})"
    
df['text'].replace(regex=True,inplace=True,to_replace=rx_pat, value=r'')
df['text'].replace(regex=True,inplace=True,to_replace=rx_pat_wSpace, value=r' ')
df.text.head(5)

0    Autoimmune diseases tend to come in clusters A...
1    I can completely understand why you’d want to ...
2    Interesting that it only targets SP receptors ...
3    Very interesting grand merci Now I wonder wher...
4    Hi everybody My latest MRI results for Brain a...
Name: text, dtype: object

In [0]:
df

Unnamed: 0,text
0,Autoimmune diseases tend to come in clusters A...
1,I can completely understand why you’d want to ...
2,Interesting that it only targets SP receptors ...
3,Very interesting grand merci Now I wonder wher...
4,Hi everybody My latest MRI results for Brain a...
...,...
5274,Hi Bee Thanks for the update and the good news...
5275,Have you had blood testing done to check your ...
5276,All the best to your husband and family
5277,Hi bazza luckily my eyes arent so badly affect...


In [0]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()
stemmer=PorterStemmer()


def preprocessing(review):
    
    final_tokens=' '
    tokens=tokenizer.tokenize(review)
    #print('Tokens:',tokens)
    pure_tokens=[token.lower() for token in tokens if token.lower() not in stopwords.words('english')]
    #print('Pure Tokens:',pure_tokens)
    stemmed_tokens=[stemmer.stem(pure_token) for pure_token in pure_tokens]
    
    final_tokens=final_tokens.join(stemmed_tokens)
    
    return final_tokens


preprocessing('I was eating my breakfast when you were playing')
df['cleaned_text']=df['text'].apply(preprocessing)
df['cleaned_text']

0       autoimmun diseas tend come cluster gilenya fee...
1       complet understand want tri result report lect...
2       interest target sp receptor rather like fingol...
3       interest grand merci wonder lemtrada ocrevu sa...
4       hi everybodi latest mri result brain cervic co...
                              ...                        
5274    hi bee thank updat good news scan hard say eff...
5275    blood test done check level humira trough dose...
5276                                  best husband famili
5277    hi bazza luckili eye arent badli affect get he...
5278    well ms appear mild number year relaps take co...
Name: cleaned_text, Length: 5279, dtype: object

In [0]:
df.drop(columns='text',inplace=True)

In [0]:
df

Unnamed: 0,cleaned_text
0,autoimmun diseas tend come cluster gilenya fee...
1,complet understand want tri result report lect...
2,interest target sp receptor rather like fingol...
3,interest grand merci wonder lemtrada ocrevu sa...
4,hi everybodi latest mri result brain cervic co...
...,...
5274,hi bee thank updat good news scan hard say eff...
5275,blood test done check level humira trough dose...
5276,best husband famili
5277,hi bazza luckili eye arent badli affect get he...


**Vectorization**

In [0]:
# VECTORIZE IT (One Hot Encode It)
# Each word becomes one feature (column)
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(binary=True)
cv.fit(df['cleaned_text'])

# Define my X & create my matrix with n things and n features

X = cv.transform(df['cleaned_text'])



**Model Building**

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size = 0.7)
X_val.dtype

dtype('int64')

In [0]:
log=LogisticRegression()
log.fit(X_train,y_train)
log_predict=log.predict(X_val)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [0]:
compare=pd.DataFrame({'actuals':y_val,'predicts':log_predict})
compare

Unnamed: 0,actuals,predicts
200,2,2
4611,0,2
290,0,2
3080,2,1
1796,0,1
...,...,...
1844,2,0
2548,0,2
1941,2,2
901,2,2


In [0]:
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,roc_auc_score,roc_curve
confusion_matrix(y_val,log_predict)
print('accuarcy_score',accuracy_score(y_val,log_predict))
print('recall',recall_score(y_val,log_predict, pos_label='positive',average='weighted'))

print('precision:',precision_score(y_val,log_predict, pos_label='positive',average='weighted'))
#precision_score(y_test, y_pred, average=None)

accuarcy_score 0.6786616161616161
recall 0.6786616161616161
precision: 0.6567392968229612




In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
random=RandomForestClassifier()
random.fit(X_train,y_train)
random_predict=random.predict(X_val)


In [0]:
confusion_matrix(y_val,log_predict)
print('accuarcy_score',accuracy_score(y_val,random_predict))
print('recall',recall_score(y_val,random_predict, pos_label='positive',average='weighted'))

print('precision:',precision_score(y_val,random_predict, pos_label='positive',average='weighted'))
#precision_score(y_test, y_pred, average=None)

accuarcy_score 0.7367424242424242
recall 0.7367424242424242
precision: 0.6699070065736732




In [0]:
from sklearn.ensemble import AdaBoostClassifier
ada=AdaBoostClassifier()
ada.fit(X_train,y_train)
ada_predict=ada.predict(X_val)

In [0]:
confusion_matrix(y_val,log_predict)
print('accuarcy_score',accuracy_score(y_val,ada_predict))
print('recall',recall_score(y_val,ada_predict, pos_label='positive',average='weighted'))

print('precision:',precision_score(y_val,ada_predict, pos_label='positive',average='weighted'))

accuarcy_score 0.7241161616161617
recall 0.7241161616161617
precision: 0.6330432072205794




In [0]:
from sklearn.ensemble import GradientBoostingClassifier
gradient=GradientBoostingClassifier()
gradient.fit(X_train,y_train)
gradient_predict=gradient.predict(X_val)

In [0]:
confusion_matrix(y_val,log_predict)
print('accuarcy_score',accuracy_score(y_val,gradient_predict))
print('recall',recall_score(y_val,gradient_predict, pos_label='positive',average='weighted'))

print('precision:',precision_score(y_val,gradient_predict, pos_label='positive',average='weighted'))

accuarcy_score 0.7380050505050505
recall 0.7380050505050505
precision: 0.6811443879163889




In [0]:
from sklearn.tree import DecisionTreeClassifier
tree=DecisionTreeClassifier()
tree.fit(X_train,y_train)
tree_predict=tree.predict(X_val)

In [0]:
confusion_matrix(y_val,log_predict)
print('accuarcy_score',accuracy_score(y_val,tree_predict))
print('recall',recall_score(y_val,tree_predict, pos_label='positive',average='weighted'))

print('precision:',precision_score(y_val,tree_predict, pos_label='positive',average='weighted'))

accuarcy_score 0.6275252525252525
recall 0.6275252525252525
precision: 0.6099100596200978




In [0]:
from sklearn.svm import LinearSVC
svm=LinearSVC()
svm.fit(X_train,y_train)
svm_predict=svm.predict(X_val)



In [0]:
confusion_matrix(y_val,log_predict)
print('accuarcy_score',accuracy_score(y_val,svm_predict))
print('recall',recall_score(y_val,svm_predict, pos_label='positive',average='weighted'))

print('precision:',precision_score(y_val,svm_predict, pos_label='positive',average='weighted'))

accuarcy_score 0.6376262626262627
recall 0.6376262626262627
precision: 0.6427130355888421




In [0]:
import time
xgb = XGBClassifier(n_estimators=100)
training_start = time.perf_counter()
xgb.fit(X_train, y_train)
training_end = time.perf_counter()
prediction_start = time.perf_counter()
preds = xgb.predict(X_val)
prediction_end = time.perf_counter()
acc_xgb = (preds == y_val).sum().astype(float) / len(preds)
xgb_train_time = training_end-training_start
xgb_prediction_time = prediction_end-prediction_start
print("XGBoost's prediction accuracy is: %3.2f" % (acc_xgb))
print("Time consumed for training: %4.3f" % (xgb_train_time))
print("Time consumed for prediction: %6.5f seconds" % (xgb_prediction_time))

XGBoost's prediction accuracy is: 0.73
Time consumed for training: 11.178
Time consumed for prediction: 0.06424 seconds


In [0]:
results = pd.DataFrame({
    'Model': ['LogisticRegression', 'Random Forest','Adaboosting', 'Gradient boosting','Xg_Boost'],
    'Score': [accuracy_score(y_val,log_predict), accuracy_score(y_val,random_predict),accuracy_score(y_val,ada_predict), 
              accuracy_score(y_val,gradient_predict),acc_xgb]})
    
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Model')
result_df

Unnamed: 0_level_0,Score
Model,Unnamed: 1_level_1
Gradient boosting,0.738005
Random Forest,0.736742
Xg_Boost,0.733586
Adaboosting,0.724116
LogisticRegression,0.678662
