#### Imports

In [None]:
import sys
import os
import re
import pandas as pd
import numpy as np
import os.path as path
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from scipy.stats import itemfreq
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction import stop_words
from bs4 import BeautifulSoup
from wordcloud import WordCloud, STOPWORDS
from matplotlib import pyplot as plt

In [None]:
# Notebook-wide matplotlib defaults (fonts, figure size, tick/axis/legend text, grid).
plt.rc('font', family='serif', size=10)
plt.rc('figure', figsize=(15, 8))
# Both tick groups share the same label size, so they are set in one call.
plt.rc(('xtick', 'ytick'), labelsize=18)
# All axes-related defaults gathered in a single call.
plt.rc(
    'axes',
    titlesize=32,
    labelsize=25,
    titlepad=15,
    labelpad=15,
    grid=True,
    titleweight='normal',
    labelweight='normal',
)
plt.rc('legend', fontsize=18)
plt.rc('grid', linestyle='dashed', linewidth=0.5)

#### Data Read

In [None]:
# Resolve the ``Data`` directory that sits in the parent of the current working
# directory.  The components are joined with os.path.join so the code no longer
# relies on the Windows-only "\" separator or on the invalid escape sequences
# "\D" / "\d" that the hand-written path strings contained.
og_location = os.path.abspath(os.path.realpath(os.path.join(os.getcwd(), '..', 'Data')))
# reading train and test files
Train = pd.read_csv(os.path.join(og_location, "drugsComTrain_raw.csv"), encoding="utf-8")
Test = pd.read_csv(os.path.join(og_location, "drugsComTest_raw.csv"), encoding="utf-8")

#### Pre-processing

In [None]:
# Pre-process train and test together: stack them with a fresh index, then
# discard rows containing missing values and exact duplicate rows.
merged_data = (
    pd.concat([Train, Test], ignore_index=True)
    .dropna(axis=0)
    .drop_duplicates()
)
# Rows whose 'condition' text contains a '</span>' fragment are treated as
# useless: collect them, then drop them by index.
span_data = merged_data[
    merged_data['condition'].str.contains('</span>', case=False, regex=True) == True
]
merged_data = merged_data.drop(span_data.index, axis=0)

In [None]:
# function to remove html characters from the data
def remove_html(raw_review):
    """Return the text content of ``raw_review`` with HTML markup removed.

    The input is parsed with BeautifulSoup's ``'html.parser'`` and only the
    text returned by ``get_text()`` is kept.

    Parameters
    ----------
    raw_review
        Markup to parse; the notebook applies this function to every entry of
        the ``'review'`` column.

    Returns
    -------
    str
        The text extracted from the parsed document.
    """
    # get_text() already returns a single string, so joining its characters
    # with '' (as was done before) only produced a redundant copy.
    return BeautifulSoup(raw_review, 'html.parser').get_text()

In [None]:
# 1. strip HTML markup from every review
review_text = merged_data['review'].apply(remove_html)
# 2. replace every run of non-word characters (punctuation, spaces, "\n", "\r",
#    "\t", ...) with one space.
#    This single pass produces the same text as the former three passes
#    (non-word character -> space, collapse repeated whitespace, delete
#    newline/tab delimiters): once each non-word character is a space, the only
#    whitespace left is the plain space, so the delimiter removal never matched.
#    The raw-string pattern also removes the invalid "\s" escape that the old
#    non-raw pattern relied on.
review_text = review_text.map(lambda x: re.sub(r'\W+', ' ', x))
# 3. array of cleaned reviews used as model input
X = review_text.values

#### Rating Distribution

In [None]:
# Normalised histogram of the ratings.  ``density=True`` replaces the ``normed``
# keyword, which current matplotlib releases no longer accept.
plt.hist(merged_data['rating'], density=True)
plt.show()

In [None]:
# Percentage distribution of data 
dist = pd.DataFrame(round(merged_data.groupby(['rating']).count()['uniqueID']/len(merged_data) * 100,3))

In [None]:
dist.columns = ['perc']

In [None]:
dist

#### Skewed data check for rating and creating target labels

In [None]:
# Collapse the rating into three target labels, one replacement step per label:
#   rating < 3      -> -1
#   2 < rating < 9  ->  0
#   rating > 8      ->  1
y = merged_data['rating'].mask(merged_data['rating'] < 3, -1)
y = y.mask((y > 2) & (y < 9), 0)
y = y.mask(y > 8, 1).values

In [None]:
# Hold out 25% of the samples for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

#### Min_df / Max_df value selection

In [None]:
def min_vectorizers():
    """Return the vocabulary size for each ``min_df`` value from 1 to 10.

    For every threshold a binary, lower-casing CountVectorizer with the
    built-in English stop-word list is fitted on the module-level ``X_train``.

    Returns
    -------
    list of int
        Vocabulary sizes for ``min_df`` = 1, 2, ..., 10, in that order.
    """
    def vocabulary_size(threshold):
        vectorizer = CountVectorizer(
            encoding='latin-1',
            lowercase=True,
            binary=True,
            min_df=threshold,
            stop_words='english',
        )
        # Number of columns of the document-term matrix == number of terms kept.
        return vectorizer.fit_transform(X_train).shape[1]

    return [vocabulary_size(threshold) for threshold in range(1, 11)]

def max_vectorizers():
    """Return the vocabulary size for each ``max_df`` value from 0.1 to 1.0.

    For every step ``i`` in 1..10 a binary, lower-casing CountVectorizer with
    the built-in English stop-word list and ``max_df = i/10`` is fitted on the
    module-level ``X_train``.

    Returns
    -------
    list of int
        Vocabulary sizes for ``max_df`` = 0.1, 0.2, ..., 1.0, in that order.
    """
    def vocabulary_size(step):
        vectorizer = CountVectorizer(
            encoding='latin-1',
            lowercase=True,
            binary=True,
            max_df=step/10,
            stop_words='english',
        )
        # Number of columns of the document-term matrix == number of terms kept.
        return vectorizer.fit_transform(X_train).shape[1]

    return [vocabulary_size(step) for step in range(1, 11)]

In [None]:
max_df = max_vectorizers()
min_df = min_vectorizers()

In [None]:
plt.plot(np.linspace(10,100,10), max_df)
plt.ylabel('Vocabulary size')
plt.xlabel('Max_df %')
plt.show()

In [None]:
plt.plot(np.linspace(1,10,10), min_df)
plt.ylabel('Vocabulary size')
plt.xlabel('Min_DF')
plt.show()

In [None]:
# Binary bag-of-words over unigrams and bigrams with min_df=3 and max_df=0.7.
# No stop-word list is supplied at this stage.
count_vectorizer = CountVectorizer(
    encoding='latin-1',
    binary=True,
    lowercase=True,
    min_df=3,
    max_df=0.7,
    ngram_range=(1, 2),
)
# The vocabulary is learnt on the training reviews only; the test reviews are
# encoded with that same vocabulary.
X_train_vec = count_vectorizer.fit_transform(X_train)
X_test_vec = count_vectorizer.transform(X_test)
# Report the vocabulary size and the shape of both matrices.
print(len(count_vectorizer.vocabulary_))
print("Train Data", X_train_vec.shape)
print("Test Data", X_test_vec.shape)

#### Stopwords experimentation

In [None]:
# Build ``stop_list``: the English stop words that are present in the fitted
# vocabulary AND whose vocabulary value exceeds 70% of the vocabulary size.
# NOTE(review): ``vocabulary_`` maps each term to its feature (column) index,
# not to a frequency, so this threshold selects terms by index position --
# confirm that this is the intended criterion.
stop = list(stop_words.ENGLISH_STOP_WORDS)
vocab = list(count_vectorizer.vocabulary_.items())
# Set membership replaces the former nested loop that compared every vocabulary
# entry with every stop word; the selected terms are the same.
stop_lookup = frozenset(stop)
index_cutoff = np.ceil(len(vocab) * 0.7)
stop_list = frozenset(
    term
    for term, term_index in vocab
    if term in stop_lookup and term_index > index_cutoff
)

In [None]:
# Same binary unigram/bigram vectorizer as before, now also removing the terms
# gathered in stop_list.
count_vectorizer = CountVectorizer(
    encoding='latin-1',
    binary=True,
    lowercase=True,
    min_df=3,
    max_df=0.7,
    ngram_range=(1, 2),
    stop_words=stop_list,
)
# Re-learn the vocabulary on the training reviews and re-encode both splits.
X_train_vec = count_vectorizer.fit_transform(X_train)
X_test_vec = count_vectorizer.transform(X_test)
# Report the vocabulary size and the shape of both matrices.
print(len(count_vectorizer.vocabulary_))
print("Train Data", X_train_vec.shape)
print("Test Data", X_test_vec.shape)

#### Models

In [None]:
# initialize the LinearSVC model
svm_clf = LinearSVC(C=1)

# use the training data to train the model
svm_clf.fit(X_train_vec,y_train)

In [None]:
svm_clf.score(X_test_vec,y_test)

In [None]:
# Multinomial naive Bayes with its default settings.
nb_clf = MultinomialNB()

# Fit it on the vectorised training reviews and their labels.
nb_clf.fit(X=X_train_vec, y=y_train)

#### Use Boolean vectorizer

In [None]:
bernoulliNB_clf = BernoulliNB()
bernoulliNB_clf.fit(X_train_vec,y_train)

In [None]:
np.round(nb_clf.score(X_test_vec,y_test) * 100, 3)

In [None]:
np.round(bernoulliNB_clf.score(X_test_vec,y_test) * 100, 3)

#### Predictions

In [None]:
y_pred_B = bernoulliNB_clf.predict(X_test_vec)
y_pred_mnb = nb_clf.predict(X_test_vec)
y_pred_svm = svm_clf.predict(X_test_vec)

In [None]:
# Cosine similarity
cos_sim = cosine_similarity([y_pred_B,y_pred_mnb,y_pred_svm])
print(cos_sim)

#### Confusion matrix and F-scores

##### SVM

In [None]:
target_labels = [-1,0,1]
target_names = ['-1','0','1']

In [None]:
confusion_matrix(y_test, y_pred_svm, labels= target_labels)

In [None]:
print(classification_report(y_test, y_pred_svm, target_names=target_names))

In [None]:
# Extreme misclassification percentage
(confusion_matrix(y_test, y_pred_svm, labels= target_labels)[0][2] + confusion_matrix(y_test, y_pred_svm, labels= target_labels)[2][0])/len(y_test) * 100

##### MNB

In [None]:
confusion_matrix(y_test, y_pred_mnb, labels= target_labels)

In [None]:
print(classification_report(y_test, y_pred_mnb, target_names=target_names))

In [None]:
# Extreme misclassification percentage
(confusion_matrix(y_test, y_pred_mnb, labels= target_labels)[0][2] + confusion_matrix(y_test, y_pred_mnb, labels= target_labels)[2][0])/len(y_test) * 100

##### BNB

In [None]:
confusion_matrix(y_test, y_pred_B, labels= target_labels)

In [None]:
print(classification_report(y_test, y_pred_B, target_names=target_names))

In [None]:
# Extreme misclassification percentage
(confusion_matrix(y_test, y_pred_B, labels= target_labels)[0][2] + confusion_matrix(y_test, y_pred_B, labels= target_labels)[2][0])/len(y_test) * 100

In [None]:
# Test reviews whose true label is 1 but which SVM, MNB and BNB all predict
# as -1: print each review with its position, then the total.
err_cnt = 0

for i, labels in enumerate(zip(y_test, y_pred_svm, y_pred_mnb, y_pred_B)):
    # labels = (true, svm, mnb, bnb); skip anything but the unanimous error.
    if labels != (1, -1, -1, -1):
        continue
    print(X_test[i])
    print('-----',i)
    err_cnt += 1
print("errors:", err_cnt)

In [None]:
# Test reviews whose true label is -1 but which SVM, MNB and BNB all predict
# as 1: print each review with its position, then the total.
err_cnt = 0

for i, labels in enumerate(zip(y_test, y_pred_svm, y_pred_mnb, y_pred_B)):
    # labels = (true, svm, mnb, bnb); skip anything but the unanimous error.
    if labels != (-1, 1, 1, 1):
        continue
    print(X_test[i])
    print('-----',i)
    err_cnt += 1
print("errors:", err_cnt)

In [None]:
# LinearSVC keeps one coefficient row per class, in the order of ``classes_``
# (the labels built above are -1, 0, 1), so the row must be looked up by label.
# The former ``coef_[-1]`` picked the LAST row -- the one for label 1 -- and
# therefore listed positive words under the "Negative words" heading.
negative_row = list(svm_clf.classes_).index(-1)
# Newer scikit-learn releases replace get_feature_names with
# get_feature_names_out; use whichever this installation provides.
negative_feature_names = (
    getattr(count_vectorizer, 'get_feature_names_out', None)
    or count_vectorizer.get_feature_names
)()
negative_ranks = sorted(zip(svm_clf.coef_[negative_row], negative_feature_names))
## keep the 50 highest-weighted features, i.e. the best indicators of negative
## sentiment (the variable keeps its historical "_10" name)
negative_10 = negative_ranks[-50:]
print("Negative words")
for weight_and_word in negative_10:
    print(weight_and_word)
# Words only, in ascending weight order.
top_neg = [word for _, word in negative_10]

In [None]:
# LinearSVC keeps one coefficient row per class, in the order of ``classes_``
# (the labels built above are -1, 0, 1), so the row must be looked up by label.
# The former ``coef_[0]`` picked the FIRST row -- the one for label -1 -- and
# therefore listed negative words under the "Neutral words" heading.
neutral_row = list(svm_clf.classes_).index(0)
# Newer scikit-learn releases replace get_feature_names with
# get_feature_names_out; use whichever this installation provides.
neutral_feature_names = (
    getattr(count_vectorizer, 'get_feature_names_out', None)
    or count_vectorizer.get_feature_names
)()
neutral_ranks = sorted(zip(svm_clf.coef_[neutral_row], neutral_feature_names))
## keep the 50 highest-weighted features, i.e. the best indicators of neutral
## sentiment (the variable keeps its historical "_10" name)
neutral_10 = neutral_ranks[-50:]
print("Neutral words")
for weight_and_word in neutral_10:
    print(weight_and_word)
# Words only, in ascending weight order.
top_neu = [word for _, word in neutral_10]

In [None]:
# LinearSVC keeps one coefficient row per class, in the order of ``classes_``
# (the labels built above are -1, 0, 1), so the row must be looked up by label.
# The former ``coef_[1]`` picked the MIDDLE row -- the one for label 0 -- and
# therefore listed neutral words under the "Positive words" heading.
positive_row = list(svm_clf.classes_).index(1)
# Newer scikit-learn releases replace get_feature_names with
# get_feature_names_out; use whichever this installation provides.
positive_feature_names = (
    getattr(count_vectorizer, 'get_feature_names_out', None)
    or count_vectorizer.get_feature_names
)()
positive_ranks = sorted(zip(svm_clf.coef_[positive_row], positive_feature_names))
## keep the 50 highest-weighted features, i.e. the best indicators of positive
## sentiment (the variable keeps its historical "_10" name)
positive_10 = positive_ranks[-50:]
print("Positive words")
for weight_and_word in positive_10:
    print(weight_and_word)
# Words only, in ascending weight order.
top_pos = [word for _, word in positive_10]

In [None]:
# Word cloud generated from the words collected in top_pos.
k = ' '.join(top_pos)
wordcloud = WordCloud(width=1000, height=500)
wordcloud.generate(k)
# Render the cloud as an image without axes.
plt.figure(figsize=(15, 10))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

#### Checking for binary classifiers

In [None]:
# One binary LinearSVC per rating value ("this rating" vs. "any other rating");
# the 50 highest-weighted words of each model are appended to class_list.
#
# Everything created here uses its own names.  The earlier version rebound
# X_train / X_test / y_train / y_test / y / svm_clf / count_vectorizer, so the
# cells further down -- which compare the three-class y_test with y_pred_svm,
# y_pred_mnb and y_pred_B -- ended up reading the binary labels of the last
# loop iteration.
rating_vectorizer = CountVectorizer(encoding='latin-1', lowercase=True, min_df=3, max_df=0.7, ngram_range=(1, 2), stop_words=stop_list)
class_list = []
# NOTE(review): range(2, 11) never trains a model for rating 1 -- confirm
# whether that is intentional.
for class_label in range(2, 11):
    # 1 marks the rating of interest.  For a two-class LinearSVC positive
    # coefficients favour the larger label, so the biggest weights now describe
    # this rating; the old 0-for-class encoding made them describe every OTHER
    # rating instead.
    rating_labels = (merged_data['rating'] == class_label).astype(int).values

    # Same split parameters as the main experiment; only the training part is
    # needed here.
    rating_X_train, _, rating_y_train, _ = train_test_split(
        X, rating_labels, test_size=0.25, random_state=0
    )

    # vocabulary creation
    rating_train_vec = rating_vectorizer.fit_transform(rating_X_train)

    # train the binary model for this rating
    rating_clf = LinearSVC(C=1)
    rating_clf.fit(rating_train_vec, rating_y_train)

    # Newer scikit-learn releases replace get_feature_names with
    # get_feature_names_out; use whichever this installation provides.
    rating_feature_names = (
        getattr(rating_vectorizer, 'get_feature_names_out', None)
        or rating_vectorizer.get_feature_names
    )()

    # finding top words for the respective class (ascending weight order)
    feature_ranks = sorted(zip(rating_clf.coef_[0], rating_feature_names))
    class_words = feature_ranks[-50:]

    class_list.append([word for _, word in class_words])

#### Ambiguity based on conjunction

In [None]:
# Conjunctions and connective phrases that are searched for in the reviews.
conjunction_list = [
    'and', 'but', 'moreover', 'in addition', 'as long as',
    'only if', 'when', 'in case', 'assumption', 'additionally',
    'further', 'furthermore', 'along with', 'as well as', 'also',
    'plus', 'if', 'unless', 'even if', 'even until',
]

In [None]:
# Two hand-written sample sentences that contain several of the conjunctions.
mystring = [
    'I am going and found this but cannot go now',
    'moreover i am going when i would be good just in case if i dont',
]

In [None]:
conj_count_list = []
for i in range(0, len(X)):
    conj_count = 0
    for j in conjunction_list:
        if X[i].find(j) != -1:
            conj_count +=1
    conj_count_list.append(conj_count)
conj_df = {'conj_count': conj_count_list, 'y': y}
conj_df = pd.DataFrame(data = conj_df)    
conj_df.groupby('y').mean()

In [None]:
conj_count_list = []
for i in range(0, len(X_test)):
    conj_count = 0
    if((y_test[i]== -1) and (y_pred_svm[i]==1) and (y_pred_mnb[i] == 1) and (y_pred_B[i] == 1)):
        for j in conjunction_list:
            if X_test[i].find(j) != -1:
                conj_count +=1
    conj_count_list.append(conj_count)

In [None]:
for i in range(0, len(X_test)):
    if((y_test[i]== -1) and (y_pred_svm[i]==1) and (y_pred_mnb[i] == 1) and (y_pred_B[i] == 1)):
        print(conj_count_list[i])
        print(X_test[i])
        print('----------------------------------',i)

In [None]:
conj_count_list = []
for i in range(0, len(X_test)):
    conj_count = 0
    if((y_test[i]== 1) and (y_pred_svm[i]== -1) and (y_pred_mnb[i] == -1) and (y_pred_B[i] == -1)):
        for j in conjunction_list:
            if X_test[i].find(j) != -1:
                conj_count +=1
    conj_count_list.append(conj_count)

In [None]:
for i in range(0, len(X_test)):
    if((y_test[i]== 1) and (y_pred_svm[i]== -1) and (y_pred_mnb[i] == -1) and (y_pred_B[i] == -1)):
        print(conj_count_list[i])
        print(X_test[i])
        print('----------------------------------',i)