In [None]:
import pandas as pd
import numpy as np

import os
# Show which files are available in the dataset directory.
dataset_files = os.listdir("../dataset")
print(dataset_files)

In [None]:
# Both CSVs share the same encoding and the same date column to parse.
csv_options = dict(encoding="ISO-8859-1", parse_dates=['Date(ET)'])

train_df = pd.read_csv('../dataset/train.csv', **csv_options)
test_df = pd.read_csv('../dataset/test.csv', **csv_options)

# Display (rows, columns) of both frames.
train_df.shape, test_df.shape

In [None]:
# Show the training rows whose conversation text is missing.
missing_text_mask = train_df['TRANS_CONV_TEXT'].isna()
train_df[missing_text_mask]

In [None]:
# Drop every training row that has no conversation text. Selecting the rows by
# condition (instead of the hard-coded index label 841 with inplace=True) keeps
# this cell re-runnable: a second run is a no-op rather than a KeyError, and it
# keeps working if the dataset is refreshed. The original index labels are kept.
train_df = train_df.dropna(subset=['TRANS_CONV_TEXT'])

# Remaining missing values per column.
train_df.isna().sum()

In [None]:
# Keep only the text column (plus the label for the training data).
# The explicit .copy() makes these independent frames, so the column
# assignments in later cells do not raise pandas' SettingWithCopyWarning
# or risk writing into a temporary slice.
train_df = train_df[['TRANS_CONV_TEXT', 'Patient_Tag']].copy()
test_df = test_df[['TRANS_CONV_TEXT']].copy()

In [None]:
# Preview the first five training rows.
train_df.head(n=5)

In [None]:
from collections import Counter

# Class balance of the full labelled set, most frequent tag first.
patient_tags = train_df['Patient_Tag'].values
label_counts = Counter(patient_tags)
label_counts.most_common()

In [None]:
# Look at the raw text of the second training row.
train_df['TRANS_CONV_TEXT'].iloc[1]

In [None]:
# Preview the first five test rows.
test_df.head(n=5)

In [None]:
import re


def remove_html_tags(s):
    """Return ``s`` with every ``<...>`` tag removed.

    Args:
        s: The text to clean. Missing values (``None`` or a float NaN, as
            pandas uses for empty cells) are treated as empty text; any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: The text without tags; ``''`` for missing values.
    """
    # NaN is the only float that is not equal to itself.
    if s is None or (isinstance(s, float) and s != s):
        return ''
    if not isinstance(s, str):
        s = str(s)
    # Non-greedy match, so each tag is removed on its own rather than
    # everything between the first '<' and the last '>'.
    cleanr = re.compile('<.*?>')
    return cleanr.sub('', s)

In [None]:
# Strip HTML tags from every training conversation.
train_df['TRANS_CONV_TEXT'] = train_df['TRANS_CONV_TEXT'].map(remove_html_tags)

In [None]:
# Strip HTML tags from every test conversation.
test_df['TRANS_CONV_TEXT'] = test_df['TRANS_CONV_TEXT'].map(remove_html_tags)

In [None]:
# Preview the test rows after tag removal.
test_df.head(n=5)

In [None]:
# Summary statistics of the character length of the training texts.
train_text_lengths = train_df['TRANS_CONV_TEXT'].map(len)
train_text_lengths.describe()

In [None]:
# Summary statistics of the character length of the test texts.
test_text_lengths = test_df['TRANS_CONV_TEXT'].map(len)
test_text_lengths.describe()

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

# Character count of every training conversation.
text_lens = train_df['TRANS_CONV_TEXT'].map(len).values

# 15x5 inch histogram of the lengths; the x axis is fixed to [-50, 1200]
# with a tick every 50 characters.
fig = plt.figure(figsize=(15, 5))
ax = sns.distplot(text_lens, kde=False, bins=250, color='red')
ax.set_title('Review length distribution')
ax.set_xlabel('Character length of reviews')
ax.set_ylabel('Counts')
ax.set_xticks(np.arange(-50, 1200, 50))
ax.set_xlim(-50, 1200)
fig.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled conversations for validation (fixed seed).
texts = train_df['TRANS_CONV_TEXT']
tags = train_df['Patient_Tag']
X_train, X_test, y_train, y_test = train_test_split(
    texts, tags, test_size=0.2, random_state=42
)

In [None]:
# Class balance inside the training split.
train_tags = y_train.values
label_counts = Counter(train_tags)
label_counts.most_common()

In [None]:
# Class balance inside the validation split.
valid_tags = y_test.values
label_counts = Counter(valid_tags)
label_counts.most_common()

## Baseline performance

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

In [None]:
# Binary bag-of-words over lower-cased unigrams with no stop-word list.
# A term is kept when it occurs in at least 2 documents and in no more
# than 40% of them.
vectorizer = CountVectorizer(
    binary=True,
    lowercase=True,
    stop_words=None,
    ngram_range=(1, 1),
    min_df=2,
    max_df=0.4,
)

# Learn the vocabulary on the training split only, then reuse it for validation.
train_features, train_labels = vectorizer.fit_transform(X_train), y_train
valid_features, valid_labels = vectorizer.transform(X_test), y_test

In [None]:
# Baseline: Bernoulli naive Bayes on the bag-of-words features.
model = BernoulliNB(fit_prior=True).fit(train_features, train_labels)
valid_preds = model.predict(valid_features)

# Per-class metrics followed by overall accuracy on the validation split.
print(classification_report(valid_labels, valid_preds))
print(f'Accuracy:{accuracy_score(valid_labels, valid_preds)}')

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighting; a term is kept when it occurs in at least 2 documents
# and in no more than 70% of them.
tv = TfidfVectorizer(use_idf=True, min_df=2, max_df=0.7)

# Fit on the training split only, then project the validation split.
train_features, train_labels = tv.fit_transform(X_train), y_train
valid_features, valid_labels = tv.transform(X_test), y_test

In [None]:
# Same naive Bayes model, now trained on the TF-IDF features.
model = BernoulliNB(fit_prior=True).fit(train_features, train_labels)
valid_preds = model.predict(valid_features)

# Per-class metrics followed by overall accuracy on the validation split.
print(classification_report(valid_labels, valid_preds))
print(f'Accuracy:{accuracy_score(valid_labels, valid_preds)}')

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Select the top 'k' vectorized features by ANOVA F-score (f_classif).
# k must not exceed the number of features, i.e. the number of columns
# (shape[1]); the previous cap used shape[0], the number of samples.
selector = SelectKBest(f_classif, k=min(100, train_features.shape[1]))
selector.fit(train_features, train_labels)

# NOTE: this overwrites train_features / valid_features with the reduced
# matrices, so re-run the TF-IDF cell before re-running this one.
train_features = selector.transform(train_features).astype('float32')
valid_features = selector.transform(valid_features).astype('float32')

In [None]:
# Naive Bayes trained on the selected (top-k) features.
model = BernoulliNB(fit_prior=True)
model.fit(train_features, train_labels)

valid_preds_after_selection = model.predict(valid_features)

# Report the predictions of THIS model. The previous version printed the
# stale `valid_preds` from the earlier cell, so the numbers could not change.
print(classification_report(valid_labels, valid_preds_after_selection))
print(f'Accuracy:{accuracy_score(valid_labels, valid_preds_after_selection)}')

## No improvement :(

In [None]:
from sklearn.linear_model import LogisticRegression

# L1-regularised logistic regression with class re-weighting.
# The solver is set explicitly: the default solver in current scikit-learn
# (lbfgs) does not support the L1 penalty, while liblinear does.
log_reg = LogisticRegression(penalty='l1', solver='liblinear', class_weight='balanced')
log_reg.fit(train_features, train_labels)

valid_preds_after_selection = log_reg.predict(valid_features)

# Report the predictions of THIS model, not the stale `valid_preds`
# produced by the earlier naive Bayes cell.
print(classification_report(valid_labels, valid_preds_after_selection))
print(f'Accuracy:{accuracy_score(valid_labels, valid_preds_after_selection)}')

In [None]:
# Refit on all labelled data (train + validation splits) before predicting
# on the real test set.
data = pd.concat([X_train, X_test])

# The rows in `data` follow the shuffled split order, so the labels must be
# concatenated the same way. Taking train_df['Patient_Tag'] directly (the
# original row order) would pair most texts with the wrong label.
full_train_labels = pd.concat([y_train, y_test])

full_train_features = tv.fit_transform(data)

model = BernoulliNB(fit_prior=True)
model.fit(full_train_features, full_train_labels)

In [None]:
# Vectorize the test texts with `tv`, the TF-IDF vectorizer the final model
# was trained with. The CountVectorizer (`vectorizer`) has a different
# vocabulary, so its output does not match the model's feature space.
test_set_features = tv.transform(test_df['TRANS_CONV_TEXT'])
test_preds = model.predict(test_set_features)

# Re-read the raw test file to recover the 'Index' column, which was dropped
# from test_df when only the text column was kept.
test_for_submission = pd.read_csv('../dataset/test.csv', encoding = "ISO-8859-1")
submission = pd.DataFrame()
submission['Index'] = test_for_submission['Index']
submission['Patient_Tag'] = test_preds

submission.to_csv('submission02.csv',index=False)

## To be continued

Removing the HTML tags does not seem to have improved the results.