In [27]:
%reload_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix,classification_report

from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import CountVectorizer

import nltk

# Load data

In [2]:
df = pd.read_csv('data/Sentences_clean.csv')

In [4]:
df.head()

Unnamed: 0,ID,Sentence_clean,sentiment
0,1,results nd line treatment show orr patient...,Positive
1,2,long duration response high durable response r...,Positive
2,4,therefore clinical benefit nd line treatment ...,Positive
3,5,data provided st line although preliminary sh...,Positive
4,6,taking account intrinsic limitation single arm...,Positive


In [10]:
# create stratified train test split 

train, test = train_test_split(df, test_size=0.2, train_size=0.8, stratify=df['sentiment'])

In [17]:
# encode target

le = LabelEncoder()

le.fit(train['sentiment'])

train['target'] = le.transform(train['sentiment'])
test['target'] = le.transform(test['sentiment'])

In [57]:
le.classes_

array(['Negative', 'Neutral', 'Positive'], dtype=object)

# Create features

In [12]:
# create bag of words

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train['Sentence_clean'])
test_matrix = vectorizer.transform(test['Sentence_clean'])

<187x1073 sparse matrix of type '<class 'numpy.int64'>'
	with 2585 stored elements in Compressed Sparse Row format>

# Train simple logistic regression model

In [19]:
X_train = train_matrix
X_test = test_matrix
y_train = train['target']
y_test = test['target']

In [21]:
# logistic regression model

lr = LogisticRegression()

# fit
lr.fit(X_train,y_train)

LogisticRegression()

In [23]:
predictions = lr.predict(X_test)

In [53]:
y_train.unique()

array([2, 1, 0])

In [54]:
# find accuracy, precision, recall:

confusion_matrix(predictions,y_test, labels=[0,1,2])

array([[ 2,  0,  0],
       [ 0,  7,  2],
       [ 4,  6, 26]])

In [52]:
test.groupby('sentiment')['ID'].count()

sentiment
Negative     6
Neutral     13
Positive    28
Name: ID, dtype: int64

In [26]:
print(classification_report(predictions,y_test))

              precision    recall  f1-score   support

           0       0.33      1.00      0.50         2
           1       0.54      0.78      0.64         9
           2       0.93      0.72      0.81        36

    accuracy                           0.74        47
   macro avg       0.60      0.83      0.65        47
weighted avg       0.83      0.74      0.77        47



In [45]:
# Naive Bayes 
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train.toarray(), y_train)

GaussianNB()

In [47]:
# Predict Class
y_pred = classifier.predict(X_test.toarray())

In [49]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.33      0.50      0.40         4
           1       0.31      0.40      0.35        10
           2       0.82      0.70      0.75        33

    accuracy                           0.62        47
   macro avg       0.49      0.53      0.50        47
weighted avg       0.67      0.62      0.64        47

