## Referenced from https://github.com/CharlesRajendran/TextClassification

### Example notebook for Bag of Words text classification using nltk package. 
Example task: predict deception ("Straightforward" or "Cassandra") using input text.

### Notes:
- This is a ML method, not NN or DL.
- The author has a medium page that explains the code: https://medium.com/swlh/text-classification-using-the-bag-of-words-approach-with-nltk-and-scikit-learn-9a731e5c4e2f

In [14]:
import pandas as pd
from tqdm.notebook import tqdm

import re
import nltk

nltk.download('punkt')
from nltk.tokenize import word_tokenize as wt 

nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

#spell correction
from autocorrect import Speller
spell = Speller()

[nltk_data] Downloading package punkt to /home/kaiyuan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/kaiyuan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
dataset = pd.read_csv('./data/kokil dec 6 reprepare/conf_pc_worker_sem.csv')

In [15]:
data = []

for i in tqdm(range(dataset.shape[0])):
    input_text = dataset.iloc[i, 6]

    # remove non alphabatic characters
    input_text = re.sub('[^A-Za-z]', ' ', input_text)

    # make words lowercase, because Go and go will be considered as two words
    input_text = input_text.lower()

    # tokenising
    tokenized_input_text = wt(input_text)

    # remove stop words and stemming
 
    input_text_processed = []
    for word in tokenized_input_text:
        if word not in set(stopwords.words('english')):
            input_text_processed.append(spell(stemmer.stem(word)))

    input_text = " ".join(input_text_processed)
    data.append(input_text)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=13195.0), HTML(value='')))




In [18]:
# creating the feature matrix 
from sklearn.feature_extraction.text import CountVectorizer
matrix = CountVectorizer(max_features=1000)
X = matrix.fit_transform(data).toarray()
y = dataset.iloc[:, 18]

# split train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Naive Bayes 
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# predict class
y_pred = classifier.predict(X_test)

# Confusion matrix
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

accuracy = accuracy_score(y_test, y_pred)

In [19]:
accuracy

0.4198241891482267

In [22]:
print(cr)

                 precision    recall  f1-score   support

      Cassandra       0.05      0.58      0.10       180
Straightforward       0.94      0.41      0.57      3119

       accuracy                           0.42      3299
      macro avg       0.50      0.50      0.34      3299
   weighted avg       0.90      0.42      0.55      3299

