# Mount Google Drive

Due to convenience and fast downloading, I upload the raw data and fast-text .bin to google drive. Mounting these resources are as follows:


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Install Required Libraries

In [None]:
!pip3 --no-cache-dir install dadmatools numpy pandas matplotlib plotly scikit-learn hazm wordcloud_fa nltk wandb 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting dadmatools
  Downloading dadmatools-1.5.2-py3-none-any.whl (862 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m862.6/862.6 KB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
Collecting hazm
  Downloading hazm-0.7.0-py3-none-any.whl (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.7/316.7 KB[0m [31m290.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wordcloud_fa
  Downloading wordcloud_fa-0.1.10-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.4/71.4 KB[0m [31m214.0 MB/s[0m eta [36m0:00:00[0m
Collecting wandb
  Downloading wandb-0.13.9-py2.py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m127.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting hyperopt>=0.2.5
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━

# Import Required Functionalities

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
from tqdm import tqdm
import itertools
from collections import defaultdict
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier, SGDClassifier, PassiveAggressiveClassifier, Perceptron, LogisticRegression, LogisticRegressionCV
from sklearn.svm import LinearSVC, SVC, LinearSVR, SVR
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.utils.extmath import density
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

from __future__ import unicode_literals
from hazm import Normalizer, sent_tokenize, word_tokenize, Stemmer, Lemmatizer, POSTagger, Chunker, tree2brackets, DependencyParser, stopwords_list
from nltk.util import pad_sequence, bigrams, ngrams, everygrams

sns.set_theme(style="white")
%matplotlib inline

# Read Raw Data

In [None]:
import numpy as np
import pandas as pd
pd.options.plotting.backend = "matplotlib"

import matplotlib.pyplot as plt

labelled_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/SAMousavizade/data_labelled.csv")
unlabelled_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/SAMousavizade/data_unlabelled.csv")

labelled_data

Unnamed: 0,id,text,label
0,131244574,عالی عالی عالی عالی عالی عالی عالی عالی عالی ع...,0
1,133141894,دوستان این نظرات و پیشنهادات رو باید به پشتیبا...,0
2,94129372,خیلی ایراد داره مسخره تر از این نمیشه رقیب‌هات...,1
3,131334220,نه کی گفته خرابه من دارم باهاش کار میکنم از من...,0
4,131387172,سلام عالیه حتما نصب کنید از کالاف دیوتی هم بهتره,0
...,...,...,...
3591,94229465,همه رشته ها نداره مثلا معارف سوالات تخصصي ندار...,1
3592,131571104,خیلی بده من اصلا. دوست ندارم خواهش می کنم دانل...,0
3593,132784715,بهترین برنا مه ای که دیدم خیلی باهاله میتونی ت...,0
3594,131981378,خیلی بازی مسخره هس نصب نکنید ۱ستاره هم براش زیاده,0


# Preprocess Raw Data

In this section, I preprocess raw-text data. Text preprocessing steps are as follows:

- Unifying all variants of characters (like "ی" and "ي")
- Remove extra spaces between tokens 
- Remove punctuations(like !, ., ?, etc)
- Remove HTML tags  
- Remove all emails, phone numbers, URLs, emojis
- Remove stop-words 
-  Refine any characters being repeated more than 2 times in the tokens (like "عااااااااالیه" to "عاالیه"
- Lemmatization (grouping the inflected forms of a word so they can be analyzed as a single item.)

In [None]:
from dadmatools.models.normalizer import Normalizer

normalizer = Normalizer(
    full_cleaning=False,
    unify_chars=True,
    refine_punc_spacing=True,
    remove_extra_space=True,
    remove_puncs=True,
    remove_html=True,
    remove_stop_word=True,
    replace_email_with="",
    replace_number_with="",
    replace_url_with="",
    replace_mobile_number_with="",
    replace_emoji_with="",
    replace_home_number_with=""
)

labelled_data["normalized_text"] = labelled_data["text"].apply(lambda text: normalizer.normalize(text))
labelled_data["normalized_text"] = labelled_data["normalized_text"].replace(r'(.)\1{2,}', '', regex=True)

unlabelled_data["normalized_text"] = unlabelled_data["text"].apply(lambda text: normalizer.normalize(text))
unlabelled_data["normalized_text"] = unlabelled_data["normalized_text"].replace(r'(.)\1{2,}', '', regex=True)


In [None]:
labelled_data

Unnamed: 0,id,text,label,normalized_text
0,131244574,عالی عالی عالی عالی عالی عالی عالی عالی عالی ع...,0,عالی عالی عالی عالی عالی عالی عالی عالی عالی ع...
1,133141894,دوستان این نظرات و پیشنهادات رو باید به پشتیبا...,0,دوستان نظرات پیشنهادات پشتیبانی تیکت اسنپ بفرس...
2,94129372,خیلی ایراد داره مسخره تر از این نمیشه رقیب‌هات...,1,ایراد داره مسخره نمیشه رقیب‌هات روح ماشینت رد ...
3,131334220,نه کی گفته خرابه من دارم باهاش کار میکنم از من...,0,خرابه باهاش کار رایگانه ممنون سازنده ❤ ستاره کمه
4,131387172,سلام عالیه حتما نصب کنید از کالاف دیوتی هم بهتره,0,سلام عالیه نصب کالاف دیوتی بهتره
...,...,...,...,...
3591,94229465,همه رشته ها نداره مثلا معارف سوالات تخصصي ندار...,1,نداره معارف سوالات تخصصی نداره حیف کنین دیگ خو...
3592,131571104,خیلی بده من اصلا. دوست ندارم خواهش می کنم دانل...,0,بده دوست خواهش دانلود بازی قرآنی بازی ستاره ستاره
3593,132784715,بهترین برنا مه ای که دیدم خیلی باهاله میتونی ت...,0,برنا مه دیدم باهاله میتونی تنظیم عصلا عالی
3594,131981378,خیلی بازی مسخره هس نصب نکنید ۱ستاره هم براش زیاده,0,بازی مسخره هس نصب 1ستاره براش


In [None]:
from hazm import Normalizer, sent_tokenize, word_tokenize, Stemmer, Lemmatizer, POSTagger, Chunker, tree2brackets, DependencyParser, stopwords_list

lemmatizer = Lemmatizer()

labelled_data["lemmatized_tokens"] = labelled_data["normalized_text"].apply(lambda text: " ".join(list(map(lemmatizer.lemmatize, word_tokenize(text)))))

test_data = [
    "!!!!سلام برنامه خوبیه جدا",
    "لود نمیشه اصلا!! :((((",
    "پولم رو پس نمیدید چرا؟؟؟",
    "بازی جالبیه.",
    "خیلییییی لگ داره روی گوشیم.",
    "معتاد این بازی شدم.",
    "خیلی باگ داره اعصابو خورد کرده.",
    "بازی توی مرحله اول گیر کرده و به مرحله بعدی نمیره اصلا! :(((",
    "آقا عالیه!!!!",
    "موقع نصب به مشکل میخوره. اه.",
    "آشغااااااااااااااله",
    "افتضاحهههههه.",
    "مزخرفه.",
    "همش باگ میخورههههههههههههههههههههه.",
    "برای بچه ها مشکل داره این بازی. لطفا اینو ذکر کنید."
]

test_lemmatized_tokens = [" ".join(list(map(lemmatizer.lemmatize, word_tokenize(text)))) for text in test_data]

# Extract Feature Vectors

In this section, I extract **TF-IDF** embedding vectors to represent each document as a vector of floats.

In [None]:
train_data = labelled_data

vectorizer = TfidfVectorizer(ngram_range=(1, 4), token_pattern=r'\w{1,}',)
x_train_validation = vectorizer.fit_transform(train_data["lemmatized_tokens"])
y_train_validation = train_data["label"]

x_train, x_validation, y_train, y_validation = train_test_split(x_train_validation, y_train_validation, test_size=0.1, random_state=42)

x_test = vectorizer.transform(test_lemmatized_tokens)

# Login to WandB 

In [None]:
import os

os.environ["WANDB_API_KEY"] = "1d6bdaf3f9f088abf0915e5e5cb6689e4c7e7476"
import wandb
wandb.login()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msamousavizade[0m ([33mcausal-inference[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

# Train Classifiers

Train classic ML models on the feature vectors extracted previously.

In [None]:
from time import time

def benchmark(clf):
    print("_" * 80)
    print("Training: ")
    print(clf)
    clf.fit(x_train, y_train)

    print("Validation: ")
    pred = clf.predict(x_validation)
    try:
      pred_proba = clf.predict_proba(x_validation)
    except:
      pred_proba = np.vstack((1 - pred, pred)).T

    print("classification report:")
    print(classification_report(y_validation, pred, target_names=['Non-Informative', 'Informative']))

    print("confusion matrix:")
    print(confusion_matrix(y_validation, pred))
    
    model_name = str(clf).split("(")[0]

    wandb.init(
        project="CoffeeBazaarSeqClassification",
        entity="samousavizade",
        name=model_name,
        config=clf.get_params()
    )

    from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score 

    roc_auc = roc_auc_score(y_validation, pred, average="macro")
    precision = precision_score(y_validation, pred, average="macro")
    recall = recall_score(y_validation, pred, average="macro")
    f1 = f1_score(y_validation, pred, average="macro")

    wandb.log({
        "roc_auc": roc_auc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    })

    wandb.sklearn.plot_learning_curve(clf, x_train, y_train)
    wandb.sklearn.plot_roc(y_validation, pred_proba, ["Non-Informative", "Informative"])
    wandb.sklearn.plot_precision_recall(y_validation, pred_proba, ["Non-Informative", "Informative"])
    wandb.sklearn.plot_confusion_matrix(y_validation, pred, ["Non-Informative", "Informative"])
    wandb.sklearn.plot_summary_metrics(clf, x_train, y_train, x_validation, y_validation)

    wandb.finish()

    print()
    clf_descr = model_name 
    return clf_descr

results = []
for clf, name in (
    (RidgeClassifier(tol=1e-2, solver="auto"), "Ridge Classifier"),
    (Perceptron(max_iter=50), "Perceptron"),
    (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"),
    (KNeighborsClassifier(n_neighbors=10), "kNN"),
    (RandomForestClassifier(), "Random forest"),
):
    print("=" * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2"]:
    print("=" * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3)))

    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=0.0001, max_iter=500, penalty=penalty)))

# Train SGD with Elastic Net penalty
print("=" * 80)
print("Elastic-Net penalty")
results.append(
    benchmark(SGDClassifier(alpha=0.0001, max_iter=500, penalty="elasticnet"))
)

# Train NearestCentroid without threshold
print("=" * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))

# Train SVM
print("=" * 80)
print("SVM")
results.append(benchmark(SVC()))

# Train sparse Naive Bayes classifiers
print("=" * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=0.01)))
results.append(benchmark(BernoulliNB(alpha=0.01)))
results.append(benchmark(ComplementNB(alpha=0.1)))

print("=" * 80)

Ridge Classifier
________________________________________________________________________________
Training: 
RidgeClassifier(tol=0.01)
Validation: 
classification report:
                 precision    recall  f1-score   support

Non-Informative       0.75      0.94      0.84       254
    Informative       0.64      0.26      0.37       106

       accuracy                           0.74       360
      macro avg       0.69      0.60      0.60       360
   weighted avg       0.72      0.74      0.70       360

confusion matrix:
[[238  16]
 [ 78  28]]


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
f1,▁
precision,▁
recall,▁
roc_auc,▁

0,1
f1,0.60421
precision,0.69476
recall,0.60058
roc_auc,0.60058


0,1
f1,▁
precision,▁
recall,▁
roc_auc,▁

0,1
f1,0.60421
precision,0.69476
recall,0.60058
roc_auc,0.60058



Perceptron
________________________________________________________________________________
Training: 
Perceptron(max_iter=50)
Validation: 
classification report:
                 precision    recall  f1-score   support

Non-Informative       0.80      0.85      0.82       254
    Informative       0.58      0.49      0.53       106

       accuracy                           0.74       360
      macro avg       0.69      0.67      0.68       360
   weighted avg       0.73      0.74      0.74       360

confusion matrix:
[[216  38]
 [ 54  52]]


VBox(children=(Label(value='0.007 MB of 0.020 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.350631…

0,1
f1,▁
precision,▁
recall,▁
roc_auc,▁

0,1
f1,0.67752
precision,0.68889
recall,0.67048
roc_auc,0.67048



Passive-Aggressive
________________________________________________________________________________
Training: 
PassiveAggressiveClassifier(max_iter=50)
Validation: 
classification report:
                 precision    recall  f1-score   support

Non-Informative       0.80      0.91      0.85       254
    Informative       0.67      0.46      0.55       106

       accuracy                           0.78       360
      macro avg       0.74      0.68      0.70       360
   weighted avg       0.76      0.78      0.76       360

confusion matrix:
[[230  24]
 [ 57  49]]


0,1
f1,▁
precision,▁
recall,▁
roc_auc,▁

0,1
f1,0.69888
precision,0.73631
recall,0.68389
roc_auc,0.68389



kNN
________________________________________________________________________________
Training: 
KNeighborsClassifier(n_neighbors=10)
Validation: 
classification report:
                 precision    recall  f1-score   support

Non-Informative       0.71      1.00      0.83       254
    Informative       0.00      0.00      0.00       106

       accuracy                           0.71       360
      macro avg       0.35      0.50      0.41       360
   weighted avg       0.50      0.71      0.58       360

confusion matrix:
[[254   0]
 [106   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


VBox(children=(Label(value='0.007 MB of 0.007 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
f1,▁
precision,▁
recall,▁
roc_auc,▁

0,1
f1,0.41368
precision,0.35278
recall,0.5
roc_auc,0.5



Random forest
________________________________________________________________________________
Training: 
RandomForestClassifier()
Validation: 
classification report:
                 precision    recall  f1-score   support

Non-Informative       0.74      0.97      0.84       254
    Informative       0.72      0.20      0.31       106

       accuracy                           0.74       360
      macro avg       0.73      0.58      0.58       360
   weighted avg       0.74      0.74      0.68       360

confusion matrix:
[[246   8]
 [ 85  21]]


0,1
f1,▁
precision,▁
recall,▁
roc_auc,▁

0,1
f1,0.57607
precision,0.73367
recall,0.58331
roc_auc,0.58331



L2 penalty
________________________________________________________________________________
Training: 
LinearSVC(dual=False, tol=0.001)
Validation: 
classification report:
                 precision    recall  f1-score   support

Non-Informative       0.77      0.93      0.85       254
    Informative       0.69      0.35      0.46       106

       accuracy                           0.76       360
      macro avg       0.73      0.64      0.65       360
   weighted avg       0.75      0.76      0.73       360

confusion matrix:
[[237  17]
 [ 69  37]]


VBox(children=(Label(value='0.007 MB of 0.007 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.971856…

0,1
f1,▁
precision,▁
recall,▁
roc_auc,▁

0,1
f1,0.65446
precision,0.72985
recall,0.64106
roc_auc,0.64106



________________________________________________________________________________
Training: 
SGDClassifier(max_iter=500)
Validation: 
classification report:
                 precision    recall  f1-score   support

Non-Informative       0.78      0.91      0.84       254
    Informative       0.65      0.40      0.49       106

       accuracy                           0.76       360
      macro avg       0.71      0.65      0.67       360
   weighted avg       0.74      0.76      0.74       360

confusion matrix:
[[231  23]
 [ 64  42]]


0,1
f1,▁
precision,▁
recall,▁
roc_auc,▁

0,1
f1,0.66638
precision,0.7146
recall,0.65284
roc_auc,0.65284



Elastic-Net penalty
________________________________________________________________________________
Training: 
SGDClassifier(max_iter=500, penalty='elasticnet')
Validation: 
classification report:
                 precision    recall  f1-score   support

Non-Informative       0.79      0.91      0.85       254
    Informative       0.66      0.42      0.52       106

       accuracy                           0.77       360
      macro avg       0.73      0.67      0.68       360
   weighted avg       0.75      0.77      0.75       360

confusion matrix:
[[231  23]
 [ 61  45]]


VBox(children=(Label(value='0.007 MB of 0.007 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.969792…

0,1
f1,▁
precision,▁
recall,▁
roc_auc,▁

0,1
f1,0.6817
precision,0.72643
recall,0.66699
roc_auc,0.66699



NearestCentroid (aka Rocchio classifier)
________________________________________________________________________________
Training: 
NearestCentroid()
Validation: 
classification report:
                 precision    recall  f1-score   support

Non-Informative       0.90      0.55      0.68       254
    Informative       0.44      0.85      0.58       106

       accuracy                           0.64       360
      macro avg       0.67      0.70      0.63       360
   weighted avg       0.76      0.64      0.65       360

confusion matrix:
[[139 115]
 [ 16  90]]


0,1
f1,▁
precision,▁
recall,▁
roc_auc,▁

0,1
f1,0.62924
precision,0.6679
recall,0.69815
roc_auc,0.69815



SVM
________________________________________________________________________________
Training: 
SVC()
Validation: 
classification report:
                 precision    recall  f1-score   support

Non-Informative       0.72      0.99      0.83       254
    Informative       0.78      0.07      0.12       106

       accuracy                           0.72       360
      macro avg       0.75      0.53      0.48       360
   weighted avg       0.74      0.72      0.62       360

confusion matrix:
[[252   2]
 [ 99   7]]


0,1
f1,▁
precision,▁
recall,▁
roc_auc,▁

0,1
f1,0.4774
precision,0.74786
recall,0.52908
roc_auc,0.52908



Naive Bayes
________________________________________________________________________________
Training: 
MultinomialNB(alpha=0.01)
Validation: 
classification report:
                 precision    recall  f1-score   support

Non-Informative       0.90      0.52      0.66       254
    Informative       0.43      0.87      0.58       106

       accuracy                           0.62       360
      macro avg       0.67      0.70      0.62       360
   weighted avg       0.77      0.62      0.64       360

confusion matrix:
[[133 121]
 [ 14  92]]


0,1
f1,▁
precision,▁
recall,▁
roc_auc,▁

0,1
f1,0.62007
precision,0.66834
recall,0.69577
roc_auc,0.69577



________________________________________________________________________________
Training: 
BernoulliNB(alpha=0.01)
Validation: 
classification report:
                 precision    recall  f1-score   support

Non-Informative       0.85      0.72      0.78       254
    Informative       0.50      0.69      0.58       106

       accuracy                           0.71       360
      macro avg       0.67      0.70      0.68       360
   weighted avg       0.75      0.71      0.72       360

confusion matrix:
[[182  72]
 [ 33  73]]


VBox(children=(Label(value='0.023 MB of 0.023 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
f1,▁
precision,▁
recall,▁
roc_auc,▁

0,1
f1,0.6789
precision,0.67498
recall,0.70261
roc_auc,0.70261



________________________________________________________________________________
Training: 
ComplementNB(alpha=0.1)
Validation: 
classification report:
                 precision    recall  f1-score   support

Non-Informative       0.96      0.43      0.59       254
    Informative       0.41      0.95      0.57       106

       accuracy                           0.58       360
      macro avg       0.68      0.69      0.58       360
   weighted avg       0.79      0.58      0.58       360

confusion matrix:
[[108 146]
 [  5 101]]


VBox(children=(Label(value='0.023 MB of 0.035 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.647658…

0,1
f1,▁
precision,▁
recall,▁
roc_auc,▁

0,1
f1,0.5804
precision,0.68233
recall,0.68901
roc_auc,0.68901





According to the results, I choose the model with the highest value of ROC-AUC, ie the SGDClassifier model.

# Predict On Test Data 

Load best model (according to HPO process) and predict test cases labels. 

In [None]:
clf = BernoulliNB(alpha=0.01)

clf.fit(x_train, y_train)
y_test_predicted = clf.predict(x_test)

print("Validation: ")
pred = clf.predict(x_validation)
try:
  pred_proba = clf.predict_proba(x_validation)
except:
  pred_proba = np.vstack((1 - pred, pred)).T

print("classification report:")
print(classification_report(y_validation, pred, target_names=['Non-Informative', 'Informative']))

print("confusion matrix:")
print(confusion_matrix(y_validation, pred))


for i in range(len(test_data)):
  print("Test Case Text:")
  print(test_data[i])
  print("Prediction:")
  print(y_test_predicted[i])
  print("*" * 80)



Validation: 
classification report:
                 precision    recall  f1-score   support

Non-Informative       0.85      0.72      0.78       254
    Informative       0.50      0.69      0.58       106

       accuracy                           0.71       360
      macro avg       0.67      0.70      0.68       360
   weighted avg       0.75      0.71      0.72       360

confusion matrix:
[[182  72]
 [ 33  73]]
Test Case Text:
!!!!سلام برنامه خوبیه جدا
Prediction:
0
********************************************************************************
Test Case Text:
لود نمیشه اصلا!! :((((
Prediction:
0
********************************************************************************
Test Case Text:
پولم رو پس نمیدید چرا؟؟؟
Prediction:
1
********************************************************************************
Test Case Text:
بازی جالبیه.
Prediction:
0
********************************************************************************
Test Case Text:
خیلییییی لگ داره روی گوشیم.
Pre