<a href="https://colab.research.google.com/github/sadra-barikbin/persian-sentiment-analysis-example/blob/main/sentiment-classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
# Install the text-normalization dependency used by the normalization step.
# `%pip` (rather than `!pip`) runs pip for the interpreter backing this kernel, and the
# quotes keep the shell from treating the `[gpl]` extra as a glob pattern.
%pip install "clean-text[gpl]"

In [51]:
import numpy as np
import pandas as pd
import torch
import cleantext
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import RandomOverSampler

# Loading & Preparing Data

In [None]:
# Download the two dataset files from Google Drive into the working directory.
# NOTE(review): the saved file names are not set here; the loading cell expects
# `train.csv` and `eval.csv` — confirm that is what these two ids produce.
!gdown 'https://drive.google.com/uc?id=1HH8QFDcvkKfnj4dWmFQceb3PpNqDD8HQ&authuser=0&export=download'
!gdown 'https://drive.google.com/uc?id=1uDOO8RP7Lr9qcRJO8z3d10qm_UggJv4I&authuser=0&export=download'

In [70]:
# Read both splits from the working directory into DataFrames.
# NOTE(review): the name `eval` shadows Python's built-in `eval()` for the rest of the
# notebook; it is kept because every later cell refers to it.
train, eval = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'eval.csv'))

In [5]:
# Peek at the first five rows of the training split.
train.head(n=5)

Unnamed: 0.1,Unnamed: 0,comment,rate
0,2587,پردازنده های Core i5 و Core i3 نیز ذاتا دو هست...,0.0
1,22591,سلام به دوستای عزیزم \nعزاداری هاتون قبول باشه,1.0
2,141037,کلا پولتون رو دور نریزیزد,-1.0
3,58593,از صمیم قلب امیدوارم دایانا با کارن بمونه و پو...,1.0
4,5712,آنطور که اپل ادعا می کند آیپاد شافل دارای طراح...,1.0


In [6]:
# Peek at the first five rows of the evaluation split.
eval.head(n=5)

Unnamed: 0.1,Unnamed: 0,comment,rate
0,61591,کیفیت غذا و زمان رسیدن عالی بود,-1.0
1,50299,در‌ حد ساندویچ یه نفره بود نه دونفره یا بمب. ک...,1.0
2,2777,طعم پیتزای چهار فصل مثل همشه خیلی خوب بود اما ...,-1.0
3,9126,مشخصات سخت افزاری مناسب در کنار سیستم عامل وین...,0.5
4,7544,مرغش سوخاری و خوشمزه بود، بسته بندی عالی، قیمت...,-1.0


In [71]:
# Keep only the text and label columns; the extra leading column that `head()` shows
# in the raw CSVs is not used afterwards.
# Selecting the kept columns by name (instead of dropping `columns[0]`) makes this cell
# idempotent: running it a second time no longer drops `comment` by accident.
train = train[['comment', 'rate']]
eval = eval[['comment', 'rate']]

In [10]:
# Report, per split, how many comments have a rate of exactly zero.
print(f"{(train['rate'] == 0).sum()} out of {len(train)} train comments have rate zero.")
print(f"{(eval['rate'] == 0).sum()} out of {len(eval)} eval comments have rate zero.")

104 out of 800 train comments have rate zero.
30 out of 200 eval comments have rate zero.


In [72]:
# Discard every comment whose rate is exactly zero, in both splits.
train = train.loc[train['rate'] != 0]
eval = eval.loc[eval['rate'] != 0]

In [73]:
# Binarise the label: strictly positive rates become 1, everything else becomes 0.
# `train` / `eval` are filtered slices at this point, so writing a column in place
# (`frame['rate'] = ...`) triggers pandas' SettingWithCopyWarning; `assign` returns a new
# frame instead. The vectorised comparison produces the same 0/1 int64 values as a
# per-row `1 if r > 0 else 0`.
train = train.assign(rate=(train['rate'] > 0).astype('int64'))
eval = eval.assign(rate=(eval['rate'] > 0).astype('int64'))

## Balancing Dataset
As you can see below, the data is imbalanced. We use an over-sampling strategy on the negative class to mitigate the problem.

In [74]:
# Side-by-side label counts for the two splits, one column per split.
pd.concat(
    [frame['rate'].value_counts().rename(split_name)
     for split_name, frame in (('train', train), ('eval', eval))],
    axis=1,
)

Unnamed: 0,train,eval
1,502,115
0,194,55


In [75]:
# Over-sample each split with a fixed seed so the result is reproducible.
# The whole frame is passed as the feature matrix, so `comment` and `rate` are resampled
# together; the second returned element (the resampled labels) is discarded.
# NOTE(review): the eval split is over-sampled as well, so scores computed on it refer to
# a resampled set rather than the original label distribution — confirm this is intended.
balancer = RandomOverSampler(random_state=41)
train, _ = balancer.fit_resample(train, train['rate'])
eval, _ = balancer.fit_resample(eval, eval['rate'])

## Normalization

In [76]:
# Keyword options forwarded to `cleantext.clean` for every comment.
# `to_ascii` is switched off, presumably so the Persian text is not transliterated —
# confirm against the clean-text documentation.
params = dict(
    to_ascii=False,
    no_urls=True,
    no_phone_numbers=True,
    no_line_breaks=True,
    no_emails=True,
    no_numbers=True,
    no_digits=True,
    no_currency_symbols=True,
)

# Apply the same normalization to both splits.
train['comment'] = train['comment'].apply(lambda text: cleantext.clean(text, **params))
eval['comment'] = eval['comment'].apply(lambda text: cleantext.clean(text, **params))

# Method 1: Linear Models
We use Logistic Regression and SVM as classifiers, and TF-IDF to vectorize the comments.

In [77]:
# TF-IDF vectorizer created with default settings; `ngram_range` and `max_features`
# are overridden per candidate by the hyper-parameter grid further down.
vectorizer = TfidfVectorizer()

In [84]:
# Two-stage pipeline: TF-IDF features followed by a classifier slot.
# 'passthrough' is only a placeholder here; the hyper-parameter grid supplies the
# actual estimators for the 'classifier' step.
pipeline = Pipeline(steps=[
    ('embedding', vectorizer),
    ('classifier', 'passthrough'),
])

## Hyper-parameter Tuning
We search over different settings and select the best-performing one.

In [91]:
# Search space: n-gram span and vocabulary size of the vectorizer, plus which
# classifier fills the pipeline's 'classifier' step.
param_grid = {
    'embedding__ngram_range': [(1, 2), (1, 3), (1, 4)],
    'embedding__max_features': list(range(100, 3000, 100)),
    'classifier': [SVC(), LogisticRegression()],
}

In [92]:
# GridSearchCV (like other scikit-learn meta-estimators) takes a single dataset in `fit`,
# so a predefined train/eval split has to be expressed through `cv`: concatenate both
# parts and pass one (train, test) pair of *positional* row indices into the result.
train_eval = pd.concat((train, eval), ignore_index=True)

# Build the positions explicitly instead of reusing `train.index` / `eval.index`: those are
# index labels and only coincide with positions when both frames carry a fresh 0..n-1 index.
train_positions = np.arange(len(train))
eval_positions = np.arange(len(train), len(train) + len(eval))
train_eval_indices = [(train_positions, eval_positions)]

# Both metrics are recorded for every candidate; 'f1' is the one used to pick and refit
# the best candidate.
meta_estimator = GridSearchCV(pipeline, param_grid, scoring=['accuracy', 'f1'],
                              cv=train_eval_indices, refit='f1', verbose=1)
_ = meta_estimator.fit(train_eval.comment, train_eval.rate)

Fitting 1 folds for each of 174 candidates, totalling 174 fits


In [93]:
# Show the parameter combination that was selected by the grid search.
meta_estimator.best_params_

{'classifier': SVC(),
 'embedding__max_features': 500,
 'embedding__ngram_range': (1, 4)}

In [94]:
# Score of the selected candidate on the refit metric.
best_f1 = meta_estimator.best_score_
print(f"Best model F1: {best_f1}")

Best model F1: 0.6979865771812082


## Determining Marker Features

# Method 2: Neural Networks

# Method 3: Pre-trained Language Models