In [1]:
!pip install hazm parsivar

Collecting hazm
  Downloading hazm-0.10.0-py3-none-any.whl (892 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m892.6/892.6 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting parsivar
  Downloading parsivar-0.2.3.1-py3-none-any.whl (18.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fasttext-wheel<0.10.0,>=0.9.2 (from hazm)
  Downloading fasttext_wheel-0.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m74.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting flashtext<3.0,>=2.7 (from hazm)
  Downloading flashtext-2.7.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy==1.24.3 (from hazm)
  Downloading numpy-1.24.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
pip install nltk



In [3]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [4]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
%matplotlib inline

import missingno as msgn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import re
import string
import hazm
import nltk
from hazm import word_tokenize
from nltk import bigrams
from parsivar import Tokenizer, FindStems
from hazm.utils import stopwords_list
from gensim.models import Word2Vec
nltk.download('punkt')

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn import svm
from imblearn.over_sampling import SMOTE
import catboost
from catboost import CatBoostClassifier
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import multiprocessing

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
def remove_non_persian(text):
    """Delete every character that is not on the Persian whitelist.

    The whitelist consists of the letters listed below (plus the plain
    space), the ten Persian digits, and four punctuation marks: the
    exclamation mark, the Latin and Arabic question marks, and the full stop.
    Characters outside the whitelist are removed, not replaced.

    Args:
        text: The string to filter.

    Returns:
        ``text`` with all non-whitelisted characters removed.
    """
    letters_and_space = ''.join((
        'ا', 'ب', 'پ', 'ت', 'ث', 'ج', 'چ', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'ژ', 'س', 'ش',
        'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن', 'و', 'ه', 'ی',
        'ء', 'آ', 'ئ', 'ؤ', 'ى', 'ۀ', ' ',
    ))
    digits = '۰۱۲۳۴۵۶۷۸۹'
    marks = ''.join(('!', '?', '؟', '.'))

    allowed = letters_and_space + digits + marks

    # A negated character class matches exactly the characters to drop.
    disallowed = re.compile(f'[^{allowed}]')
    return disallowed.sub('', text)

In [6]:
normalizer = hazm.Normalizer()


def preprocess_text_1(text):
    """Normalise a raw text and strip punctuation, digits and extra spaces.

    Steps, in order:
      1. run the text through the hazm ``normalizer``;
      2. delete every character that is neither a word character nor
         whitespace;
      3. delete runs of digits;
      4. collapse each run of whitespace into a single space;
      5. replace a fixed set of Arabic letter variants with Persian letters.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    cleaned = normalizer.normalize(text)

    regex_steps = (
        (r"[^\w\s]", ""),
        (r"\d+", ""),
        (r"\s+", " "),
    )
    for pattern, replacement in regex_steps:
        cleaned = re.sub(pattern, replacement, cleaned)

    # convert arabic alphabet to persian (applied in this exact order)
    arabic_to_persian = (
        ('ك', 'ک'),
        ('ي', 'ی'),
        ('ى', 'ی'),
        ('ئ', 'ی'),
        ('إ', 'ا'),
        ('أ', 'ا'),
        ('ؤ', 'و'),
        ('آ', 'ا'),
    )
    for arabic, persian in arabic_to_persian:
        cleaned = cleaned.replace(arabic, persian)

    return cleaned

def text_rokenizer(text):
    """Split ``text`` into tokens using the imported ``word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``word_tokenize`` returns for ``text``.
    """
    return word_tokenize(text)

In [9]:
# Shared tokenizer instance used by tokenize().
tokenizer = Tokenizer()
# Stored as a set so the per-token membership test in remove_stopwords()
# is a hash lookup instead of a scan over the whole stop-word list.
stopwords = set(stopwords_list())

def tokenize(text):
    """Tokenize ``text`` into words with the module-level ``tokenizer``.

    Args:
        text: The string to split.

    Returns:
        The result of ``tokenizer.tokenize_words(text)``.
    """
    words = tokenizer.tokenize_words(text)
    return words

def remove_stopwords(tokens):
    """Drop every token that appears in the module-level ``stopwords``.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A new list with the remaining tokens, in their original order.
    """
    kept = []
    for token in tokens:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

_stemmer = None  # parsivar FindStems instance, created on first use


def _stem_tokens(tokens):
    """Return the stem of every token, building the stemmer on first use."""
    global _stemmer
    if _stemmer is None:
        _stemmer = FindStems()
    return [_stemmer.convert_to_stem(token) for token in tokens]


def preprocess_text(text, use_stemming=False):
    """Tokenize a text, remove stop words and optionally stem the tokens.

    Args:
        text: The string to process.
        use_stemming: When true, each remaining token is replaced by its
            stem (via parsivar's ``FindStems``). Previously this flag was
            accepted but had no effect. The default (``False``) keeps the
            tokens unchanged.
            NOTE(review): check the stemmer's output format for verbs
            before enabling this in the pipeline.

    Returns:
        The remaining tokens joined by single spaces.
    """
    tokens = remove_stopwords(tokenize(text))
    if use_stemming:
        tokens = _stem_tokens(tokens)
    return ' '.join(tokens)

def preprocess_series(text_series, use_stemming=False):
    """Run ``preprocess_text`` on every element of a Series.

    Args:
        text_series: Series of strings to process.
        use_stemming: Forwarded unchanged to ``preprocess_text``.

    Returns:
        The result of applying ``preprocess_text`` element-wise.
    """
    def _clean(entry):
        return preprocess_text(entry, use_stemming)

    return text_series.apply(_clean)

# Training

In [10]:
# Load the training spreadsheet. header=None: no row is treated as a header,
# so the columns are labelled by position (0, 1, ...).
data = pd.read_excel('/content/train_data.xlsx', header=None)

In [11]:
# Column 0 is used as the input text, column 1 as the label.
X = data[0]
y = data[1]

# Hold out 10% of the rows, stratified on the label; the fixed seed makes
# the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)

In [12]:
# Both splits go through the same three stages, in this order:
# preprocess_text_1 -> remove_non_persian -> preprocess_series.
x_train = preprocess_series(
    x_train.apply(preprocess_text_1).apply(remove_non_persian)
)
print('Preprocessing for train done.')

x_test = preprocess_series(
    x_test.apply(preprocess_text_1).apply(remove_non_persian)
)
print('Preprocessing for test done.')

Preprocessing for train done.
Preprocessing for test done.


In [13]:
# Integer class id for each label string. A label that is not a key here
# is turned into NaN by Series.map.
maping = {
    'SAD': 0,
    'OTHER': 1,
    'HAPPY': 2,
    'ANGRY': 3,
    'FEAR': 4,
}
y_train, y_test = y_train.map(maping), y_test.map(maping)

## Load fastText

In [14]:
# Colab-only: mount Google Drive at /content/drive so the zipped model
# referenced in the next cell can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
import zipfile
import os

# Source archive on the mounted Drive and the local folder to unpack into.
zip_file_path = '/content/drive/My Drive/fasttext_model.zip'
extract_dir = '/content/extracted/'

# Unpack every member of the archive; the context manager closes the file.
with zipfile.ZipFile(file=zip_file_path, mode='r') as zip_ref:
    zip_ref.extractall(path=extract_dir)

print(f"Files extracted to {extract_dir}")


Files extracted to /content/extracted/


In [16]:
from hazm import WordEmbedding

In [17]:
# Create the embedding wrapper in fasttext mode; the model file itself is
# loaded in the next cell.
embedding = WordEmbedding(model_type='fasttext')

In [18]:
# Load the model binary from the folder the zip archive was extracted to.
embedding.load_model('/content/extracted/fasttext_skipgram_300.bin')

In [19]:
def get_embedding_vector(words):
    """Average the embedding vectors of ``words`` into a single vector.

    Args:
        words: Iterable of token strings; each is looked up in the
            module-level ``embedding``.

    Returns:
        The element-wise mean of the tokens' vectors, or an all-zero vector
        of the embedding's dimensionality when ``words`` is empty.
    """
    vectors = [embedding[word] for word in words]
    if not vectors:
        # The hazm wrapper exposes its dimensionality through
        # get_vector_size(); it has no ``vector_size`` attribute of its own,
        # so the previous fallback failed on texts with no tokens left.
        return np.zeros(embedding.get_vector_size())
    return np.mean(vectors, axis=0)

In [20]:
# Tokenize every cleaned text of both splits.
tokenized_tweets_train = list(map(word_tokenize, x_train))
tokenized_tweets_test = list(map(word_tokenize, x_test))

# One averaged embedding vector per text, stacked into an array per split.
df_train = np.array(list(map(get_embedding_vector, tokenized_tweets_train)))
df_test = np.array(list(map(get_embedding_vector, tokenized_tweets_test)))

In [21]:
# Visualization
def plot_classification_report(y_true, y_pred, title = ''):
    """Draw the classification report as an annotated heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        title: Text placed above the plot.
    """
    metrics = classification_report(y_true, y_pred, output_dict=True)
    table = pd.DataFrame(metrics).transpose()

    # The last row and the last column of the table are left out of the plot.
    trimmed = table.iloc[:-1, :-1]

    sns.heatmap(trimmed, annot=True, cmap="YlGnBu")
    plt.title(title)
    plt.show()

In [22]:
def classification_results(y_pred, y_true, phase, method = ''):
    """Print a header line followed by the textual classification report.

    Args:
        y_pred: Predicted labels.
        y_true: Ground-truth labels.
        phase: Name of the data split, shown in the header.
        method: Optional model name, shown at the start of the header.
    """
    header = f"{method} Classification Report {phase}:"
    print(header)
    print(classification_report(y_true=y_true, y_pred=y_pred))

In [23]:
# RBF-kernel SVM. C=0.8 regularises slightly more strongly than the default
# C=1.0. `degree` is only used by the polynomial kernel and is ignored by
# 'rbf', so it is no longer passed.
svc = svm.SVC(kernel='rbf', C=0.8)
svc.fit(df_train, y_train)

In [24]:
# Report on the data the model was fitted on (useful to compare against the
# held-out report for signs of over-fitting).
classification_results(y_true=y_train, y_pred=svc.predict(df_train), phase='Train')

 Classification Report Train:
              precision    recall  f1-score   support

           0       0.68      0.73      0.71       760
           1       0.58      0.81      0.68      1136
           2       0.92      0.78      0.85      1316
           3       0.77      0.62      0.68       911
           4       0.77      0.42      0.54       308

    accuracy                           0.72      4431
   macro avg       0.74      0.67      0.69      4431
weighted avg       0.75      0.72      0.72      4431



In [25]:
# Report on the held-out split. This must run before df_test is overwritten
# by the unlabeled-data cells further down.
classification_results(y_true=y_test, y_pred=svc.predict(df_test), phase='Test')

 Classification Report Test:
              precision    recall  f1-score   support

           0       0.63      0.62      0.63        85
           1       0.53      0.77      0.63       127
           2       0.92      0.80      0.86       146
           3       0.66      0.50      0.57       101
           4       0.52      0.32      0.40        34

    accuracy                           0.67       493
   macro avg       0.65      0.61      0.62       493
weighted avg       0.69      0.67      0.67       493



# Unlabeled data

In [26]:
# Load the unlabeled texts. NOTE: this rebinds `data`, which previously held
# the training spreadsheet. header=None: the first CSV row is read as data.
data = pd.read_csv('/content/3rdHW_test.csv', header=None)

In [27]:
# Column 0 is used as the text to classify.
test_data = data[0]

In [28]:
# Same cleaning stages as for the training data, in the same order.
test_data = preprocess_series(
    test_data.apply(preprocess_text_1).apply(remove_non_persian)
)
print('Preprocessing for test done.')

# NOTE: the two names below are rebound here; from this point on they hold
# the unlabeled texts, not the labelled hold-out split, so cells that score
# against y_test must not be re-run after this one.
tokenized_tweets_test = list(map(word_tokenize, test_data))

df_test = np.array(list(map(get_embedding_vector, tokenized_tweets_test)))

Preprocessing for test done.


In [29]:
# Refit the classifier used for the final predictions.
# NOTE(review): this rebinds `svc` with default hyperparameters, whereas the
# model evaluated on the held-out split above was built with C= 0.8, so the
# reported test metrics do not describe this exact model. It is also fitted on
# df_train only, not on the full labelled data — confirm both are intended.
svc = svm.SVC(kernel = 'rbf')
svc.fit(df_train, y_train)

In [30]:
# Accuracy on the training data itself (not a held-out estimate).
accuracy_score(y_train, svc.predict(df_train))

0.7379823967501693

In [31]:
# df_test holds the embeddings of the unlabeled texts at this point.
y_pred= svc.predict(df_test)

In [32]:
# Inverse of the label -> id mapping used for training. NOTE: this rebinds
# `maping`; re-running the earlier label-encoding cell after this one would
# use the wrong dictionary.
maping = {0:'SAD', 1:'OTHER', 2:'HAPPY',3: 'ANGRY', 4:'FEAR'}

In [33]:
# Turn the predicted class ids back into label strings.
y_pred = pd.Series(y_pred).map(maping)

In [34]:
# Assignment aligns on the index: y_pred has a fresh default index, so this
# relies on `data` also having a default 0..n-1 index.
data['Y'] = y_pred

In [35]:
# Name the text and prediction columns; this requires `data` to have exactly
# two columns at this point.
data.columns = ['X', 'Y']

In [36]:
# Write the predictions to the current working directory. The DataFrame index
# is written as an extra first column because index=False is not passed —
# NOTE(review): confirm that is the expected submission format.
data.to_csv('output.csv')