In [339]:
import os 
import math
import requests

import pandas as pd 
import numpy as np 
from collections import defaultdict
from string import punctuation
from parsivar import Tokenizer, Normalizer


from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report, recall_score

In [170]:
LABELS = {
    "اجتماعی" : "social",
    "اديان" : "religion",
    "اقتصادی" : "economics",
    "سیاسی" : "politics",
    "فناوري" : "technology" ,
    "مسائل راهبردي ايران" : "strategic" ,
    "ورزشی" : "sport",
}

In [None]:
persian = requests.get('https://raw.githubusercontent.com/kharazi/persian-stopwords/master/persian').text.split('\n')
verbal = requests.get('https://raw.githubusercontent.com/kharazi/persian-stopwords/master/verbal').text.split('\n')
nonverbal = requests.get('https://raw.githubusercontent.com/kharazi/persian-stopwords/master/nonverbal').text.split('\n')

stop_words = persian + verbal + nonverbal

In [283]:
train_texts = []
train_labels = []

test_texts = []
test_labels = []

def extract_texts (root, label , file, is_train) :
    path = root + '/' + file 
    with open(path,'r',encoding="utf-8") as f : 
        text = f.read()
        
        if is_train :
            train_texts.append(text)
            train_labels.append(LABELS[label])
        else :
            test_texts.append(text)
            test_labels.append(LABELS[label])

In [284]:
for (root,dirs,files) in os.walk('./Final_Dataset/Train') :
    
    if len(files) == 0 :
        continue
    for f in files :
        label = root.split('\\')[1]
        extract_texts(root,label,f, True)
    

In [285]:
train = pd.DataFrame()
train['text'] = train_texts 
train['label'] = train_labels

In [286]:
train.head()

Unnamed: 0,text,label
0,﻿ به گزارش ايسنا، مهندس خرم، وزير راه و ترابر...,social
1,به گزارش خبرنگار اجتماعي خبرگزاري دانشجويان ...,social
2,دكتر امان‌الله قرايي‌مقدم، عضو هيات علمي دان...,social
3,به گزارش خبرنگار پارلماني ايسنا، در اين گزار...,social
4,به گزارش ايسنا، معاون اجتماعي وزير كشور معتق...,social


In [287]:
for (root,dirs,files) in os.walk('./Final_Dataset/Test') :
    
    if len(files) == 0 :
        continue
    for f in files :
        label = root.split('\\')[1]
        
        extract_texts(root,label,f, False)
    

In [288]:
test = pd.DataFrame()
test['text'] = test_texts 
test['label'] = test_labels

In [289]:
test

Unnamed: 0,text,label
0,﻿ شهردار تهران شب گذشته در برنامه زنده تلويزي...,social
1,﻿ معاون پرورشي و تربيت بدني وزارت آموزش و پرو...,social
2,حجت الاسلام والمسلمين سيد محمد رضا غياثي کرم...,religion
3,"سمينار ""اسلام و آينده و نسل جديد"" 24 آذرماه ...",religion
4,ماده 69 قانون برنامه پنجم به موضوع بهره‌وري ...,economics
5,﻿ به گزارش خبرگزاري دانشجويان ايران (ايسنا) ا...,economics
6,به گزارش خبرنگار سياسي ايسنا،‌ محمدي در ابتد...,politics
7,﻿ خبرگزاري دانشجويان ايران نيز در راستاي منوي...,politics
8,مجري پروژ‌ه‌هاي ملي و طرح ucf اصفهان معتقد ا...,technology
9,﻿ بر اساس برنامه اعلام شده توسط سازمان فضايي ...,technology


In [290]:
my_punctuation = punctuation + '،"؛«»)\('

In [291]:
tokenizer = Tokenizer()
normalizer = Normalizer()

In [347]:
def normalize_text (text, remove_stop_words=True) :
    tokens = tokenizer.tokenize_words(normalizer.normalize(text))
    if remove_stop_words :
        tokens = [word for word in tokens if word not in stop_words]
    return ' '.join([word for word in tokens if word not in list(my_punctuation)])

In [349]:
train.loc[:].text = train.text.apply(normalize_text, remove_stop_words=True)

In [350]:
test.loc[:].text = test.text.apply(normalize_text, remove_stop_words=True)

In [351]:
tokens = defaultdict(int)

def count_tokens (text) :
    words = tokenizer.tokenize_words(text) 
    for word in words : 
        tokens[word] += 1 
        
train.text.apply(count_tokens )
test.text.apply(count_tokens)

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
Name: text, dtype: object

In [352]:
tokens = dict(sorted(tokens.items(), key=lambda item: item[1], reverse=True)[:500])

In [354]:
token_keys = list(tokens.keys())

In [376]:
train_vectors = []
for row in train.loc[:].values :
    text = row[0]
    label = row[1] 
    words = tokenizer.tokenize_words(text) 
    new_record = {token : words.count(token) for token in token_keys}
    new_record.update({'label':label})
    train_vectors.append(new_record)
    
train_vectors = pd.DataFrame.from_dict(train_vectors)

In [356]:
train_vectors.shape

(56, 501)

In [377]:
test_vectors = []
for row in test.loc[:].values :
    text = row[0]
    label = row[1] 
    words = tokenizer.tokenize_words(text) 
    new_record = {token : words.count(token) for token in token_keys}
    new_record.update({'label':label})
    test_vectors.append(new_record)
    
test_vectors = pd.DataFrame.from_dict(test_vectors)

In [358]:
test_vectors.shape

(14, 501)

In [359]:
scaler = StandardScaler(copy=False,with_mean=False)

In [360]:
standard_train_vectors = scaler.fit_transform(train_vectors.drop('label',axis=1))

In [361]:
x_train,x_test , y_train,y_test = train_test_split(standard_train_vectors, train_vectors.label, test_size=0.3 , \
                                                  random_state=101, stratify=train_vectors.label)

# Using Multinomial Naive Bayes for classification

In [362]:
model = MultinomialNB()
model.fit(x_train, y_train)

MultinomialNB()

In [363]:
y_pred = model.predict(x_test)

In [364]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

   economics       0.50      0.50      0.50         2
    politics       0.67      0.67      0.67         3
    religion       1.00      1.00      1.00         2
      social       0.00      0.00      0.00         3
       sport       1.00      1.00      1.00         2
   strategic       1.00      0.50      0.67         2
  technology       0.43      1.00      0.60         3

    accuracy                           0.65        17
   macro avg       0.66      0.67      0.63        17
weighted avg       0.61      0.65      0.60        17



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [365]:
confusion_matrix(y_test, y_pred)

array([[1, 1, 0, 0, 0, 0, 0],
       [0, 2, 0, 0, 0, 0, 1],
       [0, 0, 2, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 2],
       [0, 0, 0, 0, 2, 0, 0],
       [0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 3]], dtype=int64)

In [366]:
accuracy_score(y_test, y_pred)

0.6470588235294118

In [367]:
recall_score(y_test, y_pred, average='macro')

0.6666666666666666

In [368]:
f1_score(y_test, y_pred, average='macro')

0.6333333333333332

# Using KNN algorithm for classification

In [371]:
ks = [1,3,5,15]
for k in ks:
    knn_model = KNeighborsClassifier(n_neighbors=k )
    knn_model.fit(x_train,y_train)
    y_pred = knn_model.predict(x_test)
    print(f'K = {k}')
    print(f'accuracy = {accuracy_score(y_test,y_pred)}')
    print(f"recall = {recall_score(y_test, y_pred, average='macro')}")
    print(f"f1_score = {f1_score(y_test, y_pred, average='macro')}")
    print(confusion_matrix(y_test,y_pred))
    
    print('----------')

K = 1
accuracy = 0.4117647058823529
recall = 0.40476190476190477
f1_score = 0.38095238095238093
[[1 0 0 1 0 0 0]
 [0 0 1 1 0 1 0]
 [0 0 1 1 0 0 0]
 [0 0 0 3 0 0 0]
 [0 0 1 0 1 0 0]
 [0 0 1 1 0 0 0]
 [0 0 0 2 0 0 1]]
----------
K = 3
accuracy = 0.17647058823529413
recall = 0.21428571428571427
f1_score = 0.13333333333333333
[[0 0 1 0 0 1 0]
 [0 0 2 1 0 0 0]
 [0 0 2 0 0 0 0]
 [1 0 2 0 0 0 0]
 [0 0 1 0 1 0 0]
 [0 0 2 0 0 0 0]
 [0 0 3 0 0 0 0]]
----------
K = 5
accuracy = 0.23529411764705882
recall = 0.2857142857142857
f1_score = 0.20476190476190476
[[1 0 1 0 0 0 0]
 [0 0 2 0 0 1 0]
 [0 0 2 0 0 0 0]
 [1 0 2 0 0 0 0]
 [0 0 1 0 1 0 0]
 [0 0 2 0 0 0 0]
 [0 0 3 0 0 0 0]]
----------
K = 15
accuracy = 0.35294117647058826
recall = 0.35714285714285715
f1_score = 0.3154761904761904
[[1 0 0 0 0 0 1]
 [0 0 0 0 0 0 3]
 [0 0 1 0 0 0 1]
 [1 0 0 0 0 0 2]
 [0 0 0 0 1 0 1]
 [0 0 0 0 0 0 2]
 [0 0 0 0 0 0 3]]
----------


# Calculating TF-IDF for tokens

In [378]:
idf_tokens = defaultdict(int)
num_of_all_texts = train.shape[0] + test.shape[0]
for key,value in tokens.items() :
    idf = sum(train.text.apply(lambda text : 1 if key in text else 0 ))
    idf += sum(test.text.apply(lambda text : 1 if key in text else 0 ))
    idf = math.log10(num_of_all_texts / idf)
    
    tfidf_tokens[key] =  idf 

In [379]:
def replace_by_tfidf (vector) :
    l = []
    for key, value in zip(vector.index,vector.values) :
        l.append(value * tfidf_tokens[key])
    
    return l


train_labels = train_vectors.label
train_vectors = pd.DataFrame.from_records(train_vectors.drop('label',axis=1).apply(replace_by_tfidf,axis=1), columns= list(train_vectors.columns).remove('label'))
train_vectors['label'] = train_labels

test_labels = test_vectors.label
test_vectors = pd.DataFrame.from_records(test_vectors.drop('label', axis=1).apply(replace_by_tfidf, axis=1), columns = list(test_vectors.columns).remove('label'))
test_vectors['label'] = test_labels

In [380]:
x_train,x_test ,y_trian,y_test = train_test_split(train_vectors.drop('label', axis=1), train_vectors.label, test_size=0.3, \
                                                 random_state=101, stratify=train_vectors.label)

In [381]:
ks = [1,3,5,15]
for k in ks:
    knn_model = KNeighborsClassifier(n_neighbors=k )
    knn_model.fit(x_train,y_train)
    y_pred = knn_model.predict(x_test)
    print(f'K = {k}')
    print(f'accuracy = {accuracy_score(y_test,y_pred)}')
    print(f"recall = {recall_score(y_test, y_pred, average='macro')}")
    print(f"f1_score = {f1_score(y_test, y_pred, average='macro')}")
    print(confusion_matrix(y_test,y_pred))
    
    print('----------')

K = 1
accuracy = 0.4117647058823529
recall = 0.4285714285714285
f1_score = 0.35634920634920636
[[0 0 1 0 0 0 1]
 [0 0 0 1 0 0 2]
 [0 0 2 0 0 0 0]
 [1 0 0 1 0 0 1]
 [0 0 0 0 2 0 0]
 [0 0 0 2 0 0 0]
 [0 0 0 1 0 0 2]]
----------
K = 3
accuracy = 0.29411764705882354
recall = 0.33333333333333337
f1_score = 0.25824175824175827
[[1 0 1 0 0 0 0]
 [0 0 1 0 0 0 2]
 [0 0 2 0 0 0 0]
 [1 0 2 0 0 0 0]
 [0 0 1 0 1 0 0]
 [0 0 2 0 0 0 0]
 [0 0 2 0 0 0 1]]
----------
K = 5
accuracy = 0.29411764705882354
recall = 0.33333333333333337
f1_score = 0.2857142857142857
[[1 0 1 0 0 0 0]
 [0 0 3 0 0 0 0]
 [0 0 2 0 0 0 0]
 [0 0 3 0 0 0 0]
 [0 0 0 0 1 0 1]
 [0 0 2 0 0 0 0]
 [0 0 2 0 0 0 1]]
----------
K = 15
accuracy = 0.29411764705882354
recall = 0.2857142857142857
f1_score = 0.19999999999999998
[[1 0 0 0 0 0 1]
 [2 0 0 0 0 0 1]
 [0 0 0 0 0 0 2]
 [1 0 0 0 0 0 2]
 [0 0 0 0 1 0 1]
 [0 0 0 0 0 0 2]
 [0 0 0 0 0 0 3]]
----------


If set remove_stop_words parameter to **True** in *normalize_text* method we can see that performance of both KNN and naive bayes models will increase!