In [2]:
import numpy as np
import pandas as pd

In [32]:
# Load the train/test splits; lines pandas cannot parse are skipped (on_bad_lines='skip').
# NOTE(review): the column listing printed further down shows a single column
# named '\tcontent\tlabel\tlabel_id', which suggests the files are tab-separated
# but are being parsed with the default comma separator. If so, `sep='\t'` would
# yield real columns, and 'skip' may be silently dropping rows that contain
# commas — confirm against the raw files. Later cells currently depend on the
# single-column layout, so changing this requires updating them too.
train_df = pd.read_csv('data/train.csv', on_bad_lines='skip')
test_df  = pd.read_csv('data/test.csv',  on_bad_lines='skip')

In [4]:
# Print the column names, non-null counts and dtypes of the raw training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13177 entries, 0 to 13176
Data columns (total 1 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   	content	label	label_id  13177 non-null  object
dtypes: object(1)
memory usage: 103.1+ KB


In [5]:
# Show the column labels of both raw frames.
print(train_df.columns)
print(test_df.columns)
# A notebook only renders the value of a cell's final expression, so the
# preview must come last; as the first statement its result was discarded.
train_df.head()

Index(['\tcontent\tlabel\tlabel_id'], dtype='object')
Index(['\tcontent\tlabel\tlabel_id'], dtype='object')


In [24]:
# Number of rows in each raw frame; used to pre-size the parsing arrays.
trainDf_rowCount = len(train_df.index)
testDf_rowCount  = len(test_df.index)

In [68]:
import re

# Each row of the raw frame is one unsplit line of tab-delimited text.
# Going by the header ('\tcontent\tlabel\tlabel_id'), the last three fields are
# content, label and label_id, so [-3] is the article text and [-2] its label.
# NOTE(review): r'\t+' merges consecutive tabs, so an empty field would shift
# the positions — confirm the data never contains empty fields.
trainTarget = np.empty(trainDf_rowCount, dtype='object')
trainData   = np.empty(trainDf_rowCount, dtype='object')

for i, row in enumerate(train_df['\tcontent\tlabel\tlabel_id']):
    # Split once per row and reuse the result for both fields.
    fields = re.split(r'\t+', row)
    trainTarget[i] = fields[-2]
    trainData[i]   = fields[-3]


In [69]:
# Same parsing as for the training split: each row is one unsplit
# tab-delimited line whose last three fields are content, label, label_id.
testTarget  = np.empty(testDf_rowCount, dtype='object')
testData    = np.empty(testDf_rowCount, dtype='object')

for i, row in enumerate(test_df['\tcontent\tlabel\tlabel_id']):
    # Split once per row and reuse the result for both fields.
    fields = re.split(r'\t+', row)
    testTarget[i] = fields[-2]
    testData[i]   = fields[-3]

In [70]:
# Assemble tidy two-column frames (article text + category label) from the
# parsed arrays. Note: `trianDF` (sic) is the name the later cells refer to.
trianDF = pd.DataFrame(dict(content=trainData, category=trainTarget))
testDF  = pd.DataFrame(dict(content=testData, category=testTarget))
testDF.head()

Unnamed: 0,content,category
0,حسن جوهرچی بازیگر سینما و تلویزیون ایران در گف...,فرهنگی هنری
1,به گزارش گروه بین الملل باشگاه خبرنگاران جوان ...,بین الملل
2,به گزارش خبرنگار فوتبال و فوتسال گروه ورزشی با...,ورزشی
3,به‌ گزارش گروه اقتصادی باشگاه خبرنگاران به نقل...,اقتصادی
4,به گزارش خبرنگار حوزه قرآن و عترت گروه فرهنگی ...,فرهنگی هنری


# Text Preprocessing

In [71]:
import codecs
from hazm import Normalizer

nmz = Normalizer()

# Read the stop-word list inside a context manager so the file handle is
# closed deterministically instead of being left to the garbage collector.
with codecs.open('persian-stopwords-master/persian', encoding='utf-8') as stop_file:
    raw_words = stop_file.read().split('\n')

# Normalise every non-empty entry, de-duplicate, sort, and join back into a
# single newline-delimited string (a later cell splits it into a list).
stops = "\n".join(sorted({nmz.normalize(w) for w in raw_words if w}))
print(type(stops))


<class 'str'>


In [72]:
# converting str to list
# Guarded so that re-running this cell is a no-op: once `stops` is already a
# list, calling .split on it would raise AttributeError.
if isinstance(stops, str):
    stops = stops.split('\n')

In [73]:
# Spot-check: is this particular word part of the stop-word collection?
word = "دوشنبه"
isin = (word in stops)
isin

False

In [75]:
def removeUnnecessaryChars(df):
    """Return cleaned copies of the strings in ``df['content']``.

    For every entry, in order:
      * Persian digits and the punctuation marks listed below are deleted;
      * the zero-width non-joiner (U+200C, "half-space") is replaced by a
        regular space;
      * every run of two or more spaces is collapsed to a single space.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'content'`` column of strings. The frame is not modified.

    Returns
    -------
    list of str
        One cleaned string per row, in row order.
    """
    removed = ('۰', '۱', '۲', '۳', '۴', '۵', '۶', '۷', '۸', '۹', '؛', ':', '،', '!', '؟', '.')
    # One translation table performs every character-level edit in a single
    # pass: a value of None deletes the character, the ZWNJ maps to a space.
    table = str.maketrans({**dict.fromkeys(removed), '\u200c': ' '})

    # Iterate over the column values directly rather than looking rows up by
    # the labels 0..n-1, which only works when the frame has a default index.
    # The regex collapses runs of any length; a single replace('  ', ' ')
    # would leave 3+ consecutive spaces only partially collapsed.
    return [re.sub(r' {2,}', ' ', text.translate(table)) for text in df['content']]

In [77]:
# Replace the raw article text in both frames with its cleaned version.
trianDF['content'] = removeUnnecessaryChars(trianDF)
testDF['content']  = removeUnnecessaryChars(testDF)

# Feature Extraction

In [79]:
from hazm import WordTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over hazm word tokens, with the stop-word list filtered out.
vectorizer = TfidfVectorizer(stop_words = stops, tokenizer = WordTokenizer().tokenize)

# fit_transform learns the vocabulary/IDF weights and vectorises the training
# corpus in one pass, instead of tokenising every training document twice
# (once in fit, once in transform). `vectorizer` is fitted in place.
train = vectorizer.fit_transform(trianDF.content.values)
# The test split is only transformed, using the statistics learned on train.
test  = vectorizer.transform(testDF.content.values)
train.shape, test.shape



((13177, 86551), (1621, 86551))

# Feature Selection

In [81]:
from sklearn.feature_selection import VarianceThreshold

# Keep only the TF-IDF columns whose variance on the training matrix exceeds
# the threshold; the selector is fitted on train and then applied to both splits.
feature_selector = VarianceThreshold(threshold=1e-5)
feature_selector.fit(train)

x_train, x_test = feature_selector.transform(train), feature_selector.transform(test)

x_train.shape, x_test.shape

((13177, 12958), (1621, 12958))

In [82]:
# Target vectors: the category label of every article, as NumPy arrays.
y_train, y_test = trianDF['category'].values, testDF['category'].values

y_train.shape, y_test.shape

((13177,), (1621,))

# Model

In [84]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Support-vector classifier with library-default hyper-parameters, trained on
# the selected training features and then used to label the test split.
svc = SVC()
svc.fit(x_train, y_train)
y_pred = svc.predict(x_test)

In [85]:
# Overall accuracy plus macro-averaged precision/recall (every class weighs
# the same regardless of how many test articles it has).
accuracy = accuracy_score(y_test, y_pred)
precision, recall = (
    score(y_test, y_pred, average='macro') for score in (precision_score, recall_score)
)

print(f"\tAccuracy: {accuracy:.4f}\n\tPrecision: {precision:.4f} \n\tRecall: {recall:.4f}")
print(f"Confusion-Matrix:\n{confusion_matrix(y_test, y_pred)}\n")

	Accuracy: 0.9025
	Precision: 0.9043 
	Recall: 0.9037
Confusion-Matrix:
[[171   2   0  12   5  10   3  12]
 [  5 128   2   4   3   4   0   2]
 [  0   2 187   3   4   0   0   0]
 [  6   8   9 195   0   1   1   5]
 [  1   1   0   2 225   1   0   9]
 [  6   1   0   6   0 234   1   3]
 [  0   0   1   1   0   1 134   1]
 [  5   2   1   3   7   2   0 189]]



In [96]:
# Classify one hand-written headline by pushing it through the same
# clean -> vectorise -> select -> predict steps used for the test split.
news = "این یک خبر دروغ و سیاسی است که امریکا به ایران حمله کرده است"
data = {'content': news}
df_predict = pd.DataFrame(data, index=[0])

df_predict['content'] = removeUnnecessaryChars(df_predict)
sampleTest = vectorizer.transform(df_predict['content'].values)
sample_x_test = feature_selector.transform(sampleTest)

category = svc.predict(sample_x_test)
print(category)

['سیاسی']
