In [27]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost import XGBClassifier
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2

In [28]:
data = pd.read_excel('Principles-of-Digital-Image-Processing-Fundamental-Techniques.xlsx')

In [29]:
data

Unnamed: 0,عنوان,متن
0,ادب و هنر,\nجاودانگي در زندگي گروهي از طريق هنر \nنگاهي ...
1,ادب و هنر,\nرويدادهاي هنري جهان \nنمايشگاه هنر در خدمت د...
2,ادب و هنر,\nبرديوار نگارخانه ها \nگالري گلستان: \nنمايشگ...
3,اجتماعی,\nبازي را جدي بگيريم \nمطالعه اي مقدماتي پيرام...
4,علمی فرهنگی,\nتخته سياه و غباري كه سترده نمي شود... \nاشار...
...,...,...
12010,ورزش,\nجنگ گلادياتوري عربها، چهره زيباي فوتبال \nرا...
12011,ورزش,\nتنيس باشگاههاي كشور فتح و ذوبآهن به \nفينال ...
12012,ورزش,\nقهرماني عربستان روي ضربات پنالتي \nعربستان 4...
12013,ورزش,\nايران روي ضربان پنالتي كويت را شكست \nداد \n...


In [30]:
texts, labels = data['متن'], data['عنوان']

In [31]:
for punc in list(string.punctuation) + ['،']:
    texts = texts.str.replace(punc, '')

In [32]:
texts = texts.str.split()

In [33]:
texts = texts.apply(lambda x: list(map(lambda x_: x_.replace(' ', ''), x)))

In [34]:
n = 4
texts = texts.apply(lambda x: list(filter(lambda x_: len(x_) > n, x)))

In [35]:
texts = texts.apply(lambda x: pd.Series(x).value_counts())

In [36]:
texts = texts.fillna(0)

In [37]:
texts

Unnamed: 0,مورچگان,طباطبايي,نقاشي,مورچه,اندام,موضوع,زندگي,هستند,تابلوهاي,پايان,...,گلادياتوري,بگشاييم,فلاته,خيلاوي,خالدالمواليدبراي,لنگاوي,خودوارد,المجيد,خالدافضلي,مانندديدار
0,8.0,7.0,7.0,6.0,6.0,5.0,5.0,4.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
12013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,1.0,1.0


In [38]:
chi2_features = SelectKBest(chi2, k=10000)
texts = chi2_features.fit_transform(texts, labels)

In [39]:
train_X, test_X, train_y, test_y = train_test_split(texts, labels, test_size=0.2, random_state=42, stratify=labels)

In [41]:
le = LabelEncoder()
le.fit(labels)

In [42]:
train_y = le.transform(train_y)
test_y = le.transform(test_y)

In [52]:
log_reg_model = LogisticRegression()
log_reg_model.fit(train_X, train_y)
y_pred = log_reg_model.predict(test_X)

In [50]:
f1_score(test_y, y_pred, average='macro')

np.float64(0.7280670098979186)

In [59]:
xg_model = XGBClassifier(tree_method='hist', device='cuda')
xg_model.fit(train_X, train_y)
y_pred = xg_model.predict(test_X)

In [60]:
f1_score(test_y, y_pred, average='macro')

np.float64(0.7388313335817939)

In [57]:
forest_model = RandomForestClassifier(n_estimators=200)
forest_model.fit(train_X, train_y)
y_pred = forest_model.predict(test_X)

In [58]:
f1_score(test_y, y_pred, average='macro')

np.float64(0.6655139164787941)

In [61]:
cross_val_score(xg_model, texts, le.transform(labels), cv=5, scoring='f1_macro')

array([0.66180739, 0.72105621, 0.7387253 , 0.74280747, 0.74224981])

In [51]:
cross_val_score(log_reg_model, texts, le.transform(labels), cv=5, scoring='f1_macro')

array([0.64637805, 0.67984878, 0.71308432, 0.71245257, 0.70541878])

In [None]:
cross_val_score(forest_model, texts, le.transform(labels), cv=5, scoring='f1_macro')