Выполнил: Молева А.A.

Группа: ИУ5-24М

Задание: Необходимо решить задачу классификации текстов, сформировав два варианта векторизации признаков - на основе CountVectorizer и на основе TfidfVectorizer. В качестве классификаторов необходимо использовать два классификатора:

- KNeighborsClassifier

- Complement Naive Bayes

In [37]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import ComplementNB

import warnings
warnings.filterwarnings('ignore')

In [39]:
df = pd.read_csv('./datasets/films.csv')

In [41]:
df

Unnamed: 0,text,target
0,This film is so bad it simply defies reality. ...,0
1,BE WARNED. This movie is such a mess. It's a c...,0
2,This movie is just plain dumb.<br /><br />From...,0
3,Artimisia was on late last night. At first I d...,1
4,Sammy Horn (Michael Des Barres) is the head ch...,0
...,...,...
24995,"First things first, I was never once scared of...",1
24996,This is the kind of film that everyone involve...,0
24997,"This movie is just too funny, a totally non-PC...",1
24998,First of all for this movie I just have one wo...,1


In [42]:
df.target.value_counts()

0    12500
1    12500
Name: target, dtype: int64

# Feature preparation

In [43]:
tfidfv = TfidfVectorizer()
tfidf_ngram_features = tfidfv.fit_transform(df['text'])
tfidf_ngram_features

<25000x74849 sparse matrix of type '<class 'numpy.float64'>'
	with 3445861 stored elements in Compressed Sparse Row format>

In [44]:
countvec = CountVectorizer()
countvec_ngram_features = countvec.fit_transform(df['text'])
countvec_ngram_features

<25000x74849 sparse matrix of type '<class 'numpy.int64'>'
	with 3445861 stored elements in Compressed Sparse Row format>

# KNeighboursClassifier

In [45]:
# TFIDF + KNC
X_train, X_test, y_train, y_test = train_test_split(tfidf_ngram_features, df['target'], test_size=0.3, random_state=1)
model = KNeighborsClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, digits=4, target_names=list(map(str, list(y_test.unique())))))

              precision    recall  f1-score   support

           1     0.7345    0.7961    0.7640      3673
           0     0.7872    0.7238    0.7542      3827

    accuracy                         0.7592      7500
   macro avg     0.7608    0.7599    0.7591      7500
weighted avg     0.7614    0.7592    0.7590      7500



In [46]:
# CountVec + KNC
X_train, X_test, y_train, y_test = train_test_split(countvec_ngram_features, df['target'], 
                                                    test_size=0.3, random_state=1)
model = KNeighborsClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, digits=4, target_names=list(map(str, list(y_test.unique())))))

              precision    recall  f1-score   support

           1     0.6310    0.5483    0.5867      3673
           0     0.6149    0.6922    0.6513      3827

    accuracy                         0.6217      7500
   macro avg     0.6229    0.6203    0.6190      7500
weighted avg     0.6228    0.6217    0.6197      7500



# Complement Naive Bayes

In [47]:
# TFIDF + CNB
X_train, X_test, y_train, y_test = train_test_split(tfidf_ngram_features, df['target'], test_size=0.3, random_state=1)
model = ComplementNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, digits=4, target_names=list(map(str, list(y_test.unique())))))

              precision    recall  f1-score   support

           1     0.8285    0.9061    0.8655      3673
           0     0.9009    0.8200    0.8585      3827

    accuracy                         0.8621      7500
   macro avg     0.8647    0.8630    0.8620      7500
weighted avg     0.8655    0.8621    0.8620      7500



In [48]:
# CountVec + CNB
X_train, X_test, y_train, y_test = train_test_split(countvec_ngram_features, df['target'], 
                                                    test_size=0.3, random_state=1)
model = ComplementNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, digits=4, target_names=list(map(str, list(y_test.unique())))))

              precision    recall  f1-score   support

           1     0.8183    0.8892    0.8523      3673
           0     0.8840    0.8106    0.8457      3827

    accuracy                         0.8491      7500
   macro avg     0.8512    0.8499    0.8490      7500
weighted avg     0.8519    0.8491    0.8489      7500



# Выводы:
1. TfidfVectorizer показал лучший результат в обоих моделях

2. Complement Naive Bayes показал лучший результат