In [42]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.pipeline import make_pipeline 
from sklearn.pipeline import Pipeline
import nltk
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score, confusion_matrix

In [11]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import TransformerMixin
from sklearn.preprocessing import FunctionTransformer

from nltk.corpus import stopwords
stopwords = stopwords.words('turkish')

In [2]:
#Read datasets
test = pd.read_csv("dataset/test.csv")
train = pd.read_csv("dataset/train.csv")

In [3]:
#Convert lowercase
train["label"] = train["label"].str.lower()
test["label"] = test["label"].str.lower()

In [4]:
#TRAIN
#category --> int
train["label"] = train.label.replace('dunya',0)
train["label"] = train.label.replace('spor',1)
train["label"] = train.label.replace('turkiye',2)
train["label"] = train.label.replace('video',3)
train["label"] = train.label.replace('yazarlar',4)


In [5]:
#TEST
#category --> int
test["label"] = test.label.replace('dunya',0)
test["label"] = test.label.replace('spor',1)
test["label"] = test.label.replace('turkiye',2)
test["label"] = test.label.replace('video',3)
test["label"] = test.label.replace('yazarlar',4)

In [6]:
#Drop labels of train
train_labels = train["label"]
train = train.drop(["label"], axis=1)

In [7]:
#Drop labels of test 
test_labels = test["label"]
test = test.drop(["label"], axis=1)

In [8]:
#Convert lowercase
train["text"] = train["text"].str.lower()
test["text"] = test["text"].str.lower()

In [10]:
#Tokenize
train['tokenized'] = train.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)
test['tokenized']  = test.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)

Naive Bayes Classification

In [12]:
class DenseTransformer(TransformerMixin):

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None, **fit_params):
        return X.todense()

In [16]:
tfidf =  TfidfVectorizer()
clf = Pipeline([('tfidfvectorizer', TfidfVectorizer()), 
                 ("to_dense", FunctionTransformer(lambda x: x.todense(), accept_sparse=True)),
                ('gaussiannb', GaussianNB()) 
                 ])


params = {"tfidfvectorizer__min_df": [50, 75, 100], #uzaklaştıröa 500 mesela
          "tfidfvectorizer__max_df": [200, 225, 250],
         "tfidfvectorizer__stop_words": [None, stopwords]
         }


grid = GridSearchCV(clf, params, cv = 3, verbose=1, scoring="accuracy")

In [17]:
sorted(clf.get_params().keys())

['gaussiannb',
 'gaussiannb__priors',
 'memory',
 'steps',
 'tfidfvectorizer',
 'tfidfvectorizer__analyzer',
 'tfidfvectorizer__binary',
 'tfidfvectorizer__decode_error',
 'tfidfvectorizer__dtype',
 'tfidfvectorizer__encoding',
 'tfidfvectorizer__input',
 'tfidfvectorizer__lowercase',
 'tfidfvectorizer__max_df',
 'tfidfvectorizer__max_features',
 'tfidfvectorizer__min_df',
 'tfidfvectorizer__ngram_range',
 'tfidfvectorizer__norm',
 'tfidfvectorizer__preprocessor',
 'tfidfvectorizer__smooth_idf',
 'tfidfvectorizer__stop_words',
 'tfidfvectorizer__strip_accents',
 'tfidfvectorizer__sublinear_tf',
 'tfidfvectorizer__token_pattern',
 'tfidfvectorizer__tokenizer',
 'tfidfvectorizer__use_idf',
 'tfidfvectorizer__vocabulary',
 'to_dense',
 'to_dense__accept_sparse',
 'to_dense__func',
 'to_dense__inv_kw_args',
 'to_dense__inverse_func',
 'to_dense__kw_args',
 'to_dense__pass_y',
 'to_dense__validate']

In [18]:

grid.fit(train["text"], train_labels)


Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:  3.9min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_i..._args=None, pass_y='deprecated',
          validate=True)), ('gaussiannb', GaussianNB(priors=None))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'tfidfvectorizer__min_df': [50, 75, 100], 'tfidfvectorizer__max_df': [200, 225, 250], 'tfidfvectorizer__stop_words': [None, ['acaba', 'ama', 'aslında', 'az', 'bazı', 'belki', 'biri', 'birkaç', 'birşey', 'biz', 'bu', 'çok', 'çünkü', 'da', 'daha', 'de', 'defa', 'diye', 'eğer', 'en', 'gibi'...', 'nereye', 'niçin', 'niye', 'o', 'sanki', 'şey', 'siz', 'şu', 'tüm', 've', 'veya', 'ya', 'yani']]},
       pre_dispatch='2*n_jobs', refit=True, return_tra

In [19]:
pred = grid.predict(test["text"])

In [20]:
accuracy_score(pred, test_labels)

0.702

In [49]:
f1_score(pred, test_labels, average=None)

array([0.68525896, 0.84727756, 0.4964132 , 0.64551422, 0.81313703])

In [50]:
confusion_matrix(pred, test_labels)

array([[258,   2,  58,  27,  13],
       [  8, 319,  14,  22,   6],
       [ 34,   9, 173,  47,  13],
       [ 69,  15, 126, 295,   1],
       [ 26,  39,  50,  17, 359]], dtype=int64)

Logistic Regression

In [26]:
sorted(clf.get_params().keys())

['logisticregression',
 'logisticregression__C',
 'logisticregression__class_weight',
 'logisticregression__dual',
 'logisticregression__fit_intercept',
 'logisticregression__intercept_scaling',
 'logisticregression__max_iter',
 'logisticregression__multi_class',
 'logisticregression__n_jobs',
 'logisticregression__penalty',
 'logisticregression__random_state',
 'logisticregression__solver',
 'logisticregression__tol',
 'logisticregression__verbose',
 'logisticregression__warm_start',
 'memory',
 'steps',
 'tfidfvectorizer',
 'tfidfvectorizer__analyzer',
 'tfidfvectorizer__binary',
 'tfidfvectorizer__decode_error',
 'tfidfvectorizer__dtype',
 'tfidfvectorizer__encoding',
 'tfidfvectorizer__input',
 'tfidfvectorizer__lowercase',
 'tfidfvectorizer__max_df',
 'tfidfvectorizer__max_features',
 'tfidfvectorizer__min_df',
 'tfidfvectorizer__ngram_range',
 'tfidfvectorizer__norm',
 'tfidfvectorizer__preprocessor',
 'tfidfvectorizer__smooth_idf',
 'tfidfvectorizer__stop_words',
 'tfidfvectoriz

In [38]:
model_logistic = Pipeline([('tfidfvectorizer', TfidfVectorizer()), ('logisticregression', LogisticRegression())])


params = {"tfidfvectorizer__min_df": [50, 75, 100], 
          "tfidfvectorizer__max_df": [200, 250, 500],
         "logisticregression__C": [0.01, 0.05, 0.1, 0.5, 1, 5, 10],
          "logisticregression__solver" : ["newton-cg", "lbfgs", "sag", "saga"],
          "logisticregression__penalty" : ["l2"]
          
         }
#solver panalty

grid = GridSearchCV(model_logistic, params, cv = 3, verbose=1, scoring="accuracy")

In [39]:
grid.fit(train["text"], train_labels)

Fitting 3 folds for each of 252 candidates, totalling 756 fits




[Parallel(n_jobs=1)]: Done 756 out of 756 | elapsed: 51.7min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_i...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'tfidfvectorizer__min_df': [50, 75, 100], 'tfidfvectorizer__max_df': [200, 250, 500], 'logisticregression__C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10], 'logisticregression__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'], 'logisticregression__penalty': ['l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [40]:
result = grid.predict(test["text"])


In [41]:
accuracy_score(result, test_labels)

0.824

In [44]:
f1_score(result, test_labels, average=None)

array([0.81343284, 0.92005242, 0.7190184 , 0.75980392, 0.91521197])

In [47]:
confusion_matrix(result, test_labels)

array([[327,  10,  38,  30,   4],
       [  4, 351,   4,  13,   7],
       [ 27,   8, 293,  54,  12],
       [ 29,   6,  61, 310,   2],
       [  8,   9,  25,   1, 367]], dtype=int64)