In [1]:
import re
import nltk
import string
import pandas as pd
from sklearn import preprocessing
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,precision_score
from jpype import JClass, JString, getDefaultJVMPath, shutdownJVM, startJVM, java

In [2]:
import jpype

In [3]:
df = pd.read_csv("7allV03.csv")

In [4]:
df.head()

Unnamed: 0,category,text
0,siyaset,3 milyon ile ön seçim vaadi mhp nin 10 olağan...
1,siyaset,mesut_yılmaz yüce_divan da ceza alabilirdi pr...
2,siyaset,disko lar kaldırılıyor başbakan_yardımcısı ar...
3,siyaset,sarıgül anayasa_mahkemesi ne gidiyor mustafa_...
4,siyaset,erdoğan idamın bir haklılık sebebi var demek ...


In [5]:
encoder = preprocessing.LabelEncoder()
y = encoder.fit_transform(df.category)

In [6]:
y

array([4, 4, 4, ..., 6, 6, 6])

In [7]:
def tokenizasyon(text):
    return word_tokenize(text)
def lemmatizer(text):
    lemma_words = []
    ZEMBEREK_PATH = 'zemberek-full_old.jar'
    if jpype.isJVMStarted() is False:
        startJVM(getDefaultJVMPath(), '-ea', '-Djava.class.path=%s' % (ZEMBEREK_PATH))
    TurkishMorphology = JClass('zemberek.morphology.TurkishMorphology')
    morphology = TurkishMorphology.createWithDefaults()
    for text in tokenizasyon(text):
        lemma_word = str(morphology.analyzeAndDisambiguate(str(text)).bestAnalysis()[0].getLemmas()[0])
        lemma_words.append(lemma_word)
    text = ' '.join(lemma_words)
    return text

def convert_lowercase(text):
    return text.lower()
def remove_punctuation(text):
    return ''.join(d for d in text if d not in string.punctuation)
def remove_stopwords(text):
    stopwords = []
    with open('stopwords.txt', 'r',encoding='utf-8') as f:
        for word in f:
            word = word.split('\n')
            stopwords.append(word[0])
    clean_text = ' '.join(s for  s in text.split() if s not in stopwords)
    return clean_text
def remove_numbers(text):
    text = re.sub(r'\d', '', text)
    return text
def remove_less_than_2(text):
    text = ' '.join([w for w in text.split() if len(w)>2])
    return text
def remove_extra_space(text):
    ornek_text_strip = re.sub(' +', ' ', text)
    return ornek_text_strip.strip()

In [8]:
df['text'] = df['text'].apply(lemmatizer)
df['text'] = df['text'].apply(convert_lowercase)
df['text'] = df['text'].apply(remove_punctuation)
df['text'] = df['text'].apply(remove_stopwords)
df['text'] = df['text'].apply(remove_extra_space)
df['text'] = df['text'].apply(remove_numbers)
df['text'] = df['text'].apply(remove_less_than_2)

In [9]:
texts = []
for text in df.text:
    texts.append(text)

In [None]:
# texts=[]

# for text in df.text:
#     text = text.lower()
#     text = ''.join(d for d in text if d not in string.punctuation)
#     text = remove_stopwords(text)
#     text = remove_numbers(text)
#     text = remove_less_than_2(text)
#     text = remove_extra_space(text)
#     texts.append(text)

### 1. CountVectorizer

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
max_features=500
cv = CountVectorizer(max_features=max_features) 
X = cv.fit_transform(texts).toarray()

In [11]:
X.shape

(4900, 500)

### 2. TFidfVectorizer

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

max_features=500 # "number" most common(used) words in reviews
tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=5000) 
X = tfidf.fit_transform(texts).toarray()

In [None]:
X.shape

In [None]:
y.shape

In [12]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [13]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


lgbm = LGBMClassifier()
xgb = XGBClassifier()

lgbm.fit(X_train,y_train)
xgb.fit(X_train,y_train)
y_pred_lgbm = lgbm.predict(X_test)
y_pred_xgb = xgb.predict(X_test)

In [18]:
print("Accuracy of tf-idf + lgbm:",accuracy_score(y_test, y_pred_lgbm))
print("Precision of tf-idf + lgbm:",precision_score(y_test, y_pred_lgbm,average="micro"))

print("Accuracy of countvectorizer + lgbm:",accuracy_score(y_test, y_pred_lgbm))
print("Precision of countvectorizer + lgbm:",precision_score(y_test, y_pred_lgbm,average="micro"))

print("Accuracy of countvectorizer + xgb:",accuracy_score(y_test, y_pred_xgb))
print("Precision of countvectorizer + xgb:",precision_score(y_test, y_pred_xgb,average="micro"))

print("Accuracy of tf-idf + xgb:",accuracy_score(y_test, y_pred_xgb))
print("Precision of tf-idf + xgb:",precision_score(y_test, y_pred_xgb,average="micro"))

Accuracy of tf-idf + lgbm: 0.860204081632653
Precision of tf-idf + lgbm: 0.860204081632653


Accuracy of countvectorizer + lgbm: 0.860204081632653
Precision of countvectorizer + lgbm: 0.860204081632653


In [14]:
print("Accuracy of countvectorizer + xgb:",accuracy_score(y_test, y_pred_xgb))
print("Precision of countvectorizer + xgb:",precision_score(y_test, y_pred_xgb,average="micro"))

Accuracy of countvectorizer + xgb: 0.8571428571428571
Precision of countvectorizer + xgb: 0.8571428571428571


In [19]:
print("Accuracy of tf-idf + xgb:",accuracy_score(y_test, y_pred_xgb))
print("Precision of tf-idf + xgb:",precision_score(y_test, y_pred_xgb,average="micro"))

Accuracy of tf-idf + xgb: 0.8571428571428571
Precision of tf-idf + xgb: 0.8571428571428571


## GRIDSEARCHCV

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
  
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']} 
  
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
  
# fitting the model for grid search
grid.fit(X_train, y_train)

# print best parameter after tuning
print(grid.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.147, total=  10.5s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.4s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.147, total=  11.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   21.5s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.148, total=  12.5s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.148, total=  12.9s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.148, total=  13.2s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.147, total=  12.4s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.147, total=  11.9s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.148, total=  11.9s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.148, total=  12.3s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] .

[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.700, total=  10.4s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.668, total=  10.6s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.684, total=  10.9s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.689, total=  10.9s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.677, total=  11.0s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.843, total=   4.3s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.825, total=   5.1s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] .

[CV] ..... C=1000, gamma=0.001, kernel=rbf, score=0.796, total=   3.7s
[CV] C=1000, gamma=0.001, kernel=rbf .................................
[CV] ..... C=1000, gamma=0.001, kernel=rbf, score=0.778, total=   3.6s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.788, total=   2.8s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.804, total=   2.8s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.786, total=   2.8s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.783, total=   2.8s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.782, total=   2.8s


[Parallel(n_jobs=1)]: Done 125 out of 125 | elapsed: 18.4min finished


{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
SVC(C=10, gamma=0.001)
