In [None]:
import os

import numpy as np 
import pandas as pd 

# Print the full path of every file found under the Kaggle input directory.
for input_path in (os.path.join(root, name)
                   for root, _subdirs, names in os.walk('/kaggle/input')
                   for name in names):
    print(input_path)

In [None]:
# Override Python's built-in str class to support Turkish lower-casing
class UnicodeTr(str):
    """``str`` subclass whose ``lower()`` applies the Turkish dotted/dotless I rules.

    ``CHAR_MAP`` holds the Turkish-specific case pairs. Only the ``to_lower``
    table is used by this class; ``to_upper`` is kept for callers that read it.
    """

    CHAR_MAP = {
        "to_upper": {
            "ı": "I",
            "i": "İ",
        },
        "to_lower": {
            "I": "ı",
            "İ": "i",
        }
    }

    def lower(self):
        """Return a lower-cased plain ``str`` using the Turkish I/İ mapping.

        The characters listed in ``CHAR_MAP['to_lower']`` are replaced first,
        then the standard ``str.lower`` handles everything else.
        """
        # Work on a plain str so the final .lower() below is str.lower and can
        # never re-enter this method (which would recurse forever, e.g. when
        # the to_lower table is empty and nothing gets replaced).
        text = str(self)
        for key, value in self.CHAR_MAP['to_lower'].items():
            text = text.replace(key, value)
        return text.lower()

In [None]:
import re

# regular expression for Turkish word tokenizer
def compile_word_tokenizer_regex():
    """Build the case-insensitive regex used to split Turkish text into tokens.

    The pattern is an alternation; at each position the alternatives are tried
    in the order listed below and the first one that matches wins.
    """
    alternatives = (
        # letters, an apostrophe, an optional space, then a letter suffix
        r"[a-zğçşöüı]{3,}' ?[a-zğçşöüı]+",
        # a percent sign, at least two digits, then digits/separators
        r"%\d{2,}[.,:/\d-]+",
        # any run of letters, digits and the listed symbols
        r"[a-zğçşöüı_+%\.()@&`’/\\\d-]+",
        # optional letters followed by one punctuation mark
        r"[a-zğçşöüı]*[,!?;:]",
    )
    return re.compile("|".join(alternatives), re.I)


# Compile the tokenizer regex a single time and reuse it for every call
word_tokenizer_pre_compiled_regex = compile_word_tokenizer_regex()

In [None]:
from typing import Tuple

# Main word tokenizer: splits a sentence into tokens with a compiled regular expression.
# The result is returned as a tuple.
def word_tokenize(sentence: str, word_regex) -> Tuple:
    """Split ``sentence`` into a tuple of tokens using ``word_regex``.

    Args:
        sentence: Text to tokenize.
        word_regex: Compiled pattern whose ``findall`` yields the tokens.

    Returns:
        The tokens in order of appearance. If the last token ends with one or
        more dots directly after a word character, the dots are split off into
        a separate final token. An empty tuple is returned when nothing
        matches or when ``findall`` raises ``re.error`` / ``TypeError``
        (for example when ``sentence`` is not a string).
    """
    try:
        words = tuple(word_regex.findall(sentence))
    except (re.error, TypeError):
        return ()

    if words:
        # If the last word ends with dots, the dots become a token of their own
        end_dots = re.search(r'\b(\.+)$', words[-1])
        if end_dots:
            dots = end_dots.group(1)
            words = words[:-1] + (words[-1][:-len(dots)], dots)
    return words

In [None]:
# Read the CSV and strip surrounding white space from the category names
data_path = '/kaggle/input/ttc4900/7allV03.csv'
data = (
    pd.read_csv(data_path, encoding='utf-8', sep=',')
    .assign(category=lambda frame: frame['category'].str.strip())
)

# Preview the first rows
data.head()

In [None]:
# Number of rows per category.
# The printed counts show whether the classes are balanced.

unique_categories = data.category.unique()
for category in unique_categories:
    row_count = len(data[data['category'] == category])
    print(f"{category}: {row_count}")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing (fixed seed), then renumber both
# parts from 0 so they can be indexed positionally.
train, test = (
    part.reset_index(drop=True)
    for part in train_test_split(data, test_size=0.2, random_state=42)
)

## Pre-processing

In [None]:
# Read the Turkish stop-word list: one entry per line, newline characters removed
with open('/kaggle/input/turkish-stop-words/tr_stop_words', 'r',
          encoding='utf-8') as fp:
    stop_words = {line.replace('\n', '') for line in fp}

# Characters removed during normalization. The underscore is not listed,
# so the `__label__` prefix passes through untouched.
punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~'
def pre_process(line: str):
    """Normalize ``line`` and drop stop words.

    Steps: delete every character in ``punctuation``, lower-case with the
    Turkish rules of ``UnicodeTr``, strip the ends, replace each run of
    spaces and each digit or newline with a single space, tokenize, and
    remove tokens found in ``stop_words``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    without_punctuation = line.translate(str.maketrans('', '', punctuation))
    lowered = UnicodeTr(without_punctuation).lower().strip()
    normalized = re.sub(r'( +)|([\d\n])', ' ', lowered)

    tokens = word_tokenize(normalized, word_tokenizer_pre_compiled_regex)
    return ' '.join(token for token in tokens if token not in stop_words)


# Reformat data
def write_fasttext_file(frame, path):
    """Add a ``label_format`` column to ``frame`` and write it to ``path``.

    Every row is turned into ``__label__<category> <text>`` and passed through
    ``pre_process``. The column is assigned in one step rather than element by
    element through chained indexing (``frame.label_format[i] = ...``), which
    pandas does not guarantee to write back to the frame and which also mixed
    strings into an integer-initialized column.

    Args:
        frame: DataFrame with ``category`` and ``text`` columns; modified in
            place by adding/replacing the ``label_format`` column.
        path: Destination file; one pre-processed line per row, no header
            and no index.
    """
    frame["label_format"] = [
        pre_process(f'__label__{category} {text}')
        for category, text in zip(frame["category"], frame["text"])
    ]
    frame["label_format"].to_csv(path, index=False, header=False, sep=',')


write_fasttext_file(train, 'ttc4900.train')
write_fasttext_file(test, 'ttc4900.test')

## Fasttext Train

In [None]:
from fasttext import train_supervised

# Hyper-parameters for the supervised fastText run on the training file
training_options = {
    'epoch': 50,
    'lr': 1,
    'label_prefix': '__label__',
    'wordNgrams': 2,
    'dim': 100,
}
model = train_supervised('ttc4900.train', **training_options)

In [None]:
# Training-set evaluation
# NOTE(review): fastText's `test` presumably returns a tuple of
# (number of samples, precision@1, recall@1) rather than a single accuracy
# figure; confirm against the fastText Python documentation.
model.test('ttc4900.train')

In [None]:
# Held-out test-set evaluation
# NOTE(review): fastText's `test` presumably returns a tuple of
# (number of samples, precision@1, recall@1) rather than a single accuracy
# figure; confirm against the fastText Python documentation.
model.test('ttc4900.test')

In [None]:
# Quantize the trained model, retraining on the training file with the same
# learning rate and epoch count used for the initial training run
model.quantize(
    input='ttc4900.train',
    cutoff=100_000,
    retrain=True,
    epoch=50,
    lr=1,
    qnorm=True,
    verbose=True,
)

model.is_quantized()  # True
# Persist the quantized model to the working directory
model.save_model('ttc4900.model.quantised')  # just 6.4 MB

In [None]:
# List the working directory, where the earlier cells wrote
# ttc4900.train, ttc4900.test and ttc4900.model.quantised
!ls -l

In [None]:
# Single Test
import time

text = """
Özdağ'ın açıklamalarından satır başları:

Evet, bekliyordum. Çünkü sizin programınıza katıldıktan sonra Genel Başkan başta olmak üzere Genel Merkez yetkililerinden doğrusu çok ağır, hakaret içeren bir söylemle karşılaştım. Buna 'üzülmedim' desem yalan olur. Neticede aynı siyasal hareket içerisinde, daha sonra da aynı siyasi partide, partinin kuruşundan itibaren birlikte çalıştığımız arkadaşlar... İhraç edilmeyi bekliyordum. Davet edebilirlerdi, konuşabilirdik. Bunun yerine bana çok ağır ifadelerle saldırmaya başladılar.
"""

# Time pre-processing plus prediction for one document.
# perf_counter is used instead of time.time because it is a monotonic,
# high-resolution clock meant for measuring short intervals.
start = time.perf_counter()
# NOTE(review): k=-1 with threshold=0.4 presumably returns every label whose
# probability is at least 0.4; confirm against the fastText documentation.
prediction = model.predict(
    pre_process(text),
    k=-1, threshold=0.4)
elapsed_ms = (time.perf_counter() - start) * 1_000

print(f'{prediction} in {elapsed_ms} miliseconds')  # 0.9534 milisecond


## **Conclusion**

With very little data, feature engineering combined with logistic regression stacked with a hierarchical softmax layer makes it possible to build an almost well-fitted model.
Future work: the data should be augmented, and precision should be raised above 95%.

Please do not hesitate to contact me if you have a bright idea.

e-mail: apdullah.yayik@mobildev.com