In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re # regular expression libary.
import nltk # Natural Language toolkit
nltk.download("stopwords")  #downloading stopwords
nltk.download('punkt')
from nltk import word_tokenize,sent_tokenize
nltk.download('wordnet')
import nltk as nlp
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from sklearn.feature_extraction.text import CountVectorizer 
from wordcloud import WordCloud 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,precision_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings("ignore")


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and echo the full path of every file.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the TTC4900 CSV from the Kaggle input directory into a DataFrame.
df=pd.read_csv("/kaggle/input/ttc4900/7allV03.csv")

In [None]:
# Work on a copy so the frame loaded into `df` stays untouched.
df_copy=df.copy()

In [None]:
df_copy.head()

In [None]:
import seaborn as sns 

# Bar chart of the number of documents per category.
# `x` is passed by keyword: seaborn >= 0.12 makes every argument after `data`
# keyword-only, so the positional form raises a TypeError there.
sns.countplot(x="category",data=df_copy)

In [None]:
from sklearn import preprocessing

# Integer-encode the category column; the fitted encoder is kept in `le`.
le = preprocessing.LabelEncoder()
category_column = df_copy["category"]
labels = le.fit_transform(category_column)

In [None]:
labels

In [None]:

"""
4==Siyaset
0==Dunya
1==ekonomi
2==kultur
3==saglik
5==spor
6==teknoloji

"""

In [None]:
from nltk.corpus import stopwords

nltk.download("stopwords")

In [None]:
# Build the cleaned corpus used by the bag-of-words models.
# The stop-word set and the lemmatizer are created once, outside the loop; the
# original rebuilt set(stopwords.words("turkish")) for every single token.
turkish_stopwords=set(stopwords.words("turkish"))
lemma=nlp.WordNetLemmatizer()  # NOTE(review): WordNet is an English lexicon — confirm it is useful for Turkish text

text_list=[]

for text in df_copy.text:
    # Turkish-aware lower-casing: plain str.lower() maps "I" to "i" (Turkish
    # expects dotless "ı") and "İ" to "i" + U+0307, whose combining dot the
    # regex below would replace with a space, splitting the word in two.
    text = text.replace("İ","i").replace("I","ı").lower()
    text = re.sub("[^abcçdefgğhıijklmnoöprsştuüvyz]"," ",text)  # keep only the letters listed in the pattern
    text=nltk.word_tokenize(text)  # split the cleaned text into word tokens
    # Drop stop words, then lemmatize what is left.
    text=[lemma.lemmatize(word) for word in text if word not in turkish_stopwords]
    text_list.append(" ".join(text))  # one cleaned string per document

In [None]:
len(text_list)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer #Bag of Words

# Vocabulary size: only the `max_features` most frequent words are kept.
max_features=500

# Bag-of-words encoder restricted to that vocabulary.
count_vectorizer=CountVectorizer(max_features=max_features)

# Fit on the cleaned corpus, then densify the document-term counts.
# (Despite its name, `sparce_matrix` is a dense NumPy array after .toarray().)
document_term_counts=count_vectorizer.fit_transform(text_list)
sparce_matrix=document_term_counts.toarray()

In [None]:
sparce_matrix.shape #4900 sentences 500 most used words

In [None]:
sparce_matrix[0:10,0:20]

In [None]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when present and fall back to the
# old method on older installations. tolist() keeps the printed form a plain list.
if hasattr(count_vectorizer,"get_feature_names_out"):
    feature_names=count_vectorizer.get_feature_names_out().tolist()
else:
    feature_names=count_vectorizer.get_feature_names()
print("Top {} the most used word by reviewers: {}".format(max_features,feature_names))

In [None]:
# Put the fitted vocabulary into a one-column DataFrame.
# get_feature_names() was removed in scikit-learn 1.2, so prefer its
# replacement and only fall back to the old method on older versions.
vocabulary=(count_vectorizer.get_feature_names_out().tolist()
            if hasattr(count_vectorizer,"get_feature_names_out")
            else count_vectorizer.get_feature_names())
data=pd.DataFrame(vocabulary,columns=["Words"])

In [None]:
data.head()

In [None]:
from wordcloud import WordCloud 
import matplotlib.pyplot as plt
# Word cloud of the vocabulary; the first five entries of `data.Words` are skipped.
plt.subplots(figsize=(12,12))
cloud_text=" ".join(data["Words"].iloc[5:])
wordcloud=WordCloud(background_color="black",width=1024,height=768)
wordcloud=wordcloud.generate(cloud_text)  # generate() returns the same WordCloud object
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [None]:
# Features are the bag-of-words counts; targets are the encoded category ids.
X=sparce_matrix
y=labels

In [None]:
# Hold out 20% of the documents for testing; the seed keeps the split repeatable.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# Print the shape of each of the four pieces.
for split_name,split_part in (("x_train",X_train),("x_test",X_test),("y_train",y_train),("y_test",y_test)):
    print(split_name,split_part.shape)

# LightGBM 

In [None]:
# Baseline LightGBM classifier constructed with no arguments (library defaults).
lgbm_model=LGBMClassifier()

# Fit on the bag-of-words training split.
lgbm_model.fit(X_train,y_train)

In [None]:
# Predict class ids for the held-out documents with the baseline model.
y_pred=lgbm_model.predict(X_test)

In [None]:
# Score the baseline LightGBM predictions on the held-out split.
# NOTE: for single-label multiclass targets, micro-averaged precision equals accuracy.
baseline_accuracy=accuracy_score(y_test, y_pred)
baseline_precision=precision_score(y_test, y_pred,average="micro")
print("Accuracy:",baseline_accuracy)
print("Precision:",baseline_precision)

## Model Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Hyper-parameter search space for LightGBM.
lgbm_params={"n_estimators":[100,500,1000,2000],  # was [100,500,100,2000]: 100 appeared twice and 1000 was never tried
            "subsample":[0.6,0.8,1.0],  # NOTE(review): LightGBM only applies subsample when subsample_freq > 0 — confirm bagging was intended
            "learning_rate":[0.1,0.01,0.02,0.05],
            "min_child_samples":[5,20,10],
            "max_depth":[3,4,5,6]}

lgbm=LGBMClassifier()
# Randomized search over lgbm_params with 10-fold cross-validation on all cores.
# random_state pins which candidates are sampled, so a re-run explores the same
# configurations and yields the same best_params_.
lgbm_cv=RandomizedSearchCV(lgbm,lgbm_params,cv=10,n_jobs=-1,verbose=2,random_state=42)
lgbm_cv_model=lgbm_cv.fit(X_train,y_train)

In [None]:
lgbm_cv_model.best_params_

In [None]:
# Refit LightGBM with the parameters the randomized search actually selected,
# rather than values copied by hand from one earlier run, which go stale as
# soon as the search samples different candidates.
lgbm=LGBMClassifier(**lgbm_cv_model.best_params_)
lgbm_tuned_model=lgbm.fit(X_train,y_train)

In [None]:
# Predict with the tuned LightGBM model; this rebinds the earlier `y_pred`.
y_pred=lgbm_tuned_model.predict(X_test)

In [None]:
# Confusion matrix of the true test labels against the current `y_pred`.
cm=confusion_matrix(y_test,y_pred)

In [None]:
# Draw the confusion matrix as an annotated heat map with integer cell labels.
plt.figure(figsize=(10,10))
sns.heatmap(cm,annot=True,fmt='d') # author's note from their run: 835 correct predictions (not recomputed here)

In [None]:
# Report the tuned LightGBM scores on the held-out split.
tuned_scores={"Accuracy:":accuracy_score(y_test, y_pred),
              "Precision:":precision_score(y_test, y_pred,average="micro")}
for score_label,score_value in tuned_scores.items():
    print(score_label,score_value)

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters and a fixed seed.
rf_model=RandomForestClassifier(random_state=42)
# Fit on the bag-of-words training split.
rf_model.fit(X_train,y_train)

In [None]:
# Evaluate the default random forest on the held-out split.
y_pred=rf_model.predict(X_test)

for metric_name,metric_value in (
    ("Accuracy:",accuracy_score(y_test, y_pred)),
    ("Precision:",precision_score(y_test, y_pred,average="micro")),
):
    print(metric_name,metric_value)

# Confusion matrix of true vs. predicted class ids, shown as an annotated heat map.
cm=confusion_matrix(y_test,y_pred)
plt.figure(figsize=(10,10))
sns.heatmap(data=cm,annot=True,fmt='d')

## Model Tuning

In [None]:
# Candidate hyper-parameter values for the random-forest randomized search.
rf_params={"max_depth":[2,5,8,10],
           "max_features":[2,5,8],
           "n_estimators":[10,500,1000],
          "min_samples_split":[2,5,10]}

In [None]:
rf=RandomForestClassifier(random_state=42)
# Randomized search over rf_params with 10-fold cross-validation on all cores.
# The forest itself is seeded, but the search needs its own random_state too;
# without it a different set of candidates is sampled on every run.
rf_cv=RandomizedSearchCV(rf,rf_params,cv=10,n_jobs=-1,verbose=2,random_state=42)
rf_cv_model=rf_cv.fit(X_train,y_train)

In [None]:
rf_cv_model.best_params_

In [None]:
# Build the tuned forest from the parameters the search actually selected,
# instead of values copied by hand from one earlier run of the search.
rf_tuned_model=RandomForestClassifier(random_state=42,**rf_cv_model.best_params_)

In [None]:
# Train the tuned random forest and score it on the held-out split.
rf_tuned_model.fit(X_train,y_train)
y_pred=rf_tuned_model.predict(X_test)

tuned_rf_accuracy=accuracy_score(y_test, y_pred)
tuned_rf_precision=precision_score(y_test, y_pred,average="micro")
print("Accuracy:",tuned_rf_accuracy)
print("Precision:",tuned_rf_precision)

# Annotated heat map of the confusion matrix.
cm=confusion_matrix(y_test,y_pred)
plt.figure(figsize=(10,10))
sns.heatmap(cm,annot=True,fmt='d')

## XGBoost Classifier

In [None]:
# XGBoost with default hyper-parameters: fit, predict, score, plot.
xgb=XGBClassifier()
xgb_model=xgb.fit(X_train,y_train)
y_pred=xgb_model.predict(X_test)

print("Accuracy:",accuracy_score(y_true=y_test, y_pred=y_pred))
print("Precision:",precision_score(y_true=y_test, y_pred=y_pred,average="micro"))

cm=confusion_matrix(y_true=y_test,y_pred=y_pred)

# Create the figure and its single axes explicitly, then draw the heat map on it.
_, heatmap_ax=plt.subplots(figsize=(10,10))
sns.heatmap(cm,annot=True,fmt='d',ax=heatmap_ax)

In [None]:
# Print the classification report for the current `y_pred`, which shows how
# well the model predicts the labels of each individual class.
print(classification_report(y_test, y_pred)) 

I didn't tune my XGBoost model because it took too much time.

In [None]:
X_test[10] # I use this text for prediction

In [None]:
y_test[10] # true label of X_test[10] 3== health

In [None]:
# Predict the class of the single document X_test[10]. reshape(1,-1) turns the
# feature vector into a one-row batch without hard-coding the vocabulary size;
# the original reshape(-1,500) breaks as soon as max_features changes.
xgb_model.predict(X_test[10].reshape(1,-1)) # predicted label of X_test[10]; it matched the true label in the author's run

In [None]:
import collections 

# Find the row(s) of the full count matrix equal to X_test[10], i.e. where that
# document sits in the original dataset.
# The original compared collections.Counter(row) objects, which only checks that
# two rows hold the same multiset of count values; rows with the same counts
# attached to different words would match as well. Compare element-wise instead.
target_row=X_test[10]
for index in np.flatnonzero((sparce_matrix==target_row).all(axis=1)):
    print(index)
        
        
    

In [None]:
# inverse_transform expects a 2-D (n_samples, n_features) input, so the single
# row is passed as a one-row slice; current scikit-learn input validation
# rejects a bare 1-D array.
# NOTE: 2827 is the row index printed by the lookup cell in the author's run.
count_vectorizer.inverse_transform(sparce_matrix[2827:2828])
# These are the words of X_test[10] (tokenized and cleaned); several relate to health (3), e.g. ilaç == medicine, bilim == science.

In [None]:
df_copy.text[2827] # original text 

## Turkish Text Classification with Deep Learning

In [None]:
# Start the deep-learning pipeline from a fresh copy of the raw text column.
# Note: this rebinds `X`, which previously held the bag-of-words matrix.
X=df_copy.text.copy()


In [None]:
X.head()

In [None]:
# Clean the raw texts for the neural model: Turkish-aware lower-casing, letters
# only, stop words removed (this pipeline does no lemmatization).
# The stop-word set is built once; the original rebuilt it for every token.
turkish_stopwords=set(stopwords.words("turkish"))

X_list=[] # cleaned texts, one string per document

for text in X:
    # str.lower() alone maps "I" to "i" (Turkish expects dotless "ı") and "İ"
    # to "i" + U+0307; the regex below would replace that combining dot with a
    # space and split the word, so both capitals are mapped explicitly first.
    text = text.replace("İ","i").replace("I","ı").lower()
    text = re.sub("[^abcçdefgğhıijklmnoöprsştuüvyz]"," ",text)
    words=[word for word in text.split() if word not in turkish_stopwords]
    X_list.append(" ".join(words))

In [None]:
# Copy the encoded labels into a plain Python list, one entry per document.
y_label=list(labels)

In [None]:
y_label[0:10]

In [None]:
# 80/20 shuffled split of the cleaned texts and their labels (seed 42).
# Note: this rebinds X_train/X_test/y_train/y_test from the bag-of-words
# section; from here on the X_* variables hold lists of strings.
X_train, X_test, y_train, y_test = train_test_split(X_list, y_label, test_size=0.2, random_state=42,shuffle=True)

In [None]:
# Print the size of each split: train texts, train labels, test texts, test labels.
# (Author's run: 3920 training and 980 test samples.)
for split_part in (X_train,y_train,X_test,y_test):
    print(len(split_part))

In [None]:
# Every sequence is padded or truncated to this many tokens.
max_lenght=100

# Build the word -> integer-id dictionary from the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

# Replace each word with its id from word_index, then pad at the end
# (padding='post') so every row has max_lenght entries.
sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(sequences, maxlen=max_lenght, padding='post')

vocabulary_count = len(word_index)
print(vocabulary_count) # 94836 words in the author's run

In [None]:
# Show the first training sample at each preprocessing stage.
print("Original Version:",X_train[0])
print("---------------------------------")
print("Padded version",X_train_padded[0]) 
print("---------------------------------")
print("Tokenized version:",sequences[0])  # each word replaced by its id from word_index
print("---------------------------------")
print("Shape after the padding:",X_train_padded.shape) # padding gives every input the same length

In [None]:
# Encode the test texts with the tokenizer fitted on the training texts,
# then pad them to the same length as the training sequences.
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences,padding="post",maxlen=max_lenght)

In [None]:
# Show the first test sample at each preprocessing stage.
print("Original Version:",X_test[0])
print("---------------------------------")
print("Padded version",X_test_padded[0])   # padded to max_lenght entries
print("---------------------------------")
print("Tokenized version:",X_test_sequences[0]) # each word replaced by its id from word_index
print("---------------------------------")
print("Shape after the padding:",X_test_padded.shape) # padding gives every input the same length

In [None]:
import tensorflow as tf

# +1 because the Keras Tokenizer never assigns id 0 to a word (0 is reserved).
vocab_size = len(tokenizer.word_index)+1
embedding_dim=16

model = tf.keras.Sequential([

    # input_length follows max_lenght instead of a second hard-coded 100, so the
    # model stays in sync with the padding length used for the inputs.
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_lenght),

    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.BatchNormalization(),

    tf.keras.layers.Flatten(),

    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.2),

    # One softmax output per category (7 classes).
    tf.keras.layers.Dense(7, activation='softmax')
])

# `lr` is a deprecated alias that newer Keras releases reject;
# `learning_rate` is the supported keyword.
adam= tf.keras.optimizers.Adam(learning_rate=0.01)

# Sparse categorical cross-entropy: targets are integer class ids, not one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy',optimizer=adam,metrics=['accuracy'])
model.summary()

In [None]:
# Convert the label lists to NumPy column vectors of shape (n_samples, 1).
y_train_label=np.asarray(y_train).reshape(-1,1)
y_test_label=np.asarray(y_test).reshape(-1,1)

In [None]:
# Train for num_epochs epochs in mini-batches of BATCH_SIZE.
# NOTE: the test split is passed as validation data, so the val_* curves are
# computed on the same data that is later used for the final evaluation.
num_epochs = 10
BATCH_SIZE=64
history=model.fit(X_train_padded,y_train_label,batch_size=BATCH_SIZE ,epochs=num_epochs, validation_data=(X_test_padded,y_test_label))

In [None]:
import matplotlib.pyplot as plt


# Training accuracy and loss per epoch on one set of axes.
for history_key,curve_color in (("accuracy","green"),("loss","red")):
    plt.plot(history.history[history_key],color=curve_color)
plt.title("Train accuracy and Train loss")
plt.legend(["Accuracy","Loss"])
plt.grid()

In [None]:
# Validation accuracy and loss per epoch on one set of axes.
val_curves=(("val_accuracy","blue"),("val_loss","orange"))
for history_key,curve_color in val_curves:
    plt.plot(history.history[history_key],color=curve_color)
plt.title("Test accuracy and Test loss")
plt.legend(["Val_accuracy","Val_loss"])
plt.grid()

In [None]:
# model.evaluate returns [loss, accuracy]; report accuracy as a percentage.
# Each evaluation is run immediately before its own print, as in the original.
train_metrics=model.evaluate(X_train_padded,y_train_label)
print("Accuracy of the model on Training Data is - " , train_metrics[1]*100 , "%")
test_metrics=model.evaluate(X_test_padded,y_test_label)
print("Accuracy of the model on Testing Data is - " , test_metrics[1]*100 , "%")

The LSTM model overfits the training data.

In [None]:
# Sequential.predict_classes() was removed in TensorFlow 2.6. For a softmax
# output the predicted class is the index of the highest probability.
pred = np.argmax(model.predict(X_test_padded), axis=-1)

In [None]:
pred[10:20]

In [None]:
y_test_label[10:20]

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the LSTM predictions, drawn as an annotated heat map.
cm = confusion_matrix(y_test,pred)
plt.figure(figsize=(10,10))
sns.heatmap(cm,annot=True,linecolor="white",fmt='')


In [None]:
# Classification report for the LSTM predictions.
# Author's observation from their run: the model classifies classes 3 and 5 best.
print(classification_report(y_test_label,pred))

So, this is the end of my notebook. Thanks for taking a look at it. I hope you like it :)

I changed my LSTM model more than 15 times, but I could not prevent overfitting.
I think classical machine-learning algorithms work much better than the LSTM model for this dataset, but you may achieve a higher score than me.

If you have a solution, please feel free to share your thoughts in the comment section.