In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string 
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt #grafik çizme
import re #düzenli ifadeler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score #Başarı skoru ölçümleri


import os
# Print the full path of every file under the Kaggle input directory so the
# dataset location is visible in the cell output.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in file_names):
        print(path)

In [None]:
# Load the tweet data set; the CSV is read with the Windows-1254 code page.
df = pd.read_csv("/kaggle/input/siber-zorbalk/tweetset.csv", encoding="windows-1254")

# Report the number of missing values per column before anything is dropped.
print("Kayip Veriler :{}".format(df.isnull().sum()))

# Drop the four "Unnamed" columns (per the original note, they have too many
# missing values to be repaired). A new frame is returned instead of mutating
# in place, so the cell stays safe to re-run.
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4", "Unnamed: 5"])

# Label encoding: "Negatif" -> 0, every other value of "Tip" (including
# missing values) -> 1. Vectorized form of the same mapping.
df["sinif"] = (df["Tip"] != "Negatif").astype(int)
df.head()

In [None]:

# Special-character removal.
# Translation table that deletes every ASCII punctuation character; built once
# here instead of once per call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def ozel_karakter(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Only the characters listed in ``string.punctuation`` are deleted; letters,
    digits, whitespace and non-ASCII symbols are left untouched. Characters
    are removed, not replaced, so words separated only by punctuation are
    joined together (``"a,b"`` becomes ``"ab"``).

    Args:
        text: The string to clean.

    Returns:
        A new string without ASCII punctuation.
    """
    return text.translate(_PUNCTUATION_TABLE)


# Matches shortened Twitter links such as "https://t.co/AbC123". The dot is
# escaped so that only the literal host "t.co" matches.
_TCO_URL_RE = re.compile(r"https?://t\.co/[A-Za-z0-9]+")

# Maps each of the listed punctuation characters to a single space.
_PUNCT_TO_SPACE = str.maketrans({ch: " " for ch in "@#!?+&*[]-%.:/();$=><|{}^'`"})


def clean(tweet):
    """Remove t.co links from ``tweet`` and blank out selected punctuation.

    The link is deleted first, while its ``:``, ``/`` and ``.`` characters are
    still present; afterwards every character in the punctuation set is
    replaced by one space, so adjacent words stay separated.

    Args:
        tweet: The raw tweet text.

    Returns:
        The tweet without t.co links and with the listed punctuation
        characters replaced by spaces. Whitespace is not collapsed.
    """
    without_links = _TCO_URL_RE.sub("", tweet)
    return without_links.translate(_PUNCT_TO_SPACE)

# Stop-word removal.
# Lazily filled cache so the NLTK corpus is read once, not once per tweet.
_STOP_WORD_CACHE = {}


def _turkish_stop_words():
    """Return NLTK's Turkish stop-word list as a frozenset, loading it once."""
    if "turkish" not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE["turkish"] = frozenset(stopwords.words("turkish"))
    return _STOP_WORD_CACHE["turkish"]


def stop_words_temizle(text):
    """Drop Turkish stop words and purely numeric tokens, then join the rest.

    Tokens are compared in lower-case form so that capitalized stop words are
    removed as well (NLTK's list is expected to be lower-case - TODO confirm).
    Tokens that are kept retain their original casing.

    Args:
        text: An iterable of string tokens; the notebook passes the list
            produced by ``str.split()``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    stop_words = _turkish_stop_words()
    kept_tokens = [
        token
        for token in text
        if token.lower() not in stop_words and not token.isnumeric()
    ]
    return " ".join(kept_tokens)


# Text-cleaning pipeline. The order matters:
#   1. clean()          - remove t.co links while their ':', '/' and '.' are
#                         still present, and turn the listed punctuation into
#                         spaces so neighbouring words stay separated.
#   2. ozel_karakter()  - delete any remaining ASCII punctuation.
#   3. lower-case       - done before stop-word removal so capitalized stop
#                         words can match the stop-word list.
#   4. split + stop_words_temizle() - tokenize, drop stop words and numbers.
# Running ozel_karakter() first (as before) stripped the characters that the
# URL pattern in clean() needs, so links were never removed.
normalized_text = (
    df["Paylaşım"]
    .apply(clean)
    .apply(ozel_karakter)
    .str.lower()
)

# "ozel_karakter" holds the token list, "stop_word" the final cleaned string.
df["ozel_karakter"] = normalized_text.str.split()
df["stop_word"] = df["ozel_karakter"].apply(stop_words_temizle)
df.head()

In [None]:
# Pie chart showing how the tweets are split between the two classes.
positive_total = int((df["sinif"] == 1).sum())
negative_total = int((df["sinif"] == 0).sum())
slices = [positive_total, negative_total]
labels = ["Pozitif", "Negatif"]
colors = ["g", "r"]

fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(
    slices,
    labels=labels,
    colors=colors,
    startangle=90,
    shadow=True,
    explode=(0, 0.1),  # pull the second ("Negatif") wedge out slightly
    autopct="%1.1f%%",
)
plt.show()

In [None]:
# Remove the columns that are no longer needed: the intermediate token list
# and the raw text / raw label columns.
df = df.drop(columns=["ozel_karakter", "Paylaşım", "Tip"])
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Split the data set: 80% for training, 20% held out for testing.
tweet_texts = df["stop_word"].values.astype('U')  # force a unicode string array
tweet_labels = df["sinif"]
X_train, X_test, Y_train, Y_test = train_test_split(
    tweet_texts,
    tweet_labels,
    test_size=0.2,
    random_state=100,
)

In [None]:
# Build bag-of-words count vectors with CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer

# Fit the vocabulary on the training texts only and encode them as token counts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
# Show the shape of the resulting count matrix.
print(X_train_counts.shape)

In [None]:
# Build TF-IDF vectors: the count vectors are converted into TF-IDF weights.

from sklearn.feature_extraction.text import TfidfTransformer  # TF-IDF
# Fit the transformer on the training counts only and re-weight them.
tfidf =TfidfTransformer() 
X_train_tfidf = tfidf.fit_transform(X_train_counts)

# Show the shape of the resulting TF-IDF matrix.
print(X_train_tfidf.shape)

In [None]:


from sklearn.naive_bayes import MultinomialNB

# Train a Multinomial Naive Bayes classifier on the raw count vectors.
# `clf` holds the value returned by fit() (scikit-learn's fit() returns the
# estimator itself, so it refers to the fitted `mnb`).
mnb = MultinomialNB()
clf = mnb.fit(X_train_counts, Y_train)

from sklearn.linear_model import LogisticRegression

# Train a Logistic Regression classifier on the TF-IDF vectors of the training
# split. This cell only fits the model; evaluation on the held-out 20% test
# split happens in the later cells.
lgr = LogisticRegression()
lgr.fit(X_train_tfidf,Y_train)


In [None]:
# Encode the test texts with the vectorizer and TF-IDF transformer that were
# fitted on the training data (transform only - nothing is refitted here).
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf.transform(X_test_counts)

In [None]:
# Predict the test labels with both models:
#   y_pred  - Multinomial Naive Bayes on count vectors
#   y_pred2 - Logistic Regression on TF-IDF vectors
y_pred = clf.predict(X_test_counts)
y_pred2 = lgr.predict(X_test_tfidf)

# Preview the first five test tweets with their predicted class, first for
# the Naive Bayes predictions and then for the Logistic Regression ones.
for predictions in (y_pred, y_pred2):
    for text, sentiment in zip(X_test[:5], predictions):
        print("\n %r => %s" % (text, sentiment))

In [None]:
# Test-set accuracy of the Logistic Regression model (y_pred2 comes from lgr.predict).
accuracy_score(Y_test, y_pred2)


In [None]:
# Test-set accuracy of the Multinomial Naive Bayes model (y_pred comes from clf.predict).
accuracy_score(Y_test, y_pred)

In [None]:
# Test Result
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


# Report each metric as a percentage rounded to two decimals. For every
# metric the first line belongs to y_pred (Naive Bayes) and the second line
# to y_pred2 (Logistic Regression).
metric_reports = (
    ("Accuracy Score %", accuracy_score),
    ("f-1 Score = %", f1_score),
    ("Precision = %", precision_score),
    ("Recall Score = %", recall_score),
)
for prefix, metric_fn in metric_reports:
    for predictions in (y_pred, y_pred2):
        print(f"{prefix}{round(metric_fn(Y_test, predictions)*100, 2)}")