**Metin Sınıflandırma (Text Classification)**
Metin sınıflandırma, bir metnin içeriğine göre belirli bir kategoriye atanması işlemidir.

In [None]:
import pandas as pd

In [None]:
# Read the spam dataset from disk, decoding the file as Latin-1.
data = pd.read_csv(
    "spam.csv",
    encoding="latin1",
)
# Print a summary of the frame: columns, non-null counts, dtypes, memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [None]:
# Preview the first five rows of the raw frame.
data.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


*   **axis=0** → satırlar
*   **axis=1** → sütunlar
*   **inplace=True**, data üzerinde doğrudan değişiklik yapılmasını sağlar.
*   **data.columns = ["label", "text"]**, sütun adlarını tamamen yenileriyle değiştirir.





In [None]:
# Remove the three mostly-empty "Unnamed" columns. errors="ignore" keeps this
# cell re-runnable: on a second execution the columns are already gone and a
# plain drop would raise KeyError.
data.drop(
    ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
    axis=1,
    inplace=True,
    errors="ignore",
)
# Give the remaining columns descriptive names. rename() maps by name, so it
# is a no-op when the columns are already renamed and does not break if more
# columns are added later, unlike assigning a fixed-length list to
# data.columns.
data.rename(columns={"v1": "label", "v2": "text"}, inplace=True)
# Show the first 20 rows of the cleaned frame.
data.head(20)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [None]:
# Missing-value check: count the NaN entries in every column.
missing_per_column = data.isna().sum()
print(missing_per_column)

label    0
text     0
dtype: int64


In [None]:
# Text cleaning and preprocessing: strip non-letter characters, lowercase,
# tokenize, remove stopwords, lemmatize.
import re

import nltk
from nltk.corpus import stopwords  # English stopword list
from nltk.stem import WordNetLemmatizer  # lemmatization

nltk.download("stopwords")  # very common words that carry little meaning; removed from the text
nltk.download("wordnet")  # lexical database the lemmatizer relies on
# Multilingual WordNet data. The package id is "omw-1.4" (with a dot);
# "omw-1-4" is not in the NLTK index and the download fails.
nltk.download("omw-1.4")

text = list(data.text)
lemmatizer = WordNetLemmatizer()

# Build the stopword set once. Calling stopwords.words("english") inside the
# loop rebuilds the list for every single word and makes membership tests
# linear scans.
english_stopwords = set(stopwords.words("english"))

corpus = []
for message in text:
    r = re.sub("[^a-zA-Z]", " ", message)  # replace every non-letter character with a space
    r = r.lower()  # convert to lowercase
    r = r.split()  # split into words on whitespace
    r = [word for word in r if word not in english_stopwords]  # drop stopwords
    r = [lemmatizer.lemmatize(word) for word in r]  # reduce each word to its lemma
    corpus.append(" ".join(r))  # rebuild a single cleaned string

# Store the cleaned text next to the original in a new column.
data["text2"] = corpus

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Error loading omw-1-4: Package 'omw-1-4' not found in
[nltk_data]     index


In [None]:
# Show the cleaned text column (pandas truncates the middle of long Series).
cleaned_text = data["text2"]
print(cleaned_text)

0       go jurong point crazy available bugis n great ...
1                                 ok lar joking wif u oni
2       free entry wkly comp win fa cup final tkts st ...
3                     u dun say early hor u c already say
4                     nah think go usf life around though
                              ...                        
5567    nd time tried contact u u pound prize claim ea...
5568                            b going esplanade fr home
5569                                 pity mood suggestion
5570    guy bitching acted like interested buying some...
5571                                       rofl true name
Name: text2, Length: 5572, dtype: object


In [None]:
# Split the dataset into a training set and a test set.
from sklearn.model_selection import train_test_split

X = data["text2"]  # features: the cleaned text
y = data["label"]  # target variable

# Hold out 20% of the rows for testing; a fixed random_state yields the same
# train/test partition on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
)

In [None]:
# Feature extraction: bag of words.
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer that builds a word vocabulary and counts occurrences per document.
cv = CountVectorizer()

# Training data: learn the vocabulary (fit), then convert to a count matrix (transform).
X_train_cv = cv.fit_transform(X_train)

# Test data: convert using the vocabulary learned from the training data only.
X_test_cv = cv.transform(X_test)

In [None]:
# Model training.
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so repeated runs build the same model and report the same
# metrics; without random_state the fitted tree can differ between runs even
# on identical data. The value matches the seed used for the train/test split.
dt = DecisionTreeClassifier(random_state=1)
dt.fit(X_train_cv, y_train)  # fit on the bag-of-words training matrix


In [None]:
# Model evaluation on the held-out test set.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict a label for every test document.
prediction = dt.predict(X_test_cv)

# Confusion matrix of true labels against predicted labels.
c_matrix = confusion_matrix(y_test, prediction)
print(c_matrix)

# Overall accuracy, followed by per-class precision / recall / F1.
test_accuracy = accuracy_score(y_test, prediction)
print("Başarı ", test_accuracy)
metrics_report = classification_report(y_test, prediction)
print("Performans metrikleri \n ", metrics_report)

[[962  14]
 [  9 130]]
Başarı  0.979372197309417
Performans metrikleri 
                precision    recall  f1-score   support

         ham       0.99      0.99      0.99       976
        spam       0.90      0.94      0.92       139

    accuracy                           0.98      1115
   macro avg       0.95      0.96      0.95      1115
weighted avg       0.98      0.98      0.98      1115

