In [None]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Load the training and test CSVs of the competition, in that order.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/nlp-getting-started/train.csv',
        '../input/nlp-getting-started/test.csv',
    )
)

In [None]:
# Remove the columns that are not used further on. `DataFrame.drop` returns a
# new frame instead of modifying in place, so the result has to be bound back
# to `df_train`; `errors='ignore'` keeps the cell safe to re-run once the
# columns are already gone.
df_train = df_train.drop(columns=['keyword', 'location'], errors='ignore')

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Class balance of the training labels. The number in brackets is the `target`
# value counted on that line, so it must agree with the value being looked up.
# (assumes target == 1 marks the "Real" class, as the original filter did —
# TODO confirm against the dataset description)
label_counts = df_train['target'].value_counts()

print('Total data: ', df_train.shape[0], 'data\n')
print('Label:')
print('-- [1] Real: ', label_counts.get(1, 0), 'data')
print('-- [0] Fake: ', label_counts.get(0, 0), 'data')

**Case Folding**

In [None]:
import re

def casefolding(text):
    """Normalize a raw tweet to lower-case text made of word characters only.

    Steps, in order: lower-case the text, remove URLs, remove @mentions,
    remove (optionally signed) integers, remove every remaining character
    that is neither a word character nor whitespace, and trim both ends.

    Args:
        text: The raw text as a ``str``.

    Returns:
        The cleaned string. Interior whitespace is not collapsed, so removed
        tokens can leave runs of spaces behind.
    """
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # URLs
    # Mentions are matched with \w so that handles containing "_" are removed
    # whole. The punctuation pass below keeps "_" (it is a word character), so
    # a narrower pattern would leave "_name" fragments in the output.
    text = re.sub(r'@\w*', '', text)
    text = re.sub(r'[-+]?[0-9]+', '', text)            # numbers
    text = re.sub(r'[^\w\s]', '', text)                # punctuation
    return text.strip()

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

stopwords_eng = stopwords.words('english')
# Set copy used for lookups: the membership test runs once per token of every
# tweet, and scanning the list each time is needlessly slow.
_stopwords_eng_set = frozenset(stopwords_eng)

def remove_stop_words(text):
    """Remove English stop words from a whitespace-separated string.

    Args:
        text: Text to filter; it is split on whitespace and tokens are
            compared to the stop-word list exactly as written (case-sensitive).

    Returns:
        The remaining tokens joined by single spaces ("" if none remain).
    """
    # NOTE(review): if punctuation has already been stripped upstream,
    # contractions such as "dont" will presumably no longer match stop-word
    # entries spelled with an apostrophe — confirm whether that matters.
    return " ".join(
        word for word in text.split() if word not in _stopwords_eng_set
    )

In [None]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()

def stemming(text):
    """Stem every whitespace-separated token of ``text`` with the Porter stemmer.

    ``PorterStemmer.stem`` works on a single word. Handing it a whole sentence
    treats the sentence as one long word, so at most its tail is altered. The
    text is therefore split first and each token is stemmed on its own.

    Args:
        text: Text whose tokens are separated by whitespace.

    Returns:
        The stemmed tokens joined by single spaces ("" for empty input).
    """
    return " ".join(ps.stem(token) for token in text.split())

In [None]:
# Walk one training tweet through each preprocessing stage and print the
# intermediate results side by side.
raw_sample = df_train['text'].iloc[2]
case_folding = casefolding(raw_sample)
stopword_removal = remove_stop_words(case_folding)
text_stemming = stemming(stopword_removal)

for stage_label, stage_text in (
    ('Raw data\t\t: ', raw_sample),
    ('Case folding\t\t: ', case_folding),
    ('Stopword removal\t: ', stopword_removal),
    ('Stemming\t\t: ', text_stemming),
):
    print(stage_label, stage_text)

In [None]:
def text_preprocessing_process(text):
    """Run the complete cleaning pipeline on one text string.

    Applies ``casefolding``, then ``remove_stop_words``, then ``stemming``,
    each on the output of the previous step, and returns the final string.
    """
    return stemming(remove_stop_words(casefolding(text)))

In [None]:
# Store the preprocessed version of every tweet in a new 'clean text' column,
# then display the frame.
df_train['clean text'] = df_train['text'].map(text_preprocessing_process)
df_train

In [None]:
# Model inputs (cleaned text) and labels (the `target` column).
x, y = df_train['clean text'], df_train['target']

x.shape, y.shape

In [None]:
%%time

from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram-only TF-IDF vectorizer (ngram_range=(1, 1)), fitted on the cleaned text.
# NOTE(review): the fit uses all of `x`, i.e. it happens before the train/test
# split made further down, so the held-out rows also contribute to the
# vocabulary and IDF weights — confirm that is acceptable for the reported scores.
tfidf = TfidfVectorizer(ngram_range=(1,1))
tfidf.fit(x)

In [None]:
# Vectorize the cleaned tweets with the fitted vectorizer and expand the result
# into a dense array via `.toarray()`.
x_tfidf = tfidf.transform(x).toarray()

In [None]:
import pickle

# Persist the TF-IDF feature matrix to disk.
with open('tf_idf_feature.pickle', 'wb') as pickle_file:
    pickle.dump(x_tfidf, pickle_file)

In [None]:
# Rebind x / y to NumPy arrays: the TF-IDF matrix as features, the labels as targets.
x, y = np.array(x_tfidf), np.array(y)

In [None]:
from sklearn.feature_selection import SelectKBest 
from sklearn.feature_selection import chi2 

# Score every feature against the labels with the chi-squared statistic.
# With k='all' the selector keeps every feature, so the "reduced" count printed
# below is the same as the original count.
chi2_features = SelectKBest(chi2, k='all') 
x_kbest_features = chi2_features.fit_transform(x, y) 
  
# Feature counts before and after selection
print('Original feature number:', x.shape[1]) 
print('Reduced feature number:', x_kbest_features.shape[1])

In [None]:
# Selector's keep/drop mask, and the vectorizer's output feature names.
mask = chi2_features.get_support()
feature = tfidf.get_feature_names_out()

In [None]:
# Keep the terms whose mask entry is truthy.
# The comprehension variable is deliberately not called `bool`: a loop variable
# with that name at notebook scope rebinds the builtin for every later cell.
# Binding `selected_feature` after the comprehension also guarantees it exists
# when there is nothing to iterate over.
new_feature = [term for keep, term in zip(mask, feature) if keep]
selected_feature = new_feature

selected_feature

In [None]:
# Restrict the vectorizer's vocabulary mapping to the selected terms.
# Membership is checked against a set so the pass over the vocabulary is
# linear instead of scanning the `selected_feature` list for every term.
selected_terms = set(selected_feature)
kbest_feature = {
    term: index
    for term, index in tfidf.vocabulary_.items()
    if term in selected_terms
}

kbest_feature

In [None]:
# Persist the selected-term vocabulary to disk.
with open('kbest_feature.pickle', 'wb') as pickle_file:
    pickle.dump(kbest_feature, pickle_file)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_kbest_features,
    y,
    random_state=100,
    test_size=0.25,
)
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive Bayes classifier on the training split.
# `fit` returns the estimator itself, so both names refer to the same object.
clf_MNB = MultinomialNB()
clf_MNB.fit(x_train, y_train)
model_MNB = clf_MNB

In [None]:
# Predict labels for the held-out split and display them.
model_MNB_pred = model_MNB.predict(x_test)
model_MNB_pred

In [None]:
# True labels of the held-out split, for visual comparison with the predictions.
y_test

In [None]:
# Hand-computed accuracy on the held-out split:
# prediksi_benar = number of correct predictions, prediksi_salah = number wrong.
is_correct = model_MNB_pred == y_test
prediksi_benar = is_correct.sum()
prediksi_salah = (~is_correct).sum()

print('Jumlah prediksi benar\t:', prediksi_benar)
print('Jumlah prediksi salah\t:', prediksi_salah)

accuracy = prediksi_benar / (prediksi_benar + prediksi_salah) * 100
print('Akurasi pengujian\t:', accuracy, '%')

In [None]:
from sklearn.metrics import classification_report

# Text report built from the true labels and the predictions.
print('Classification report:\n', classification_report(y_test, model_MNB_pred))

In [None]:
from joblib import dump

# Save the fitted classifier to disk with joblib.
dump(model_MNB, filename="model_mnb.joblib")

In [None]:
# Remove the columns that are not used for prediction. `errors='ignore'` keeps
# the cell re-runnable: without it a second run raises KeyError because the
# columns are already gone.
df_test = df_test.drop(columns=['keyword', 'location'], errors='ignore')

In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
# Shapes of the held-out feature matrix and of the competition test frame.
# (`x_test` is the validation split made above, not the competition test set.)
x_test.shape, df_test.shape

In [None]:
from joblib import load

# Reload the classifier that was saved with joblib.
model = load('model_mnb.joblib')

# Read the saved vocabulary inside a context manager so the file handle is
# closed as soon as loading finishes, instead of being left open.
# pickle can execute arbitrary code while loading; only load files that this
# notebook produced itself.
with open('kbest_feature.pickle', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

In [None]:
# Test frame without the 'id' column, displayed for inspection.
testing_data = df_test.drop(columns=['id'])
testing_data

In [None]:
# NumPy view of the id-less test frame, displayed for inspection.
testing_data_arr = testing_data.to_numpy()
testing_data_arr

In [None]:
# Number of rows in the test frame.
test_len = len(df_test)

In [None]:
from array import *

# Score the whole test set in one pass using the transformers that were fitted
# on the training data (`tfidf`, then `chi2_features`).
#
# Do not build and fit a new TfidfVectorizer per tweet: fitting on a single
# document learns its IDF weights from that one document, so the resulting
# features no longer match the weighting the classifier was trained on.
# Transforming with the already-fitted objects keeps the weights and column
# order identical to training.
test_clean_text = df_test['text'].apply(text_preprocessing_process)
test_features = chi2_features.transform(tfidf.transform(test_clean_text))

# `predict` returns an array; convert it to plain Python ints (one per test
# row, in `df_test` row order) rather than appending length-1 arrays to an
# integer container.
result_arr = model.predict(test_features).astype(int).tolist()

In [None]:
# Assemble the submission table: the test ids paired with the predicted targets.
submission = pd.DataFrame({'id':df_test['id'].to_list(), 'target':result_arr})
submission

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('./submission.csv', index=False)