In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install Sastrawi

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [None]:
# Seed NumPy's global random generator so repeated full runs give the same results.
np.random.seed(42)

# Tokenizer data needed by nltk's word_tokenize.
# Older NLTK releases load the 'punkt' resource; NLTK 3.8.2 and later load
# 'punkt_tab' instead, so both are fetched to keep the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')

## Stemmer
Kita dapat menggunakan stemmer untuk mengubah setiap kata menjadi kata dasarnya, sehingga jumlah kata unik yang ada pada dataset berkurang.
### Contoh Hasil Stem
- Perekonomian -> ekonomi
- pertumbuhan -> tumbuh

## Tf-idf (Term Frequency-Inverse document frequency)

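Term Frequency (tf) adalah jumlah kemunculan sebuah term di dalam sebuah dokumen. Inverse Document Frequency (idf) memberi bobot yang lebih kecil pada term yang muncul di banyak dokumen. Bobot tf-idf adalah hasil kali keduanya: $\text{tf-idf}(t, d) = \text{tf}(t, d) \times \text{idf}(t)$.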

In [None]:
# TF-IDF vectorizer: turns a collection of raw documents into a matrix of TF-IDF features.
Tfidf_vect = TfidfVectorizer()

# Label encoder: maps target labels to integers between 0 and n_classes-1.
Encoder = LabelEncoder()

# Sastrawi stemmer, obtained through its factory.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

## Import Dataset

### Feature
ID - ID Data

label - 1 for False news, 0 for True fact

tanggal - tanggal narasi ditemukan

judul - judul/tagline berita yang tersebar

narasi - isi berita

nama file gambar - file gambar (tidak digunakan pada tutorial ini)



In [None]:
# Load the training CSV into a pandas DataFrame.
data_train = pd.read_csv(
    filepath_or_buffer='../input/indonesiafalsenews/Data_latih.csv',
)

In [None]:
# Show the training DataFrame (the notebook renders the cell's last expression).

data_train

In [None]:
# Number of rows per category (label).
label_column = data_train['label']

label_column.value_counts()

## Data Preprocessing

In [None]:
# The label counts in the previous cell show that the dataset is heavily imbalanced.
# To balance it, only part of the label-1 rows is kept.

# Select the rows labelled 1 and shuffle them. No random_state is passed, so the
# shuffle order depends on the global NumPy seed set earlier in the notebook.
false_news = data_train[data_train['label'] == 1].sample(frac=1)

# All rows labelled 0 are kept.
true_fact = data_train[data_train['label'] == 0]

# Combine every label-0 row with the first len(true_fact) + 200 shuffled label-1 rows.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; the original row index is preserved in both cases.
df = pd.concat([true_fact, false_news[:len(true_fact) + 200]])

df

## Feature Engineering

In [None]:
# Only the 'narasi' column is used as the input feature for predicting 'label'.
feature, label = df['narasi'], df['label']

In [None]:
# Lower-case every row of the feature column, then stem it.
lower = list(map(lambda text: stemmer.stem(text.lower()), feature))

# First five lower-cased, stemmed rows.
lower[:5]

In [None]:
# Tokenize every stemmed row into a list of word tokens.
tokens = list(map(word_tokenize, lower))

# First five tokenized rows.
tokens[:5]

In [None]:
# Split the tokenized rows and their labels into two parts:
#   X_train / y_train - used to fit (train) the models
#   X_test  / y_test  - used to evaluate the models
# 30% of the rows go to the test part, stratified on the label.
split_parts = train_test_split(
    tokens,
    label,
    test_size=0.3,
    stratify=label,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Report how many rows ended up in the training and test splits.
for split_caption, split_rows in (('X_train : ', X_train), ('X_test : ', X_test)):
    print(split_caption, len(split_rows))

In [None]:
# Encode the target labels.
# The encoder is fitted on the training labels only; the test labels are then
# transformed with that same fitted mapping. Refitting on the test split (as a
# second fit_transform would) could assign different codes to the same class.
y_train = Encoder.fit_transform(y_train)
y_test = Encoder.transform(y_test)

y_train

In [None]:
# Fit the TF-IDF vectorizer on the training documents.
# Each row is a list of tokens, so the tokens are joined with a single space to
# rebuild a document string. Joining with an empty string would glue the words
# together and produce a vocabulary that does not match the space-joined
# documents passed to transform() in the next cell.
Tfidf_vect.fit([" ".join(row) for row in X_train])

In [None]:
# Apply the fitted TF-IDF vectorizer to both splits; each token list is first
# joined back into a space-separated document string.
X_train_Tfidf = Tfidf_vect.transform(list(map(" ".join, X_train)))
X_test_Tfidf = Tfidf_vect.transform(list(map(" ".join, X_test)))

## Model Training & Prediction

#### Dengan menggunakan Algoritma SVM

In [None]:
# Classifier: Support Vector Machine (SVM) with a linear kernel.
# degree and gamma are not used by the linear kernel; they are kept as originally written.
SVM = svm.SVC(C=1.0, kernel='linear', degree=1, gamma="auto", verbose=True)

# Train the classifier on the TF-IDF features of the training split.
SVM.fit(X_train_Tfidf, y_train)

# Predict the labels of the test split.
predictions_SVM = SVM.predict(X_test_Tfidf)

# Report accuracy as a percentage. accuracy_score takes (y_true, y_pred), in that order.
print("SVM Accuracy Score -> ", accuracy_score(y_test, predictions_SVM)*100)

#### Dengan Menggunakan Algoritma RandomForest

In [None]:
# Classifier: Random Forest with default hyper-parameters.
# No random_state is passed, so reproducibility relies on the global NumPy seed
# set earlier in the notebook.
rf = RandomForestClassifier()

# Train the classifier on the TF-IDF features of the training split.
rf.fit(X_train_Tfidf, y_train)

# Predict the labels of the test split.
prediction_rf = rf.predict(X_test_Tfidf)

# Report accuracy as a percentage. accuracy_score takes (y_true, y_pred), in that order.
print("RandomForest Accuracy Score -> ", accuracy_score(y_test, prediction_rf)*100)