# **Data Mining Project Based-3 - Classification**

**BBC Full Text Document Classification**
https://raw.githubusercontent.com/riezaf51/bbc-associative-classification/main/bbc-classification.zip

Kelompok 3:

* Adam Ichwanul Ichsan - 130120041
* Muhamad Syaepul Huda - 1301200227
* Muhammad Rieza Fachrezi - 1301204335

## Import Library

In [None]:
import os
import sys
import re
import pandas as pd
import numpy as np
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

## Import Dataset

In [None]:
# Download the dataset archive, unpack it into ./bbc-classification
# (-o overwrites files left by a previous run), then delete the zip.
!wget https://raw.githubusercontent.com/riezaf51/bbc-associative-classification/main/bbc-classification.zip
!unzip -o bbc-classification.zip -d bbc-classification
!rm bbc-classification.zip

--2023-12-21 14:12:15--  https://raw.githubusercontent.com/riezaf51/bbc-associative-classification/main/bbc-classification.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5864242 (5.6M) [application/zip]
Saving to: ‘bbc-classification.zip’


2023-12-21 14:12:16 (113 MB/s) - ‘bbc-classification.zip’ saved [5864242/5864242]

Archive:  bbc-classification.zip
  inflating: bbc-classification/bbc-fulltext (document classification)/bbc/README.TXT  
  inflating: bbc-classification/bbc-fulltext (document classification)/bbc/business/001.txt  
  inflating: bbc-classification/bbc-fulltext (document classification)/bbc/business/002.txt  
  inflating: bbc-classification/bbc-fulltext (document classification)/bbc/business/003.txt  
  inflating: bbc-classification/bbc-

### Turn raw data into structured data

In [None]:
def read_extract_text_file(path):
  """Return the content of the latin-1 text file at `path` as one string.

  The file's lines (each keeping its trailing newline) are concatenated with
  a single space between consecutive lines.
  """
  with open(path, mode='r', encoding='latin-1') as handle:
    # Iterating the handle yields the same lines readlines() would return.
    return " ".join(handle)

# class_labels = {'politics':0, 'sport':1,  'tech':2, 'entertainment':3, 'business':4}

In [None]:
# Walk the extracted dataset: every sub-directory of `path` is treated as one
# class label and every file inside it as one document of that class.
path = "bbc-classification/bbc"
# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# (and therefore any later seeded train/test split) reproducible across runs.
folder = sorted(os.listdir(path))

final_text = []
final_label = []

for label in folder:
  new_path = os.path.join(path,label)
  # Skip README.TXT and any other stray file: only directories hold documents.
  if not os.path.isdir(new_path):
    continue
  for j in sorted(os.listdir(new_path)):
    new_path1 = os.path.join(new_path,j)
    # Ignore anything that is not a regular file (e.g. a nested directory).
    if not os.path.isfile(new_path1):
      continue
    text = read_extract_text_file(new_path1)
    final_text.append(text)
    final_label.append(label)

In [None]:
# Assemble the documents and their class labels into a two-column frame
# and display it.
df = pd.DataFrame({'Text': final_text, 'Label': final_label})
df

Unnamed: 0,Text,Label
0,Russian oil merger excludes Yukos\n \n The mer...,business
1,Buyers snap up Jet Airways' shares\n \n Invest...,business
2,UK 'risks breaking golden rule'\n \n The UK go...,business
3,"Dutch bank to lay off 2,850 staff\n \n ABN Amr...",business
4,Worldcom director ends evidence\n \n The forme...,business
...,...,...
2220,Green fear for transport ballot\n \n The Green...,politics
2221,Brown ally rejects Budget spree\n \n Chancello...,politics
2222,Blair Labour's longest-serving PM\n \n Tony Bl...,politics
2223,Conservative backing for ID cards\n \n The Tor...,politics


In [None]:
# Number of documents per class label.
df['Label'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: Label, dtype: int64

## Data pre-processing

In [None]:
def clean_text(text):
    """Normalise a raw document into a space-separated string of words.

    The text is lower-cased, every character that is neither an ASCII letter
    nor whitespace is deleted (not replaced), and words of a single character
    are dropped. No stop-word filtering happens here.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    return ' '.join(token for token in letters_only.split() if len(token) >= 2)

# Store a cleaned version of every document next to the raw text.
df['Cleaned_Text'] = df['Text'].apply(clean_text)

In [None]:
# Fetch the punkt tokenizer data that word_tokenize relies on.
nltk.download('punkt')

# Split every cleaned document into a list of word tokens.
df['Tokenized_Text'] = df['Cleaned_Text'].apply(word_tokenize)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Lower-case every token (clean_text has already lower-cased the text, so this
# is expected to leave the tokens unchanged).
df['Lowercase_Text'] = df['Tokenized_Text'].apply(lambda tokens: list(map(str.lower, tokens)))

In [None]:
def remove_punctuation(tokens):
    """Return `tokens` without the tokens made up solely of punctuation.

    A token is dropped when every one of its characters is in
    `string.punctuation`; empty tokens are dropped as well. Tokens that merely
    contain punctuation (e.g. "it's") are kept.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    # `word in string.punctuation` would be a substring test on a str, which
    # lets tokens such as "..." or "!!" through; test character by character.
    punctuation_chars = set(string.punctuation)
    return [
        word for word in tokens
        if not all(char in punctuation_chars for char in word)
    ]

# Filter punctuation tokens out of every document's token list.
df['No_Punctuation_Text'] = df['Lowercase_Text'].apply(remove_punctuation)

In [None]:
# Fetch the NLTK stop-word lists.
nltk.download('stopwords')

# English stop words as a set, for constant-time membership tests.
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    """Return the tokens of `tokens` that are not in the `stop_words` set."""
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

df['No_Stopwords_Text'] = df['No_Punctuation_Text'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Fetch the WordNet data used by the lemmatizer.
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def lemmatize_words(tokens):
    """Return the distinct lemmas of `tokens`, in first-occurrence order.

    Tokens of one or two characters and the token 'said' are discarded before
    lemmatisation.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A list of lemmas without duplicates.
    """
    # dict.fromkeys de-duplicates while keeping insertion order, so the result
    # does not depend on string hash randomisation the way set() iteration does.
    lemmas = (
        lemmatizer.lemmatize(word)
        for word in dict.fromkeys(tokens)
        if len(word) > 2 and word != 'said'
    )
    # De-duplicate again: different tokens can share the same lemma.
    return list(dict.fromkeys(lemmas))

df['Lemmatized_Text'] = df['No_Stopwords_Text'].apply(lemmatize_words)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Keep a copy of the preprocessed frame under a separate name.
df_copy = df.copy()

## Model Implementation

In [None]:
class AssociativeClassification:
  """Rule-first classifier with a decision-tree fallback.

  An instance is a row of a one-hot DataFrame whose column names are items.
  A rule applies to a row when all items of its antecedent are set in that
  row. Only rules whose consequent is a single class label known to the
  fitted tree can produce a prediction; every row without such a rule is
  classified by the decision tree.
  """

  def __init__(self, rules_set):
    """Store the rule table and create the (unfitted) fallback classifier.

    Args:
      rules_set: DataFrame with 'antecedents' and 'consequents' columns
        holding collections of item names. A 'confidence' column, when
        present, is used to rank the rules.
    """
    self.clf = DecisionTreeClassifier()
    self.rules_set = rules_set

  def fit(self, x, y):
    """Fit the fallback decision tree on features `x` and labels `y`."""
    self.clf.fit(x, y)

  def _class_rules(self):
    """Return (antecedent, label) pairs usable for classification, best first."""
    rules = self.rules_set
    if rules is None or getattr(rules, 'empty', True):
      return []
    # A consequent that is not a class label cannot be returned as a prediction.
    known_labels = set(getattr(self.clf, 'classes_', ()))
    if 'confidence' in rules.columns:
      # Stable sort: rules of equal confidence keep their original order.
      rules = rules.sort_values('confidence', ascending=False, kind='stable')
    class_rules = []
    for antecedent, consequent in zip(rules['antecedents'], rules['consequents']):
      consequent = list(consequent)
      if len(consequent) == 1 and consequent[0] in known_labels:
        class_rules.append((frozenset(antecedent), consequent[0]))
    return class_rules

  def predict(self, x, y=None):
    """Predict one label per row of `x`.

    Args:
      x: one-hot DataFrame; a truthy cell means the column's item is present.
      y: optional true labels, aligned with `x` by position. When given, the
        accuracy is written to stdout.

    Returns:
      A list with exactly one prediction per row of `x`, in row order.
    """
    class_rules = self._class_rules()
    predictions = [None] * len(x)
    fallback_rows = []

    if class_rules:
      item_names = x.columns
      for row_number, row_mask in enumerate(x.to_numpy(dtype=bool)):
        # Compare antecedents with the items present in the row, not with
        # the row's raw True/False values.
        present_items = set(item_names[row_mask])
        for antecedent, label in class_rules:
          if antecedent <= present_items:
            predictions[row_number] = label
            break
        else:
          fallback_rows.append(row_number)
    else:
      # No usable rule: skip the per-row scan entirely.
      fallback_rows = list(range(len(x)))

    if fallback_rows:
      # One batched call keeps the feature names and avoids per-row overhead.
      fallback_labels = self.clf.predict(x.iloc[fallback_rows])
      for row_number, label in zip(fallback_rows, fallback_labels):
        predictions[row_number] = label

    if y is not None:
      true_labels = list(y)
      if predictions:
        correct = sum(1 for predicted, actual in zip(predictions, true_labels) if predicted == actual)
        accuracy = correct / len(predictions)
        sys.stdout.write(f"\rAccuracy: {accuracy*100:.2f}%, {len(predictions)}/{len(true_labels)}")
        sys.stdout.flush()
      print()
    return predictions

## Model Evaluation

In [None]:
# Inputs: one list of lemmas per document and the matching class labels.
text_data = df_copy['Lemmatized_Text']
labels = df_copy['Label']

# Convert text data to transaction format (one boolean column per item)
# NOTE(review): `df` is rebound here from the text frame to the one-hot
# matrix; the text frame remains reachable as `df_copy`.
te = TransactionEncoder()
te_ary = te.fit(text_data).transform(text_data)
df = pd.DataFrame(te_ary, columns=te.columns_)

# Frequent itemset mining
frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)

# Extract association rules
# NOTE(review): the rules are mined from word-only transactions of all rows,
# before the train/test split, so test rows contribute to them and their
# consequents are words rather than class labels - confirm this is intended.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)

# Prepare data for classification (25% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(df, labels, test_size=0.25, random_state=42)

# Create the model
model = AssociativeClassification(rules)

# Fit the model with training data
model.fit(X_train, y_train)

# Predict validation data; passing y_test makes predict() report accuracy
validation_predictions = model.predict(X_test, y_test)

Accuracy: 85.82%, 557/557


In [None]:
# Display the association rules the model was constructed with.
model.rules_set

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(bbc),(told),0.204494,0.277303,0.144719,0.707692,2.552051,0.088012,2.472383,0.764493
1,(bbc),(would),0.204494,0.515056,0.143371,0.701099,1.361209,0.038045,1.622422,0.333572
2,(five),(year),0.169888,0.598652,0.124045,0.730159,1.219672,0.022341,1.48735,0.216968
3,(minister),(government),0.160449,0.214831,0.11236,0.70028,3.259672,0.07789,2.619674,0.825704
4,(government),(would),0.214831,0.515056,0.155056,0.721757,1.401318,0.044406,1.742881,0.364745
5,(last),(year),0.40809,0.598652,0.291236,0.713656,1.192106,0.046932,1.401632,0.272252
6,(market),(year),0.176629,0.598652,0.127191,0.720102,1.202873,0.021452,1.433908,0.204837
7,(million),(year),0.17618,0.598652,0.127191,0.721939,1.205941,0.021721,1.443381,0.207293
8,(minister),(would),0.160449,0.515056,0.125843,0.784314,1.522773,0.043202,2.248376,0.408913
9,(plan),(would),0.172135,0.515056,0.125843,0.73107,1.4194,0.037184,1.803238,0.356914
