In [11]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [12]:
class TopicClassifier:
    """Keyword-based topic tagger for Indonesian text.

    A text is assigned to a topic when at least one of that topic's keywords
    occurs in it. Matching is plain substring containment, so 'jalan' also
    matches 'jalanan' -- and a short keyword such as 'air' also matches any
    longer word that merely contains it. A text can receive several topics,
    or none.

    Every method is static, so it can be called on the class or on an instance.
    """

    # Topic name -> keywords whose presence assigns a text to that topic.
    topics = {
        "infrastruktur": ['jalan', 'jembatan', 'bandara', 'pelabuhan', 'kereta api', 'transportasi', 'listrik', 'energi', 'telekomunikasi'],
        "ekonomi": ['investasi', 'upah','perekonomian', 'hutang', 'utang', 'miskin','sengsara','pertumbuhan ekonomi', 'usaha', 'peluang bisnis', 'pajak', 'industri', 'perdagangan', 'pangan'],
        "lingkungan": ['lingkungan', 'polusi','sumber daya alam', 'hutan', 'air', 'sampah', 'bencana alam', 'kebakaran hutan', 'penyelamatan lingkungan'],
        "pendidikan": ['pendidikan', 'sekolah', 'guru', 'pembelajaran', 'penelitian', 'kampus', 'beasiswa', 'sistem pendidikan','bodoh'],
        "teknologi": ['teknologi', 'digitalisasi', 'internet', 'sains', 'inovasi', 'robotika', 'kecerdasan buatan', 'blockchain', 'platform digital']
    }

    @staticmethod
    def _matches(text, keywords):
        """Return True when `text` is a string that contains any of `keywords`."""
        # Non-string cells (None, or NaN from an empty CSV field) match no topic
        # instead of raising TypeError on the `in` test.
        return isinstance(text, str) and any(keyword in text for keyword in keywords)

    @staticmethod
    def OneHotEncodingDataFrameClassifier(df, column_name):
        """Add one 0/1 column per topic to `df`, based on the text in `column_name`.

        Args:
            df: DataFrame holding the texts. It is modified in place: one
                column named after each topic is added (or overwritten).
            column_name: Name of the column that holds the text.

        Returns:
            The same `df` object, for convenient chaining.
        """
        for topic, keywords in TopicClassifier.topics.items():
            # Bind `keywords` as a default so the lambda does not depend on the loop variable.
            df[topic] = df[column_name].apply(
                lambda text, keywords=keywords: int(TopicClassifier._matches(text, keywords))
            )
        return df

    @staticmethod
    def TextClassifying(input_text):
        """Return the topics, in definition order, whose keywords occur in `input_text`."""
        return [
            topic
            for topic, keywords in TopicClassifier.topics.items()
            if TopicClassifier._matches(input_text, keywords)
        ]

    @staticmethod
    def CreateDictionaryPerTopic(dataframe, column):
        """Group the texts of `dataframe[column]` by topic.

        Args:
            dataframe: DataFrame holding the texts.
            column: Name of the column that holds the text.

        Returns:
            Dict mapping every topic name to the list of texts that matched it,
            in row order. A text matching several topics appears in each of
            their lists; topics without matches map to an empty list.
        """
        topic_dict = {topic: [] for topic in TopicClassifier.topics}
        # Only one column is needed, so iterate it directly instead of building
        # a Series per row with iterrows().
        for text in dataframe[column]:
            for topic, keywords in TopicClassifier.topics.items():
                if TopicClassifier._matches(text, keywords):
                    topic_dict[topic].append(text)
        return topic_dict

In [13]:
  def OneHotEncodingDataFrameClassifier(cls, dataFrame, text_column:str):
    '''
    Return a new DataFrame with a 'text' column plus a one-hot column per defined topic.

    Parameters:
      dataFrame   : source DataFrame; it is only read, never modified.
      text_column : name of the column that holds the text to classify.

    Returns:
      DataFrame with column 'text' followed by one column per key of
      cls.topics (stored as floats, 1.0 = topic detected, 0.0 = not detected).
      Rows for which no topic was detected are removed and the index is
      renumbered from 0.

    Prints "Tahap 1" / "Tahap 2" as stage markers and shows a tqdm progress
    bar while classifying.
    '''
    topic_names = list(cls.topics.keys())

    # Read the texts by position, so frames whose index is not 0..n-1
    # (filtered, sorted, custom labels) are handled correctly.
    texts = dataFrame[text_column].tolist()

    # Initializing final data: every topic flag starts at zero.
    final_dict = {
        'text' : texts
    }
    for topic in topic_names:
      final_dict[topic] = np.zeros(shape= (len(texts),))

    print("Tahap 1")
    for i, text in enumerate(tqdm(texts)):
      for topic in cls.__classify__(text):
        final_dict[topic][i] = 1

    final_df = pd.DataFrame(final_dict)

    # Deleting text with all zeros on topics. A single boolean mask replaces
    # the row-by-row drop, which copied the whole frame once per removed row.
    print("Tahap 2")
    has_topic = final_df[topic_names].sum(axis=1) != 0
    final_df = final_df[has_topic].reset_index(drop=True)

    return final_df

  @classmethod
  def CreateDataFramePerTopic(cls, dataFrame, destinationPath, namaOutputFile, text_column):
    '''
    Write one CSV file per defined topic, containing the rows of dataFrame
    whose text (taken from text_column) was classified into that topic.

    Parameters:
      dataFrame       : source DataFrame; it is only read, never modified.
      destinationPath : output folder. A trailing "/" is added when missing;
                        an empty string means the current working directory.
      namaOutputFile  : file name prefix; each file is named
                        "<namaOutputFile>_<topic>.csv".
      text_column     : name of the column that holds the text to classify.

    A row detected in several topics is written to each of their files.
    A file is written for every topic, even when no row matched it.
    '''
    # Guarding on truthiness avoids the IndexError the old last-character
    # check raised for an empty destinationPath.
    if destinationPath and not destinationPath.endswith("/"):
      destinationPath += "/"

    # Classifying: remember row positions per topic instead of appending rows
    # to a DataFrame one at a time (DataFrame.append was removed in pandas 2.0
    # and re-copied the whole frame on every call).
    rows_per_topic = {topic: [] for topic in cls.topics.keys()}
    for position, text in enumerate(tqdm(dataFrame[text_column].tolist())):
      for topic in cls.__classify__(text):
        rows_per_topic[topic].append(position)

    # Output
    for topic, positions in rows_per_topic.items():
      # iloc selects by position, so any index on dataFrame is fine.
      topic_df = dataFrame.iloc[positions].reset_index(drop=True)
      topic_df.to_csv("{}.csv".format((destinationPath + namaOutputFile + "_{}".format(topic))), index=False)

  @classmethod
  def CreateDictionaryPerTopic(cls, dataFrame, text_column):
    '''
    Return a dictionary with one DataFrame per defined topic, containing the
    rows of dataFrame whose text (taken from text_column) was classified into
    that topic.

    Parameters:
      dataFrame   : source DataFrame; it is only read, never modified.
      text_column : name of the column that holds the text to classify.

    Returns:
      dict mapping every key of cls.topics to a DataFrame with the same
      columns as dataFrame and an index renumbered from 0. A row detected in
      several topics appears in each of them; topics without matches map to
      an empty DataFrame.
    '''
    # Classifying: remember row positions per topic instead of appending rows
    # to a DataFrame one at a time (DataFrame.append was removed in pandas 2.0
    # and re-copied the whole frame on every call).
    rows_per_topic = {topic: [] for topic in cls.topics.keys()}
    for position, text in enumerate(tqdm(dataFrame[text_column].tolist())):
      for topic in cls.__classify__(text):
        rows_per_topic[topic].append(position)

    # Output: iloc selects by position, so any index on dataFrame is fine.
    return {
        topic: dataFrame.iloc[positions].reset_index(drop=True)
        for topic, positions in rows_per_topic.items()
    }


  @classmethod
  def TextClassifying(cls, text):
    '''
    Classify a single text: public entry point that hands the text to
    cls.__classify__ and returns its result unchanged.
    '''
    detected_topics = cls.__classify__(text)
    return detected_topics

  @classmethod
  def __classify__(cls, text):
    '''
    Return the topics whose keywords occur in text as whole words or phrases.

    Parameters:
      text : the text to classify. Anything that is not a string (for example
             NaN from an empty CSV cell) is treated as having no topic.

    Returns:
      list of topic names, each at most once, in the order the topics are
      defined in cls.topics.

    A keyword matches only when it is delimited by whitespace (or the text
    boundaries) on both sides, so 'air' does not match inside a longer word.
    Multi-word keywords such as 'kereta api' match as a phrase; comparing
    single tokens, as before, could never match them. Matching is
    case-sensitive and punctuation is not stripped.
    '''
    if not isinstance(text, str):
      return []

    # Collapse every whitespace run to one space and pad both ends, so each
    # word -- including the first and the last -- is surrounded by spaces.
    padded_text = ' {} '.format(' '.join(text.split()))

    return [
        topic
        for topic, keywords in cls.topics.items()
        if any(' {} '.format(keyword) in padded_text for keyword in keywords)
    ]

In [20]:
import os
import pandas as pd

# Folder with the input CSV files, and folder that receives the classified copies.
directory = 'C:\\Users\\user\\PENELITIAN SAINS DATA\\data\\predictV1'
out_directory = 'C:\\Users\\user\\PENELITIAN SAINS DATA\\data\\classifiedV1'

if not os.path.exists(out_directory):
    # makedirs also creates missing parent folders, and exist_ok avoids a
    # failure if the folder appears between the check and the call.
    os.makedirs(out_directory, exist_ok=True)
    print("Directory Created at {}".format(out_directory))

# os.listdir returns names in arbitrary order; sorting makes the processing
# order -- and therefore the `df` left over for later cells -- reproducible.
files = sorted(os.listdir(directory))

for file in files:
    if file.endswith('.csv'):
        print(file)
        df = pd.read_csv(os.path.join(directory, file))

        # Convert 'word' column to string type
        df['word'] = df['word'].astype(str)

        # Add the topic columns and save under the same file name in out_directory.
        df_temp = TopicClassifier.OneHotEncodingDataFrameClassifier(df, 'word')
        df_temp.to_csv(os.path.join(out_directory, file), index=False)

wordcount.csv


In [22]:
# Build the per-topic dictionary from the 'word' column of `df`.
# NOTE(review): `df` is presumably the frame of the last CSV read by the batch
# loop in the earlier cell -- confirm no skipped cell reassigns it.
final_dict = TopicClassifier.CreateDictionaryPerTopic(df, 'word')

In [23]:
# Named `input_text` rather than `input`, so the builtin input() is not
# shadowed for the rest of the kernel session.
input_text = "rakyat miskin"  #@param {type:"string"}
TopicClassifier.TextClassifying(input_text)

['ekonomi']