In [1]:
# import library

import pandas as pd
import re
from tqdm import tqdm
import numpy as np
import os
import jieba
from jieba.analyse.analyzer import ChineseAnalyzer
from jieba import tokenize

In [2]:
# load data as dataframe

def load_dataframe(filename):
    '''
    Return dataframe
    Read the pickle file at 'filename' and hand back the object stored in it.
    '''
    return pd.read_pickle(filename)

In [3]:
# dataframe to list

def load_data_from_pkl(name,x):
    '''
    Return list
    Read pickle 'name' and give back column 'x' as a nested list
    (one inner list per row).
    '''
    column_frame = load_dataframe(name).filter([x])
    return column_frame.to_numpy().tolist()

In [4]:
# remove trash content

def remove_trash(temp):
    '''
    Return str
    Strip unwanted characters/sub strings from 'temp'.
    The patterns below are removed one after another, in the order listed,
    and leading whitespace is trimmed at the end.
    '''
    unwanted = (
        r'\n',                       # newline
        r'\\n',                      # literal backslash followed by n
        r'=====Shared Post=====',    # shared-post banner
        r'http\S+',                  # website
        r'/',                        # /
        r'＊',                       # full-width asterisk
        r'\[.*?\]',                  # bracketed text such as [emoji]
        r"\u3000",                   # ideographic (full-width) space
        r'--',                       # pair of hyphens
        r"\*",                       # *
        r'➤\S+',                     # ➤xxxx
        r'\u200b',                   # zero-width space
        r'＝＝',                     # pair of full-width equals signs
    )
    cleaned = str(temp)              # ensure input is string type
    for pattern in unwanted:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.lstrip()          # drop leading whitespace only

In [5]:
def clean_content(file,col):
    '''
    Return dataframe
    Load pickle 'file' and run remove_trash over every value of column 'col'.
    '''
    frame = load_dataframe(file)
    cleaned_column = frame[col].apply(remove_trash)
    frame[col] = cleaned_column
    return frame

In [6]:
def df_col_to_list(df,col):
    '''
    Return list
    Keep only column 'col' of dataframe 'df' and convert it to a nested list
    (one inner list per row).
    '''
    return df.filter(items=[col]).to_numpy().tolist()

In [7]:
def combine_result(cleanlist,original):
    '''
    Return dataframe
    Put the cleaned and the original content side by side for later use.
    'cleanlist' becomes column 'Clean'. 'original' becomes column 'Original'.
    '''
    columns = {'Clean': cleanlist}
    columns['Original'] = original
    return pd.DataFrame(columns)

In [8]:
# save result
def save_as_pkl(name,df):
    '''
    No return
    Write 'df' to <name>.pkl, deleting any file already at that path first
    '''
    target = name + '.pkl'

    # start from a clean slate if an earlier result is present
    if os.path.exists(target):
        os.remove(target)

    df.to_pickle(target)


In [9]:
def preprocess(filename,col_name):
    '''
    No return
    Remove unwanted words from column 'col_name' of pickle 'filename' and
    save the result as clean_<col_name>.pkl.

    The saved dataframe has two columns: 'Clean' (values after remove_trash)
    and 'Original' (untouched values). Both columns are built from the
    nested lists returned by df_col_to_list (one inner list per row).
    '''
    # Load the source pickle once; the raw frame feeds both output columns.
    raw_df = load_dataframe(filename)
    original = df_col_to_list(raw_df,col_name)

    # Clean a one-column copy so raw_df itself is never modified.
    clean_df = raw_df[[col_name]].copy()
    clean_df[col_name] = clean_df[col_name].apply(remove_trash)
    cleanlist = df_col_to_list(clean_df,col_name)

    result_df = combine_result(cleanlist,original)

    save_as_pkl('clean_'+col_name,result_df)
    

In [10]:
# Clean both text columns of the raw dataset; each call writes clean_<column>.pkl
for column in ('headline', 'content'):
    preprocess('analytics_challenge_dataset_ex211008.pkl', column)

AttributeError: Can't get attribute 'new_block' on <module 'pandas.core.internals.blocks' from 'C:\\Users\\iclem\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\pandas\\core\\internals\\blocks.py'>

Jieba


In [5]:
# Read and load clean data

def load_clean_data(filename):
    '''
    Return dataframe
    Read the clean pkl stored at 'filename' and return its content
    '''
    return pd.read_pickle(filename)

In [18]:
# Tokenize

def tokenization(text):
    '''
    Return str
    Segment 'text' with jieba and join the surviving tokens with single spaces.
    Spaces are removed before segmentation; non-word characters are stripped
    from every token and tokens left empty are dropped.
    '''
    compact = re.sub(' ','',text)
    tokens = []
    for piece in jieba.cut(compact):
        word = re.sub(r'[^\w]', '', piece)
        if word != '':
            tokens.append(word)
    return ' '.join(tokens)

In [19]:
# Tokenizing

def tokenize(filename,col_name):
    '''
    Return dataframe
    'filename' is the clean file name. 'col_name' is the column name

    Loads the pickle and replaces every non-missing value of 'col_name' with
    the space-separated tokens produced by tokenization(). Missing values are
    left as they are and no rows are removed.

    Note: this definition shadows the `tokenize` name imported from jieba in
    the notebook's import cell.
    '''
    df = load_clean_data(filename)

    # The earlier `df[col] = df[col].dropna().astype(str)` re-aligned on the
    # index when assigned back, so the dropped rows returned as NaN and
    # tokenization() then failed on them with a TypeError. na_action='ignore'
    # skips missing entries instead of passing them to the function.
    df[col_name] = df[col_name].map(
        lambda value: tokenization(str(value)),
        na_action='ignore',
    )

    return df

In [20]:
# Save tokenize result

def save_tokenized(name,df):
    '''
    No return
    Wrap 'df' in a dataframe and write it to <name>_tokenized.pkl,
    deleting any file already present at that path
    '''
    target = name+'_tokenized.pkl'
    if os.path.exists(target):
        os.remove(target)
    pd.DataFrame(df).to_pickle(target)

In [98]:
# Tokenize the 'Clean' column of clean_content.pkl.
# Recorded run time: 23m 32.2s for 444281 rows
df = tokenize(filename='clean_content.pkl', col_name='Clean')

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\iclem\AppData\Local\Temp\jieba.cache
Loading model cost 0.570 seconds.
Prefix dict has been built successfully.


In [None]:
# Variant of the tokenization step that keeps each row's tokens as a list
# instead of joining them into one string; only indices 0..9999 are processed.
# NOTE(review): `clean_content` is defined earlier in this notebook as a
# function, and no list with that name is created in the visible cells, so
# `clean_content[i]` would raise TypeError as written. Presumably it was a list
# in an older kernel session; confirm the intended data source before running.
result = []
for i in tqdm(range(10000)):
    temp = str(clean_content[i])
    # remove spaces before segmentation
    temp = re.sub(' ','',temp)
    tempresult = []
    for word in jieba.cut(temp):
        tempstr = str(word)
        # strip non-word characters; tokens that end up empty are skipped
        tempstr = re.sub(r'[^\w]', '', tempstr)
        if (tempstr != ''):
            tempresult.append(tempstr)
    result.append(tempresult)

In [104]:
# Persist the tokenized frame as content_tokenized.pkl
save_tokenized(name='content', df=df)

In [105]:
# Read the saved tokenized result back from disk.
# NOTE(review): presumably a round-trip sanity check; `test` is not used by any
# later cell visible in this notebook.
test = pd.read_pickle('content_tokenized.pkl')

In [177]:
# Register '擁抱' as a word in jieba's dictionary.
# NOTE(review): in top-to-bottom order this runs after the tokenize() call
# above, so it cannot influence the tokens already stored in `df` or in
# content_tokenized.pkl; move it before tokenizing if the word should be kept
# intact. Confirm intent.
jieba.add_word('擁抱')

Decision Tree


In [6]:
# Manually create label

# Keep only the cleaned text column of clean_content.pkl
df = load_clean_data('clean_content.pkl').filter(['Clean'])

In [173]:
# Turn the dataframe into a nested list (one inner list per row)
df = df.to_numpy().tolist()

In [174]:
# Keyword-based labelling: a row whose text mentions '香港' or '本港' is tagged
# 'Hong Kong'; otherwise a row mentioning '內地', '大陸' or '中國' is tagged
# 'Mainland'; every other row gets an empty label.
for row in df:
    # Build the searchable text once per row instead of once per keyword test.
    # Only the content cell is searched, never a label added by an earlier run.
    text = str(row[:1])
    if ('香港' in text) or ('本港' in text):
        label = 'Hong Kong'
    elif ('內地' in text) or ('大陸' in text) or ('中國' in text):
        label = 'Mainland'
    else:
        label = ''
    # Overwrite everything after the content cell instead of appending, so
    # running this cell again does not stack extra label entries on each row.
    row[1:] = [label]

In [175]:
# Make sure the first entry of every row is a plain string
for row in df:
    row[0] = str(row[0])

In [176]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Two-column frame built from the labelled rows: text and its keyword label
data = pd.DataFrame(data=df, columns=['content', 'label'])

In [177]:
# Split the frame into features (everything except the label) and target
X = data.drop(columns=['label'])
y = data['label']


In [178]:
# Hold out half of the rows for testing. random_state pins the shuffle so the
# same split is produced on every run (keeps Restart & Run All reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [179]:
# Decision tree classifier created with no arguments (library defaults).
# NOTE(review): the name `model` is rebound to a SentenceTransformer in a later
# cell, so this classifier is no longer reachable once that cell has run.
model = DecisionTreeClassifier()

In [None]:
# NOTE(review): X_train holds the raw 'content' strings, and scikit-learn's
# DecisionTreeClassifier expects a numeric feature matrix, so this fit is
# expected to fail for any raw text (not only Chinese). Confirm.
# TODO: convert the text to vectors first and fit on those.
model.fit(X_train,y_train)

Sentence transformers test


In [5]:
from sentence_transformers import SentenceTransformer, models, SentencesDataset, InputExample, losses
from torch.utils.data import DataLoader
from sentence_transformers import evaluation

# Load the sentence-embedding model 'paraphrase-xlm-r-multilingual-v1' by name.
# NOTE(review): this rebinds `model`, replacing the DecisionTreeClassifier
# assigned to the same name in an earlier cell.
model = SentenceTransformer('paraphrase-xlm-r-multilingual-v1')

Downloading: 100%|██████████| 345/345 [00:00<?, ?B/s] 
Downloading: 100%|██████████| 3.74k/3.74k [00:00<00:00, 3.75MB/s]
Downloading: 100%|██████████| 718/718 [00:00<00:00, 720kB/s]
Downloading: 100%|██████████| 122/122 [00:00<00:00, 122kB/s]
Downloading: 100%|██████████| 229/229 [00:00<?, ?B/s] 
Downloading: 100%|██████████| 1.11G/1.11G [00:23<00:00, 47.6MB/s]
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 53.2kB/s]
Downloading: 100%|██████████| 5.07M/5.07M [00:09<00:00, 559kB/s] 
Downloading: 100%|██████████| 150/150 [00:00<?, ?B/s] 
Downloading: 100%|██████████| 9.10M/9.10M [02:06<00:00, 72.0kB/s]
Downloading: 100%|██████████| 550/550 [00:00<00:00, 552kB/s]
Downloading: 100%|██████████| 190/190 [00:00<00:00, 191kB/s]


In [24]:
# Encode the first entry of the first row of `df` with the sentence model.
# NOTE(review): relies on the nested list `df` built in the "Decision Tree"
# cells; the recorded NameError shows those cells had not been run in this
# kernel session, so the cell fails on a fresh kernel unless they run first.
sentenceembedding = model.encode(df[0][0])

NameError: name 'df' is not defined