In [23]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import csv
from tqdm import tqdm
import argparse
import time, sys
import re
import pickle
from sqlalchemy import create_engine

In [24]:
def load_data():
    """Read the four per-language turn tables from the local MySQL database.

    The engine URL points at the ``wikipedia_talk`` schema on 127.0.0.1 and
    passes ``read_default_file=~/.my.cnf`` so the MySQL client option file
    supplies the remaining connection settings.

    Returns:
        tuple: ``(en_full, es_full, ja_full, zh_full)`` -- one DataFrame per
        table, in that order.
    """
    engine = create_engine('mysql://127.0.0.1/wikipedia_talk?read_default_file=~/.my.cnf&charset=utf8')

    # One full-table query per language, issued in en, es, ja, zh order.
    queries = (
        'select * from msgs_en_turns',
        'select * from msgs_es_turns_trans',
        'select * from msgs_ja_turns_trans',
        'select * from msgs_zh_turns_trans',
    )
    en_full, es_full, ja_full, zh_full = [pd.read_sql(query, engine) for query in queries]

    return en_full, es_full, ja_full, zh_full

def load_lexica():
    """Read the four politeness lexicon CSV files from disk.

    English and Chinese come from ``Lexica/``; Spanish and Japanese come from
    the expanded files under ``Expansion_Files/``. Paths are relative to the
    current working directory.

    Returns:
        tuple: ``(en_lex, es_lex, ja_lex, zh_lex)`` DataFrames, in that order.
    """
    lexicon_paths = [
        'Lexica/english_politelex.csv',
        'Expansion_Files/spanish_politelex_expanded.csv',
        'Expansion_Files/japanese_politelex_expanded.csv',
        'Lexica/chinese_politelex.csv',
    ]
    return tuple(pd.read_csv(path) for path in lexicon_paths)

# Load the four message tables and the four lexica once at notebook scope;
# the purification cell further down reads es_/ja_ variables from here.
en_full, es_full, ja_full, zh_full = load_data()
en_lex, es_lex, ja_lex, zh_lex = load_lexica()

In [None]:
def process_words(words):
    """Normalize lexicon entries: strip digits/punctuation, lowercase, de-duplicate.

    Args:
        words: iterable of lexicon entries. Non-string entries (for example
            the NaN that pandas produces for a blank CSV cell) are skipped
            instead of raising.

    Returns:
        list[str]: the distinct cleaned entries, sorted. Entries that are
        empty or whitespace-only after cleaning are dropped, because
        ``some_text.count("")`` returns ``len(some_text) + 1`` and would
        inflate every count computed from the lexicon.
    """
    # Characters to delete: the ten digits and ( ) , . - ? ! \ /
    strip_table = str.maketrans("", "", "1234567890(),.-?!\\/")

    cleaned = set()
    for w in words:
        if not isinstance(w, str):
            continue
        w = w.translate(strip_table).lower()
        # Skip entries that consisted only of removed characters / spaces.
        if w.strip():
            cleaned.add(w)
    return sorted(cleaned)

def get_strategy_counts(messages, words):
    """Count, per message, the total occurrences of every word in ``words``.

    Occurrences are counted with ``str.count``, i.e. as non-overlapping
    substring matches, and summed over all words.

    Args:
        messages: iterable of message strings.
        words: collection of strings to look for in each message.

    Returns:
        list[int]: one total per message, in input order.
    """
    return [
        sum(text.count(term) for term in words)
        for text in messages
    ]

def get_word_counts(messages, word):
    """Count occurrences of a single ``word`` in each message.

    Uses ``str.count``, so matches are non-overlapping substring matches.

    Args:
        messages: iterable of message strings.
        word: the string to count.

    Returns:
        list[int]: one count per message, in input order.
    """
    return [text.count(word) for text in messages]

def purify_lexica(lexica_df, messages_df, threshold=0.15):
    """Keep only lexicon words whose usage tracks their category's overall usage.

    For every category in ``lexica_df`` the words are cleaned with
    ``process_words``. Each word's per-message count is then correlated
    (``np.corrcoef``) with the per-message count summed over all words of the
    category, and the word is kept when that correlation is strictly greater
    than ``threshold``. The category name and each ``word, correlation`` pair
    are printed as they are processed.

    Note that the category total includes the word being tested.

    Args:
        lexica_df: DataFrame with a ``CATEGORY`` column and a ``word`` column.
        messages_df: DataFrame with a ``turn`` column of message text.
            Non-string values (None, NaN) are skipped.
        threshold: exclusive lower bound on the correlation for a word to be
            kept. Defaults to 0.15.

    Returns:
        dict: maps each category to the list of words that passed.
    """
    categories = lexica_df["CATEGORY"].unique().tolist()
    # isinstance also filters NaN floats, which have no .lower().
    messages = [m.lower() for m in messages_df["turn"].tolist() if isinstance(m, str)]

    final_lexica = {c: [] for c in categories}

    for category in categories:
        print(category)
        cat_words = lexica_df.loc[lexica_df["CATEGORY"] == category, "word"].tolist()
        cat_words = process_words(cat_words)
        category_counts = get_strategy_counts(messages, cat_words)
        for word in cat_words:
            word_counts = get_word_counts(messages, word)
            # A word with constant counts (e.g. it never occurs) has zero
            # variance, so the correlation is NaN; silence the resulting
            # divide warning. NaN > threshold is False, so the word is dropped.
            with np.errstate(divide="ignore", invalid="ignore"):
                corr = np.corrcoef(category_counts, word_counts)[0, 1]
            print(word, corr)
            if corr > threshold:
                final_lexica[category].append(word)
    return final_lexica

In [None]:
# Filter the Spanish and Japanese lexica against their own message tables.
# purify_lexica prints one line per category and one per word, so this cell's
# output can be long.
es_purified = purify_lexica(es_lex, es_full)
ja_purified = purify_lexica(ja_lex, ja_full)