# Import required packages

In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from numpy.linalg import matrix_power

# Load dataset

In [2]:
# Read train.csv from the input directory; the path is relative to the
# notebook's working directory.
df_train = pd.read_csv('../input/train.csv')
# Last expression of the cell: show the first rows as a quick preview.
df_train.head()

# Check for missing data and drop incomplete rows

In [3]:
# Record the frame's shape, drop every row that contains a missing value, and
# compare the shape afterwards to tell whether anything was removed.
shape_before = df_train.shape
df_train.dropna(inplace=True)  # mutates df_train itself; later cells read the cleaned frame
shape_after = df_train.shape

print('Found and removed missing data' if shape_after != shape_before else "None missing data found")

# Group dataset by Author

In [4]:
# Keep only the rows written by 'EAP': group the frame by its 'author' column
# and pull out that single group as a DataFrame.
df_authors = df_train.groupby(by='author').get_group('EAP')

# Authors processed by the rest of the notebook; this must agree with the
# group selected above because later cells read every text from df_authors.
authors_initials = ['EAP']

# Show Authors Initials in this dataset

In [5]:
# Display the list of author initials that the following cells iterate over.
authors_initials

# Tokenize

In [6]:
def tokenize(string, to_lower=True, is_alpha=True):
    """Split ``string`` into tokens with NLTK's ``word_tokenize``.

    The two flags are not independent; the actual behaviour is:

    * ``to_lower=True, is_alpha=True`` (the defaults): every token is
      lower-cased and nothing is filtered out, so punctuation tokens are kept.
    * ``to_lower=True, is_alpha=False``: tokens are returned exactly as
      produced by ``word_tokenize`` (not lower-cased, not filtered).
    * ``to_lower=False``: only tokens for which ``str.isalpha()`` is true are
      returned, in their original case, whatever ``is_alpha`` is.

    NOTE(review): ``is_alpha`` never enables filtering when ``to_lower`` is
    true and cannot disable it when ``to_lower`` is false. This looks
    unintended -- confirm before calling with non-default arguments.

    Parameters
    ----------
    string : str
        Text to tokenize.
    to_lower : bool, default True
        See the behaviour table above.
    is_alpha : bool, default True
        See the behaviour table above.

    Returns
    -------
    list of str
        The tokens, in order of appearance.
    """
    words = word_tokenize(string)

    if not to_lower:
        # This path always drops non-alphabetic tokens.
        return [w for w in words if w.isalpha()]

    if is_alpha:
        # Default path: lower-case only, no alphabetic filter.
        return [w.lower() for w in words]

    return list(words)

In [7]:
# Quick manual check of the tokenizer on a string with irregular spacing
# around commas and full stops; the returned token list is the cell output.
tokenize('test, one ,two, three. four .')

# Get tokens for each Author

In [8]:
# Vocabulary per author: authors_tokens[initials] is the sorted array of
# distinct tokens (as returned by np.unique) found in the texts.
authors_tokens = {}

for author_initials in authors_initials:
    # Flatten every text of df_authors into one long token list.
    # NOTE(review): the texts are read from df_authors without filtering by
    # author_initials, so every listed author would get the same vocabulary.
    every_token = [token for sentence in df_authors['text'] for token in tokenize(sentence)]

    # np.unique removes duplicates and sorts the remaining tokens.
    authors_tokens[author_initials] = np.unique(every_token)

    print(f'author {author_initials} has {len(authors_tokens[author_initials])} unique tokens')

# Map token to index for each Author

In [9]:
# authors_tokens_map[initials][token] -> position of that token in
# authors_tokens[initials]; the position is used as the row/column index of
# the author's transition matrix.
authors_tokens_map = {}
for author_initials in authors_initials:
    authors_tokens_map[author_initials] = {
        token: position
        for position, token in enumerate(authors_tokens[author_initials])
    }

# Create Markov transition matrix for each Author

In [10]:
# One square float64 matrix of zeros per author, with one row and one column
# for every token in that author's vocabulary.
authors_matrix = {}
for author_initials in authors_initials:
    vocabulary_size = len(authors_tokens[author_initials])
    square_shape = (vocabulary_size, vocabulary_size)
    authors_matrix[author_initials] = np.zeros(square_shape, dtype=np.float64)

# Add frequency transition for each token, for each Author, for each text

In [11]:
# Count token -> next-token transitions for every text.
# transition_counts[i, j] is incremented once each time token j directly
# follows token i; the last token of a text is additionally linked back to the
# first one, so each text forms a closed cycle.
for author_initials in authors_initials:
    token_to_index = authors_tokens_map[author_initials]
    transition_counts = authors_matrix[author_initials]  # same array object, updated in place
    group_text = df_authors['text']

    for text in group_text:
        tokens = tokenize(text)
        if not tokens:
            # Nothing to count; also avoids indexing tokens[-1] on an empty list.
            continue

        # Pair every token with its successor; the final pair wraps around
        # from the last token to the first.
        successors = tokens[1:] + tokens[:1]
        for token1, token2 in zip(tokens, successors):
            try:
                transition_counts[token_to_index[token1], token_to_index[token2]] += 1
            except KeyError:
                # A token missing from the vocabulary map: report the pair and
                # skip the remainder of this text. Only KeyError is caught so
                # that any other failure (or an interrupt) still surfaces.
                print(author_initials, token1, token2)
                break

# Convert the frequency transition matrix into a probability transition matrix

In [12]:
# Total number of transitions leaving each token.
# axis=1 adds up the entries of every row, so element i of the result is the
# sum of row i. (axis=0 would instead add up each column, i.e. the number of
# transitions arriving at a token.)
authors_rows_sum = {}
for author_initials in authors_initials:
    authors_rows_sum[author_initials] = np.sum(authors_matrix[author_initials], axis=1, dtype=np.float64)

In [13]:
# Turn counts into probabilities row by row.
# [:, None] reshapes the per-token totals into a column vector, so broadcasting
# divides every entry of row i by total i. Entry [i][j] then reads as
# P(next token is j | current token is i) and each row sums to 1.
# ([None, :] would divide each column by a total instead.)
for author_initials in authors_initials:
    per_row_total = authors_rows_sum[author_initials][:, None]
    authors_matrix[author_initials] = np.divide(authors_matrix[author_initials], per_row_total, dtype=np.float64)

In [None]:
# Print, per author, the sum of every entry of the normalized matrix.
# Presumably a sanity check: if each token's probabilities sum to 1, the total
# should equal the number of tokens -- TODO confirm on the real data.
for author_initials in authors_initials:    
    print(f'{author_initials} {np.sum(authors_matrix[author_initials])}')

# Get emanation probability matrix

In [None]:
# Repeatedly square one author's probability matrix (P, P^2, P^4, ...) until
# every column holds a single value, i.e. all rows have become the same vector.

# Pick the author explicitly. This is the value a
# `for author_initials in authors_initials` loop leaves behind, but selecting
# it here means the cell does not rely on a leaked loop variable.
author_initials = authors_initials[-1]
matrix_emanation = authors_matrix[author_initials]

MAX_DOUBLINGS = 64        # hard stop so the cell cannot loop forever
CONVERGENCE_TOL = 1e-12   # largest allowed (max - min) inside any column

done = False
pot = 1  # exponent of the original matrix held in matrix_emanation

print('pot \t unique_values')

for _ in range(MAX_DOUBLINGS):
    matrix_emanation = matrix_power(matrix_emanation, 2)
    pot *= 2

    # Number of distinct values per column, summed over all columns.
    column_count = matrix_emanation.shape[1]
    unique_values = np.sum([len(np.unique(matrix_emanation[:, i])) for i in range(column_count)])

    print(f'{pot} \t {unique_values}')

    # Exact float equality within a column may never be reached because of
    # rounding, so a column spread within the tolerance also counts.
    column_spread = np.ptp(matrix_emanation, axis=0).max() if matrix_emanation.size else 0.0

    if unique_values == column_count or column_spread <= CONVERGENCE_TOL:
        done = True
        break

if not done:
    print(f'stopped after {MAX_DOUBLINGS} doublings without converging (last column spread {column_spread})')

# Replace the one-step matrix with the last computed power.
authors_matrix[author_initials] = matrix_emanation

In [None]:
# np.savetxt('emanation_eap.csv', matrix_emanation, delimiter=',')

In [None]:
# authors_matrix_emanation = {}
# for author_initials in authors_initials:    
#     for i in range(1, 65):
#         temp = matrix_power(authors_matrix[author_initials], i)
#         print(i, np.count_nonzero(temp), np.sum(temp, axis=0))
# #     authors_matrix_emanation[author_initials] = np.power(authors_matrix[author_initials], 60, dtype=np.float64)        

In [None]:
# np.count_nonzero(authors_matrix_emanation['EAP']), authors_matrix_emanation['EAP'].shape

# Test sample

In [None]:
# sample = df_authors['text'][0]
# sample_tokens = tokenize(sample)

In [None]:
# for i in range(len(sample_tokens) - 1):   
#     token1, token2 = sample_tokens[i], sample_tokens[i + 1]
#     token1_index, token2_index = authors_tokens_map['EAP'][token1], authors_tokens_map['EAP'][token2]
#     prob = authors_matrix['EAP'][token1_index][token2_index]
#     prob_emanation = authors_matrix_emanation['EAP'][token2_index][token1_index]
#     print(f'prob = {prob:.10f}\tprob_emanation = {prob_emanation:.10f}\ttokens = [{token1:<12}, {token2:<12}]')

In [None]:
# result = {}
# for author_initials in authors_initials:
#     result[author_initials] = np.float64(1.0)
    
# for i in range(len(sample_tokens) - 1):     
#     token1, token2 = sample_tokens[i], sample_tokens[i + 1]
#     for author_initials in authors_initials:
#         try:
#             token1_index, token2_index = authors_tokens_map[author_initials][token1], authors_tokens_map[author_initials][token2]
#             prob = authors_matrix[author_initials][token1_index][token2_index]
#             if prob == 0:
#                 prob = 1e-10
#             result[author_initials] *= prob    
#         except:
#             result[author_initials] *= 1e-10
# result

In [None]:
# for key, value in result.items():
#     print(f'{key} {np.log10(value)}')

In [None]:
# def softmax(x):
#     return np.exp(x)/np.sum(np.exp(x), axis=0)

In [None]:
# answer = softmax([np.log10(value) for value in result.values()])
# np.around(answer, decimals=6)

# Check output format

In [None]:
# df_submission = pd.read_csv('../input/sample_submission.csv')
# df_submission.head()

# Read test dataset

In [None]:
# df_test = pd.read_csv('../input/test.csv')
# df_test.head()

# Setup for output

In [None]:
# ids, texts = df_test.loc[:, 'id'], df_test.loc[:,  'text']

In [None]:
# result = {}
# for author_initials in authors_initials:
#     result[author_initials] = []

# Process output

In [None]:
# beta = np.float64(1e-3)
# for id, text in zip(ids, texts):
#     tokens = [word.lower() for word in word_tokenize(text) if word.isalpha()]
    
#     row_result = {}
#     for author_initials in authors_initials:
#         row_result[author_initials] = np.float64(1.0)
        
#     for author_initials in authors_initials:
#         for i in range(len(tokens) - 1):   
#             token1, token2 = tokens[i], tokens[i + 1]
#             try:
#                 token1_index, token2_index = authors_tokens_map[author_initials][token1], authors_tokens_map[author_initials][token2]
#                 prob = authors_matrix[author_initials][token1_index][token2_index]
#                 if prob <= 0:
#                     prob = beta
#             except:
#                 prob = beta
#             row_result[author_initials] *= np.float64(prob)
    
#     answer = softmax([np.log10(value + 1e-320, dtype=np.float64) for value in row_result.values()])
#     for author_initials, prob in zip(authors_initials, answer):          
#         result[author_initials].append(prob)

# Generate output

In [None]:
# output = []
# for id, p1, p2, p3 in zip(ids, result['EAP'], result['HPL'], result['MWS']):
#     output.append([id, p1, p2, p3])

In [None]:
# df_output = pd.DataFrame(output, columns=['id', 'EAP', 'HPL', 'MWS'])
# df_output.head()

In [None]:
# df_output.to_csv('test.csv', index=False)

In [None]:
# df_temp = pd.read_csv('test.csv')
# df_temp

In [None]:
# df_temp.isnull().any().any()

In [None]:
# np.savetxt("eap.csv", authors_matrix_emanation['MWS'], delimiter=",")