In [None]:
!pip3 install ../input/datasketch/datasketch-1.5.3-py2.py3-none-any.whl

## Import libraries

In [None]:
import os
import re
import json
import gc
from itertools import repeat
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer
from datasketch import MinHash, MinHashLSHForest

## Helper functions

The `clean_text` function below should be used to clean text, as specified on the competition's Evaluation page.

In [None]:
def clean_text(txt):
    """Normalise ``txt`` to lowercase alphanumeric words separated by single spaces.

    The input is converted with ``str()`` first, so non-string values are
    accepted. Every run of characters outside ``A-Z``, ``a-z`` and ``0-9`` is
    collapsed to one space, and leading/trailing whitespace is removed.
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()

In [None]:
source_path = '../input/coleridgeinitiative-show-us-the-data'

In [None]:
train_df = pd.read_csv(f'{source_path}/train.csv')
sample_submission_df = pd.read_csv(f'{source_path}/sample_submission.csv')

In [None]:
train_df.info()

In [None]:
train_df.head()

Cleaning publication title

In [None]:
# Apply the competition's text normalisation to every publication title.
train_df['clean_pub_title'] = train_df['pub_title'].apply(clean_text)

In [None]:
train_df.head()

## WordCloud

In [None]:
# Concatenate all cleaned titles into one space-separated string for the word cloud.
pub_title_data = ' '.join(train_df['clean_pub_title'].tolist())

In [None]:
# Word cloud of the cleaned publication titles, with the package's stop words removed.
title_cloud_builder = WordCloud(
    width=3000,
    height=2000,
    random_state=1,
    background_color='black',
    colormap='Set2',
    collocations=False,
    stopwords=STOPWORDS,
)
wordcloud = title_cloud_builder.generate(pub_title_data)
plt.figure(figsize=(40, 30))
plt.imshow(wordcloud)
plt.axis("off")

## Token frequency in publication titles

In [None]:
vectorizer = CountVectorizer(stop_words='english')
count_m = vectorizer.fit_transform(train_df['clean_pub_title'])

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when the
# installed version provides it and fall back to the old name otherwise.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Sum the sparse document-term matrix column-wise directly instead of densifying it first.
token_counts = np.asarray(count_m.sum(axis=0)).ravel().tolist()

# One row per vocabulary token, ordered from least to most frequent.
count_df = pd.DataFrame({'tokens': list(feature_names), 'count': token_counts})
count_df = count_df.sort_values(by='count', ascending=True)

In [None]:
count_df.head(10)

In [None]:
count_df.tail(10)

The 50 least frequent tokens in publication titles

In [None]:
# count_df is sorted ascending by count, so its first 50 rows are the rarest tokens.
least_frequent = count_df.iloc[:50]
plt.figure(figsize=(15, 15))
sns.pointplot(x=least_frequent['tokens'], y=least_frequent['count'], linestyles="-")
plt.xlabel("tokens")
plt.ylabel("frequency")
plt.xticks(rotation=90)
plt.show()

The 50 most frequent tokens in publication titles

In [None]:
# count_df is sorted ascending by count, so its last 50 rows are the most common tokens.
most_frequent = count_df.iloc[-50:]
plt.figure(figsize=(15, 15))
sns.pointplot(x=most_frequent['tokens'], y=most_frequent['count'], color="green", linestyles="-")
plt.xlabel("tokens")
plt.ylabel("frequency")
plt.xticks(rotation=90)
plt.show()

Calculating length of the publication title

In [None]:
# Character length of each cleaned publication title.
train_df['pub_title_len'] = train_df['clean_pub_title'].map(len)

Publication title with longest length

In [None]:
# Look the longest title up once and reuse it for both lines of output.
longest_title = train_df.iloc[train_df.pub_title_len.argmax()].clean_pub_title
print(f'Length: {len(longest_title)}')
print(f'Publication title: {longest_title}')

In [None]:
# Title lengths with their titles, ordered from shortest to longest.
# sort_values returns a new frame, so train_df itself is left untouched.
title_length_columns = ['pub_title_len', 'clean_pub_title']
temp_df = train_df[title_length_columns].sort_values(by='pub_title_len', ascending=True)

The 50 shortest publication titles

In [None]:
# temp_df is sorted ascending by length, so its first 50 rows are the shortest titles.
shortest_titles = temp_df.iloc[:50]
plt.figure(figsize=(15, 15))
sns.pointplot(x=shortest_titles.index, y=shortest_titles['pub_title_len'], linestyles="-")
plt.xlabel("row index")
plt.ylabel("title length")
plt.xticks(rotation=90)
plt.show()

The 50 longest publication titles

In [None]:
# temp_df is sorted ascending by length, so its last 50 rows are the longest titles.
longest_titles = temp_df.iloc[-50:]
plt.figure(figsize=(15, 15))
sns.pointplot(x=longest_titles.index, y=longest_titles['pub_title_len'], linestyles="-")
plt.xlabel("row index")
plt.ylabel("title length")
plt.xticks(rotation=90)
plt.show()

## Merge JSON files

### Train - JSON to DataFrame

In [None]:
train_json_source = f'{source_path}/train'
test_json_source = f'{source_path}/test'

In [None]:
%%time
# Resolve each publication Id to its first cleaned_label once, instead of filtering
# the whole of train_df again for every JSON file.
first_label_by_id = train_df.drop_duplicates(subset='Id').set_index('Id')['cleaned_label']

temp_json_list = []   # one DataFrame of sections per publication
temp_file_name = []   # publication Id repeated once per section row
temp_label = []       # that publication's label repeated once per section row
for json_data in os.listdir(train_json_source):
    pub_id = json_data.replace('.json', '')
    # Parse each file a single time; the frame has one row per JSON record, so its
    # length gives the number of rows to tag without loading the file again.
    sections = pd.read_json(f'{train_json_source}/{json_data}', orient='records')
    temp_json_list.append(sections)
    temp_file_name.extend(repeat(pub_id, len(sections)))
    temp_label.extend(repeat(first_label_by_id[pub_id], len(sections)))

In [None]:
train_json = pd.concat(temp_json_list, ignore_index=True)
train_json['file'] = temp_file_name
train_json['cleaned_label'] = temp_label
train_json.shape

In [None]:
del temp_json_list, temp_file_name
gc.collect()

In [None]:
# Group the section rows by the publication file they were read from.
train_groups = train_json.groupby(train_json['file'])
# Within each group the rows are sorted by 'file', which is the grouping key itself.
# NOTE(review): the lambda parameter shadows the outer `train_json` name.
train_groups = train_groups.apply(lambda train_json: train_json.sort_values(by=['file']))
# Remove the 'file' column before export.
# NOTE(review): presumably the file Id is still carried by the index that apply() builds — confirm for the pandas version in use.
train_groups.drop(['file'], inplace=True, axis=1)
# Write the grouped sections to train_json.csv in the working directory.
train_groups.to_csv('train_json.csv')

In [None]:
train_groups

In [None]:
del train_groups

### Test - JSON to DataFrame

In [None]:
%%time
# Labels come from train.csv. A test publication is not guaranteed to be listed there,
# so an unknown Id is tagged with None instead of raising IndexError.
first_label_by_id = train_df.drop_duplicates(subset='Id').set_index('Id')['cleaned_label']

temp_json_list = []   # one DataFrame of sections per publication
temp_file_name = []   # publication Id repeated once per section row
temp_label = []       # known label (or None) repeated once per section row
for json_data in os.listdir(test_json_source):
    pub_id = json_data.replace('.json', '')
    # Parse each file a single time; the frame has one row per JSON record, so its
    # length gives the number of rows to tag without loading the file again.
    sections = pd.read_json(f'{test_json_source}/{json_data}', orient='records')
    temp_json_list.append(sections)
    temp_file_name.extend(repeat(pub_id, len(sections)))
    temp_label.extend(repeat(first_label_by_id.get(pub_id), len(sections)))

In [None]:
len(temp_label), len(temp_file_name)

In [None]:
test_json = pd.concat(temp_json_list, ignore_index=True)
test_json['file'] = temp_file_name
test_json['cleaned_label'] = temp_label
test_json.shape

In [None]:
del temp_json_list, temp_file_name
gc.collect()

In [None]:
test_json.head()

In [None]:
# Group the section rows by the publication file they were read from.
test_groups = test_json.groupby(test_json['file'])
# Within each group the rows are sorted by 'file', which is the grouping key itself.
# NOTE(review): the lambda parameter shadows the outer `test_json` name.
test_groups = test_groups.apply(lambda test_json: test_json.sort_values(by=['file']))
# Remove the 'file' column before export.
# NOTE(review): presumably the file Id is still carried by the index that apply() builds — confirm for the pandas version in use.
test_groups.drop(['file'], inplace=True, axis=1)
# Write the grouped sections to test_json.csv in the working directory.
test_groups.to_csv('test_json.csv')

In [None]:
test_groups

In [None]:
del test_groups

## Submission

In [None]:
# Normalise every section's text; the LSH index below is built from this column.
train_json['clean_text'] = train_json['text'].apply(clean_text)

### LSH

* Setting number of permutations
* Setting number of recommendations to return  
* Setting depth of LSH Forest
* Preparing shingles
* MinHashing all the shingles
* Preparing MinHashForest of MinHash
* Indexing forest
* Querying forest
* Calculating jaccard similarity or cosine similarity (Post-processing)

Reference: http://ekzhu.com/datasketch/

In [None]:
class LSH:
    """MinHash LSH-Forest index over the ``clean_text`` column of a DataFrame.

    Usage order: ``minhash_data()`` -> ``prepare_forest()`` -> ``query_forest(...)``.
    Rows are keyed in the forest by their position in ``dataframe``; a query
    returns the ``cleaned_label`` values of the best-matching rows.
    """

    def __init__(self, permutations, number_of_recommendations, depth, dataframe):
        """Store the configuration; nothing is hashed or indexed yet.

        Args:
            permutations: ``num_perm`` used for every MinHash and for the forest.
            number_of_recommendations: number of candidates requested from the
                forest for each query, before re-ranking.
            depth: passed to ``MinHashLSHForest`` as its ``l`` argument.
            dataframe: frame providing the ``clean_text`` and ``cleaned_label`` columns.
        """
        self.permutations = permutations
        self.number_of_recommendations = number_of_recommendations
        self.depth = depth
        self.dataframe = dataframe
        self.minhash = []
        self.forest = None

    def _minhash_of(self, tokens):
        """Return a MinHash built from ``tokens`` (an iterable of strings)."""
        min_hash = MinHash(num_perm=self.permutations)
        # A repeated token cannot change a MinHash signature, so hash each distinct token once.
        for token in set(tokens):
            min_hash.update(token.encode('utf-8'))
        return min_hash

    def minhash_data(self):
        """Compute one MinHash per row of ``dataframe['clean_text']`` into ``self.minhash``."""
        # Build a fresh list so that running this step again does not append duplicates.
        self.minhash = [self._minhash_of(text.split(' ')) for text in self.dataframe['clean_text']]

    def prepare_forest(self):
        """Insert every computed MinHash into a new forest and index it for querying."""
        self.forest = MinHashLSHForest(num_perm=self.permutations, l=self.depth)
        for position, signature in enumerate(self.minhash):
            self.forest.add(position, signature)
        self.forest.index()
        # Release the signatures but keep the attribute, so minhash_data() can be run again.
        self.minhash = []
        gc.collect()

    def query_forest(self, query, number_of_results, cosine_sim=False):
        """Return the labels of the rows most similar to ``query``, joined with ``'|'``.

        Args:
            query: text already cleaned the same way as ``clean_text``.
            number_of_results: maximum number of re-ranked candidates to keep.
            cosine_sim: rank candidates with ``cosine_similarity`` instead of
                ``jaccard_similarity``.

        Returns:
            The distinct ``cleaned_label`` values of the kept candidates, best
            match first, joined with ``'|'`` (empty string when there are none).
        """
        query_tokens = query.split(' ')
        candidates = self.forest.query(self._minhash_of(query_tokens), self.number_of_recommendations)
        labels = self.dataframe['cleaned_label']
        if cosine_sim:
            scored = [(key, self.cosine_similarity(labels.iloc[key], query)) for key in candidates]
        else:
            # The label is a string; jaccard_similarity tokenises it into words.
            scored = [(key, self.jaccard_similarity(labels.iloc[key], query_tokens)) for key in candidates]
        ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)[:number_of_results]
        top_positions = [position for position, _ in ranked]
        # dict.fromkeys removes duplicates while keeping the ranking order, so the
        # output is deterministic (a plain set would not be).
        return '|'.join(dict.fromkeys(labels.iloc[top_positions].tolist()))

    @staticmethod
    def _token_set(value):
        """Return the distinct tokens of ``value``: words of a string, or items of an iterable."""
        if isinstance(value, str):
            return set(value.split())
        return set(value)

    def jaccard_similarity(self, l1, l2):
        """Return |A ∩ B| / |A ∪ B| for the token sets of ``l1`` and ``l2``.

        Each argument may be a whitespace-separated string or an iterable of
        tokens. Returns 0.0 when both are empty.
        """
        first = self._token_set(l1)
        second = self._token_set(l2)
        union = first | second
        if not union:
            return 0.0
        return len(first & second) / len(union)

    def cosine_similarity(self, string1, string2):
        """Return the cosine similarity of two strings.

        If a callable named ``nlp`` exists in the module namespace, it is used as
        before (``nlp(a).similarity(nlp(b))``). Otherwise the similarity of the
        two strings' word-count vectors is returned, so the method works without
        that global. Returns 0.0 when either string has no words.
        """
        model = globals().get('nlp')
        if model is not None:
            return model(string1).similarity(model(string2))

        from collections import Counter

        counts1 = Counter(string1.split())
        counts2 = Counter(string2.split())
        dot = sum(counts1[token] * counts2[token] for token in counts1.keys() & counts2.keys())
        norm = (sum(c * c for c in counts1.values()) * sum(c * c for c in counts2.values())) ** 0.5
        return dot / norm if norm else 0.0
    

In [None]:
obj = LSH(permutations=128, number_of_recommendations=20, depth=10, dataframe=train_json)

In [None]:
%%time
obj.minhash_data()

In [None]:
%%time
obj.prepare_forest()

In [None]:
%%time
def _predict_for_publication(pub_id):
    """Return the '|'-joined predicted labels for one publication Id of the submission file."""
    # Query with the publication's own text from test_json. Looking the Id up in
    # train_json raises IndexError for publications outside the training set, and it
    # feeds the known training label back in as the query.
    section_text = test_json.loc[test_json['file'] == str(pub_id), 'text']
    if section_text.empty:
        return ''
    return obj.query_forest(clean_text(' '.join(section_text.astype(str))), 10)


sample_submission_df['PredictionString'] = sample_submission_df.Id.apply(_predict_for_publication)

In [None]:
sample_submission_df.to_csv('submission.csv', index=False)
sample_submission_df

<h3 align="center" style="background-color:#003300;color:white;">Thanks! More updates to come. WIP</h3> 