In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json
import nltk
import spacy
import seaborn as sns
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
from functools import partial
import re
import gensim
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS
import itertools
import collections
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the training table. Later cells read its 'Id' column (one JSON file is
# looked up per Id) and group on its 'cleaned_label' column.
train_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')

In [None]:
### Define Paths for Train and Test Json files
# Each directory is expected to hold one `<Id>.json` file per publication;
# json_to_text joins an Id plus '.json' onto one of these paths.
train_files_path = '../input/coleridgeinitiative-show-us-the-data/train'
test_files_path = '../input/coleridgeinitiative-show-us-the-data/test'

In [None]:
### Function to read JSON files and extract publication Text 

def json_to_text(filename, train_files_path=train_files_path, output='text'):
    """Read one publication's JSON file and return its text as a single string.

    The file is parsed as a list of section objects; each section is read via
    ``.get('section_title')`` and ``.get('text')``.

    Parameters
    ----------
    filename : str
        Publication Id; ``'.json'`` is appended to form the file name.
    train_files_path : str
        Directory containing the JSON file. Defaults to the training folder;
        pass the test folder to read test publications.
    output : str
        ``'text'`` -> section bodies only,
        ``'head'`` -> section titles only,
        anything else -> titles and bodies interleaved in document order.

    Returns
    -------
    str
        The selected pieces joined with single spaces.
    """
    json_path = os.path.join(train_files_path, filename + '.json')

    # Explicit encoding so the result does not depend on the platform default.
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    # A missing or null field comes back as None, which would make str.join
    # raise TypeError; treat it as an empty string instead.
    headings = [section.get('section_title') or '' for section in sections]
    contents = [section.get('text') or '' for section in sections]

    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)

    # Title followed by body, section by section.
    combined = [part for pair in zip(headings, contents) for part in pair]
    return ' '.join(combined)

In [None]:
### Extract Publication Text for Training Data
# tqdm.pandas() registers `progress_apply` on pandas objects so the per-row
# file reads below show a progress bar.
tqdm.pandas()
# One JSON file is opened per row, located from the row's 'Id'; with the
# default arguments json_to_text returns the section bodies joined by spaces.
train_df['text'] = train_df['Id'].progress_apply(json_to_text)

In [None]:
### Reading the Sample Submission Data
sample_sub = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')

### Extract Publication Text for the sample publications
# Same loader as for the training rows, but pointed at the test JSON folder.
sample_sub['text'] = sample_sub['Id'].apply(partial(json_to_text,train_files_path=test_files_path))

# Preview goes last: a notebook cell only displays its final expression, so a
# `.head()` placed mid-cell is evaluated and silently discarded.
sample_sub.head()

In [None]:
def lemmatization(text):
    """Lemmatize `text` with a spaCy pipeline, skipping stop-word tokens.

    NOTE(review): this reads a module-level `nlp` callable that is not created
    anywhere in the visible notebook - confirm a pipeline is loaded before
    this function is called.

    Returns the kept lemmas joined by single spaces.
    """
    kept_lemmas = []
    for token in nlp(text):
        if token.is_stop:
            continue
        kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)

def nltk_lemma(text):
    """Split `text` on whitespace and lemmatize each word with WordNet.

    Returns a list of lemmas (not a string); callers re-join it themselves.
    """
    wordnet = WordNetLemmatizer()
    return list(map(wordnet.lemmatize, text.split()))

def clean_text(txt):
    """Normalise `txt` for tokenisation.

    The input is coerced to str and lower-cased, every run of characters other
    than ASCII letters and digits is collapsed to a single space, and leading
    and trailing whitespace is trimmed.
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()

In [None]:
# NLTK's English stop-word list; the CountVectorizer in a later cell takes it
# as its `stop_words` argument.
stop_words = stopwords.words('english')

# Normalise -> lemmatize (yields a list of words) -> re-join into one string.
train_df['text'] = (
    train_df['text']
    .progress_apply(clean_text)
    .progress_apply(nltk_lemma)
    .apply(" ".join)
)

In [None]:
docs = train_df['text'].tolist()

# Bag-of-words counts over the training publications.
# max_df=0.85 ignores terms that occur in more than 85% of the documents;
# max_features caps the vocabulary at the 60000 most frequent terms.
cv = CountVectorizer(max_df=0.85, stop_words=stop_words, max_features=60000)
word_count_vector = cv.fit_transform(docs)

# Learn the IDF weights from the training counts.
tfidf_transformer = TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

# Vocabulary terms ordered by column index. `get_feature_names()` was
# deprecated in scikit-learn 1.0 and removed in 1.2, so prefer its
# replacement and fall back only on older installs. Kept as a list so the
# type matches what the old method returned.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = list(cv.get_feature_names_out())
else:
    feature_names = cv.get_feature_names()

In [None]:
def sort_coo(coo_matrix):
    """Return the (column, value) pairs of a COO matrix, highest value first.

    Pairs with equal values are ordered by column index, larger index first.
    """
    def by_value_then_col(pair):
        col, value = pair
        return (value, col)

    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=by_value_then_col, reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first `topn` (index, score) pairs to their feature names.

    Parameters: `feature_names` is indexable by column index; `sorted_items`
    is a sequence of (column index, score) pairs, already in the desired order.

    Returns a dict {feature name: score rounded to 3 decimals}, in the order
    the pairs were given.
    """
    return {
        feature_names[col]: round(weight, 3)
        for col, weight in sorted_items[:topn]
    }

In [None]:
Ids = train_df.Id.tolist()

# Collect one small frame per publication and concatenate once at the end.
# `DataFrame.append` was removed in pandas 2.0, and growing a frame inside a
# loop copies it on every iteration.
keyword_frames = []

for Id, doc in zip(Ids, docs):
    tfidf_vector = tfidf_transformer.transform(cv.transform([doc]))

    # (column, score) pairs in descending score order
    sorted_items = sort_coo(tfidf_vector.tocoo())

    # keep at most the 500 highest-scoring terms: {term: rounded score}
    keywords = extract_topn_from_vector(feature_names, sorted_items, 500)
    if not keywords:
        continue  # nothing to record for this publication

    # Spell out keys and values explicitly: what assigning a dict as a column
    # stores depends on the pandas version. The terms are also used as the
    # row index so the frame keeps a keyword-labelled index, which is what
    # `to_csv` writes out as the first column.
    terms = list(keywords.keys())
    keyword_frames.append(
        pd.DataFrame(
            {'keyword': terms, 'weight': list(keywords.values()), 'id': Id},
            index=terms,
        )
    )

if keyword_frames:
    keyword_df = pd.concat(keyword_frames)
else:
    keyword_df = pd.DataFrame(columns=['keyword', 'weight', 'id'])

### One row per (publication, keyword) with the keyword's TF-IDF weight
keyword_df[['id','keyword','weight']].head()

In [None]:
# Persist the keyword table. `to_csv` is called with its defaults, so the
# frame's index is written as the first (unnamed) CSV column.
keyword_df.to_csv('keywords_df.csv')
# Alternative kept for reference: reload a previously saved copy instead of
# recomputing (path presumably points at an attached Kaggle dataset - verify).
#keyword_df=pd.read_csv('../input/end-2-end-cosine-similarity/keywords_df.csv')

In [None]:
# Unique keywords per publication: one row per id, 'keyword' holds an array.
keyword_grpd = keyword_df.groupby('id')['keyword'].unique().reset_index()
df_train_key = pd.merge(train_df,keyword_grpd, left_on='Id', right_on='id')

# One row per dataset label, holding the union of the keywords of every
# publication that carries that label, as a space-separated string.
# The union is taken on the keyword arrays directly. Going through
# `str(array)` and stripping brackets/quotes depends on numpy's print
# options and is truncated with '...' once an array exceeds 1000 elements.
# Sorting makes the string deterministic; word order is irrelevant to the
# bag-of-words vectoriser that consumes it.
df_train_key_grpd = (
    df_train_key.groupby('cleaned_label')['keyword']
    .apply(lambda keyword_arrays: ' '.join(sorted(set().union(*keyword_arrays))))
    .reset_index()
)

In [None]:
# Persist the per-label keyword strings. Default `to_csv` arguments, so the
# index is written as an unnamed first column.
df_train_key_grpd.to_csv('df_train_key_grpd.csv')

In [None]:
#df_train_key_grpd.rename(columns={'Unnamed: 0':'ix'}, inplace=True)
# Expose the row position as an 'index' column; the final merge joins the
# matched row positions on it. Guarded so that re-running this cell does not
# stack extra 'level_0' columns (and eventually raise).
if 'index' not in df_train_key_grpd.columns:
    df_train_key_grpd.reset_index(inplace=True)

# Same normalisation as for the training text:
# clean -> lemmatize (list of words) -> re-join into one string.
tqdm.pandas()
sample_sub['text'] = sample_sub['text'].progress_apply(clean_text)
sample_sub['text'] = sample_sub['text'].progress_apply(nltk_lemma)
sample_sub['text'] = sample_sub['text'].apply(" ".join)

# NOTE: `docs`, `Ids` and `keyword_df` are rebound here and now describe the
# sample/test publications instead of the training ones.
docs = sample_sub['text'].tolist()
Ids = sample_sub.Id.tolist()

# Collect per-publication frames and concatenate once: `DataFrame.append`
# was removed in pandas 2.0 and is quadratic when used in a loop.
keyword_frames = []

for Id, doc in zip(Ids, docs):
    # Score the publication with the vocabulary and IDF fitted on training data.
    tfidf_vector = tfidf_transformer.transform(cv.transform([doc]))

    sorted_items = sort_coo(tfidf_vector.tocoo())

    # keep at most the 500 highest-scoring terms: {term: rounded score}
    keywords = extract_topn_from_vector(feature_names, sorted_items, 500)
    if not keywords:
        continue  # nothing to record for this publication

    # Explicit keys/values: what assigning a dict as a column stores depends
    # on the pandas version. The terms double as the row index so the frame
    # keeps a keyword-labelled index.
    terms = list(keywords.keys())
    keyword_frames.append(
        pd.DataFrame(
            {'keyword': terms, 'weight': list(keywords.values()), 'id': Id},
            index=terms,
        )
    )

if keyword_frames:
    keyword_df = pd.concat(keyword_frames)
else:
    keyword_df = pd.DataFrame(columns=['keyword', 'weight', 'id'])

keyword_df[['id','keyword','weight']].head()

In [None]:
# Unique keywords per test publication: one row per id, 'keyword' is an array.
keyword_grpd_test = keyword_df.groupby('id')['keyword'].unique().reset_index()
df_test_key = pd.merge(sample_sub,keyword_grpd_test, left_on='Id', right_on='id')

# One row per test publication with its keywords as a space-separated string.
# The union is taken on the keyword arrays directly. Going through
# `str(array)` and stripping brackets/quotes depends on numpy's print
# options and is truncated with '...' once an array exceeds 1000 elements.
# Sorting makes the string deterministic; word order is irrelevant to the
# bag-of-words vectoriser that consumes it.
df_test_key_grpd = (
    df_test_key.groupby('Id')['keyword']
    .apply(lambda keyword_arrays: ' '.join(sorted(set().union(*keyword_arrays))))
    .reset_index()
)

In [None]:
# Fit the vocabulary and IDF weights on the per-label keyword strings only
# (`fit` returns the fitted vectoriser itself).
tfidfvectoriser = TfidfVectorizer().fit(df_train_key_grpd['keyword'])

# Project both sides into that same space: one row per training label ...
dataset_tfidf_vectors = tfidfvectoriser.transform(df_train_key_grpd['keyword'])
# ... and one row per test publication.
test_tfidf_vectors = tfidfvectoriser.transform(df_test_key_grpd['keyword'])

In [None]:
# Dense matrix of shape (test publications, training labels).
# TfidfVectorizer L2-normalises each row by default, so the dot product of
# two rows is their cosine similarity.
# Use the sparse matrix's own `.dot`: `np.dot` is not sparse-aware and only
# works on these operands by accident of operator fallback.
pairwise_similarities = test_tfidf_vectors.dot(dataset_tfidf_vectors.T).toarray()

In [None]:
### - take the top 3 matching datasets for every test publication
top_k = 3

# Collect per-publication frames and concatenate once: `DataFrame.append`
# was removed in pandas 2.0 and is quadratic when used in a loop.
matching_frames = []

for i, val in enumerate(df_test_key_grpd.Id):
    scores = pairwise_similarities[i]

    # argsort is ascending, so reverse it for best-first and keep `top_k`.
    # The former slice `[:126:-1]` only produced three items when there were
    # exactly 130 candidate labels; this form is independent of that count.
    sim_index = np.argsort(scores)[::-1][:top_k]

    matching_frames.append(
        pd.DataFrame({
            'similar_dataset': sim_index,           # row position in df_train_key_grpd
            'similarity_score': scores[sim_index],
            'Id': val,
            'ix': i,
        })
    )

if matching_frames:
    matching_df = pd.concat(matching_frames)
else:
    matching_df = pd.DataFrame(columns=['similar_dataset', 'similarity_score', 'Id', 'ix'])
    

In [None]:
# Translate each matched row position ('similar_dataset') back into its
# dataset label by joining on the 'index' column of the per-label table.
matched_df_merge = matching_df[['similar_dataset','Id']].merge(
    df_train_key_grpd[['index','cleaned_label']],
    how='inner',
    left_on='similar_dataset',
    right_on='index',
)

# One row per test publication: its matched labels joined with '|', stored
# in the column name used for the submission.
sub_df = (
    matched_df_merge.groupby('Id')['cleaned_label']
    .apply('|'.join)
    .reset_index()
    .rename(columns={'cleaned_label':'PredictionString'})
)

In [None]:
#filter_matching_df = matching_df[matching_df.similarity_score>=0.08].copy()
#matched_df_merge = pd.merge(filter_matching_df[['similar_dataset','Id','ix']],
#                            df_train_key_grpd[['index','cleaned_label']], left_on='similar_dataset',right_on='index',
#                            how='inner')
#
#sub_df = matched_df_merge.groupby('Id')['cleaned_label'].apply('|'.join).reset_index()
#sub_df_merge = pd.merge(df_test_key_grpd,sub_df, on='Id', how='left')
#sub_df_merge['cleaned_label'].fillna('alzheimer s disease neuroimaging initiative adni|adni', inplace=True)
#sub_df_merge = sub_df_merge[['Id','cleaned_label']]
#sub_df_merge.rename(columns={'cleaned_label':'PredictionString'}, inplace=True)

In [None]:
# Write the submission file. `index = False` keeps the row index out of the
# CSV so only the frame's own columns are written.
sub_df.to_csv('submission.csv', index = False)
# Alternative output for the commented-out thresholded variant.
#sub_df_merge.to_csv('submission.csv', index = False)