In [None]:
!pip install textacy

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import nltk
from nltk.stem import WordNetLemmatizer
import string
from sklearn.model_selection import train_test_split
import re
from textacy.viz.termite import draw_termite_plot
# using binary relevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB# initialize binary relevance multi-label classifier
from sklearn.svm import SVC
import pickle

# Bird's-Eye View of the Data

In [None]:
df_train = pd.read_csv("/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv");

In [None]:
df_train.head()

In [None]:
df_train.nunique()

In [None]:
df_train['dataset_title'].unique()

In [None]:
df_train['dataset_label'].unique()

**Rows where the dataset label differs from the dataset title**

In [None]:
df_train[df_train['dataset_title']!=df_train['dataset_label']].loc[:,['pub_title','dataset_title','dataset_label']]

# Loading JSON Files

In [None]:
def data(filename, base_dir="/kaggle/input/coleridgeinitiative-show-us-the-data/train/"):
    """Return the full text of one training publication as a single string.

    Parameters
    ----------
    filename : str
        Publication Id, i.e. the JSON file name without the ``.json`` suffix.
    base_dir : str or path-like, optional
        Directory that contains the JSON files. Defaults to the Kaggle
        training folder used throughout this notebook.

    Returns
    -------
    str
        The ``text`` field of every record in the file, joined with a single
        space. An empty string is returned when the file holds no records.
    """
    df_json = pd.read_json(Path(base_dir) / f"{filename}.json")
    if "text" not in df_json.columns:
        # An empty JSON array yields a frame without columns.
        return ""
    # Join with a space so the last word of one section is not fused with the
    # first word of the next one.
    return " ".join(df_json["text"].fillna("").astype(str))

In [None]:
# Read each publication file once, then map the text back onto every row.
# An Id that occurs on several rows would otherwise be re-read and re-parsed
# for each occurrence.
text_by_id = {pub_id: data(pub_id) for pub_id in df_train['Id'].unique()}
df_train['json_text'] = df_train['Id'].map(text_by_id)

In [None]:
df_train.head()

# Know your Data

In [None]:
df_train['dataset_title'].value_counts().plot( kind='bar', figsize=(15,10))

In [None]:
df_train['dataset_label'].value_counts().plot( kind='bar', figsize=(20,5))

In [None]:
def clean_text(txt):
    """Lower-case ``txt`` and replace each run of non-alphanumeric characters with one space.

    ``txt`` is converted with ``str()`` first, so non-string values are accepted.
    Leading/trailing separators are kept as a single space (no stripping).
    """
    lowered = str(txt).lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered)
    

In [None]:
# TF-IDF vectorizer whose vocabulary is learned from the dataset titles.
# NOTE: per the scikit-learn docs a custom ``preprocessor`` replaces the
# built-in strip_accents/lowercase stage, so clean_text alone decides how the
# text is normalised before tokenisation.
vectorizer = TfidfVectorizer(
    min_df=3, max_df=0.2, max_features=None,
    strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    # Documented as booleans; integer 1 is rejected by the parameter
    # validation of recent scikit-learn releases.
    use_idf=True, smooth_idf=True, sublinear_tf=True,
    stop_words='english', preprocessor=clean_text)
vectorizer.fit(df_train['dataset_title'])

In [None]:
def create_tf_matrix(category):
    """Vectorize the cleaned publication texts of all rows whose dataset_title equals ``category``.

    Uses the notebook-level ``vectorizer`` and ``df_train``; returns whatever
    ``vectorizer.transform`` produces for those rows.
    """
    in_category = df_train['dataset_title'] == category
    cleaned_texts = df_train.loc[in_category, 'json_text'].apply(clean_text)
    return vectorizer.transform(cleaned_texts)

def create_term_freq(matrix, cat):
    """Build a two-column frame of vocabulary terms and their summed weights.

    ``matrix`` is summed over its rows; each term of the notebook-level
    ``vectorizer`` vocabulary is paired with the total found in its column.
    The result has columns ``'Terms'`` and ``cat`` and is ordered by weight,
    largest first.
    """
    column_totals = matrix.sum(axis=0)
    term_weights = []
    for term, column in vectorizer.vocabulary_.items():
        term_weights.append((term, column_totals[0, column]))
    # Stable sort: terms with equal weight keep their vocabulary order.
    term_weights.sort(key=lambda pair: pair[1], reverse=True)
    return pd.DataFrame(term_weights, columns=['Terms', cat])

# Number of highest-weighted terms kept for every dataset title.
TOP_N_TERMS = 5

# Start from an explicit empty state so the loop neither depends on which
# title happens to come first nor reuses a frame left over from a previous run.
df_top5_words = None
for cat in df_train.dataset_title.unique():
    print(f"Top {TOP_N_TERMS} terms for: ", cat)
    df_right = create_term_freq(create_tf_matrix(cat), cat).head(TOP_N_TERMS)
    print(df_right)
    print("###############")
    if df_top5_words is None:
        df_top5_words = df_right.copy()
    else:
        # Outer merge on the shared 'Terms' column: one row per term, one
        # column per dataset title, NaN where a term is not in a title's top list.
        df_top5_words = df_top5_words.merge(df_right, how='outer')
    print(df_top5_words.shape)

In [None]:
# Rebind instead of mutating in place, and only move 'Terms' into the index
# while it is still a column, so re-running this cell does not raise KeyError.
df_top5_words = df_top5_words.fillna(0)
if 'Terms' in df_top5_words.columns:
    df_top5_words = df_top5_words.set_index('Terms')
df_top5_words.shape

In [None]:
df = df_top5_words.copy()
# Column-wise min-max scaling to [0, 1]. The minimum has to be subtracted in
# the numerator as well, otherwise columns with a non-zero minimum are scaled
# incorrectly.
col_min = df.min()
col_range = df.max() - col_min
# A constant column has zero range; dividing by 1 keeps it at 0 instead of NaN/inf.
df_norm = (df - col_min) / col_range.replace(0, 1)

In [None]:
# Positions of the columns passed to the plot as highlight_cols.
highlighted = [0, 4, 12,20,30,36]
draw_termite_plot(
    np.array(df_norm.values),
    df_top5_words.columns,
    df_top5_words.index,
    highlight_cols=highlighted,
)

# Prepare Dataset

In [None]:
Y = pd.get_dummies(df_train['dataset_title'])

In [None]:
# TF-IDF vectorizer fitted on the full publication texts, capped at 10000
# features. This rebinds the notebook-level name ``vectorizer``.
# NOTE: per the scikit-learn docs a custom ``preprocessor`` replaces the
# built-in strip_accents/lowercase stage, so clean_text alone decides how the
# text is normalised before tokenisation.
vectorizer = TfidfVectorizer(
    min_df=3, max_df=0.2, max_features=10000,
    strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    # Documented as booleans; integer 1 is rejected by the parameter
    # validation of recent scikit-learn releases.
    use_idf=True, smooth_idf=True, sublinear_tf=True,
    stop_words='english', preprocessor=clean_text)
vectorizer.fit(df_train['json_text'])

In [None]:
X = vectorizer.transform(df_train.json_text.apply(clean_text))

In [None]:
X.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

# Create Model

In [None]:

# Binary relevance: one independent SVC is trained per label column.
base_estimator = SVC()
classifier = BinaryRelevance(classifier=base_estimator, require_dense=[False, True])

# Train on the training split, then predict the held-out split.
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

# For multi-label targets accuracy_score is the subset accuracy: a sample
# counts as correct only when every label matches.
subset_accuracy = accuracy_score(y_test, predictions)
print("Accuracy = ", subset_accuracy)


# Save model

In [None]:
# File the fitted classifier is written to, relative to the working directory.
Pkl_Filename = "model.pkl"  

# Serialise the fitted classifier so later cells can reload it without retraining.
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(classifier, file)

# Analyse Model

In [None]:
from sklearn.metrics import multilabel_confusion_matrix

In [None]:
multilabel_confusion_matrix(y_test,predictions)

# Predict Model

In [None]:
df_submission = pd.read_csv("/kaggle/input/coleridgeinitiative-show-us-the-data/sample_submission.csv");

In [None]:
def datatest(filename, base_dir="/kaggle/input/coleridgeinitiative-show-us-the-data/test/"):
    """Return the full text of one test publication as a single string.

    Parameters
    ----------
    filename : str
        Publication Id, i.e. the JSON file name without the ``.json`` suffix.
    base_dir : str or path-like, optional
        Directory that contains the JSON files. Defaults to the Kaggle test
        folder used throughout this notebook.

    Returns
    -------
    str
        The ``text`` field of every record in the file, joined with a single
        space. An empty string is returned when the file holds no records.
    """
    df_json = pd.read_json(Path(base_dir) / f"{filename}.json")
    if "text" not in df_json.columns:
        # An empty JSON array yields a frame without columns.
        return ""
    # Join with a space so the last word of one section is not fused with the
    # first word of the next one.
    return " ".join(df_json["text"].fillna("").astype(str))

In [None]:
df_submission['json_text'] = df_submission['Id'].apply(lambda x : datatest(x))

In [None]:
X_test = vectorizer.transform(df_submission.json_text.apply(clean_text))

In [None]:
predictions = classifier.predict(X_test)

In [None]:
print(predictions)

Load the model from the pickle file to avoid re-running the training while working only on the cells below.

In [None]:
# Same file name the "Save model" cell writes to.
Pkl_Filename = "model.pkl"  
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load pickles this notebook created itself.
with open(Pkl_Filename, 'rb') as file:
    pickle_model = pickle.load(file)

In [None]:
predictions = pickle_model.predict(X_test)

In [None]:
df1 = pd.DataFrame(predictions.toarray(),columns=Y.columns)

In [None]:
def updateSubmission(Id, predictions_df=None):
    """Return the name of the first label predicted for row ``Id``.

    Parameters
    ----------
    Id : hashable
        Index label of the row to look up.
    predictions_df : pandas.DataFrame, optional
        Frame of 0/1 predictions with one column per label. Defaults to the
        notebook-level ``df1``.

    Returns
    -------
    str
        The first column whose value is 1 in that row, or ``""`` when no
        label was predicted for the row (previously an IndexError).
    """
    frame = df1 if predictions_df is None else predictions_df
    predicted = frame.columns[(frame.loc[Id] == 1).to_numpy()]
    return predicted[0] if len(predicted) else ""

In [None]:
def _first_predicted_label(row_id):
    """Return the first column of df1 flagged 1 for ``row_id``, or "" when none is."""
    flagged = df1.columns[(df1.loc[row_id] == 1).to_numpy()]
    # The classifier may predict no label at all for a row; indexing [0]
    # unconditionally would raise IndexError there.
    return flagged[0] if len(flagged) else ""


df_submission['PredictionString'] = df_submission.index.map(_first_predicted_label)

In [None]:
df_submission['PredictionString'] = df_submission['PredictionString'].apply(clean_text)

In [None]:
# drop() is not in place and its result is not assigned, so this cell only
# displays the frame without json_text; df_submission itself still has the column.
df_submission.drop(columns =['json_text'])

In [None]:
# json_text holds the full publication text and was only needed to build the
# features; leave it out of the submission file. errors='ignore' keeps the
# cell working if the column has already been removed.
df_submission.drop(columns=['json_text'], errors='ignore').to_csv('submission.csv', index=False)

Next, starting from the predicted dataset title, we need to find matching phrases in the JSON text to produce the final output.

**Work in progress – please upvote or comment if you find anything interesting.**