In [None]:
# Start writing code here...
import numpy as np 
import pandas as pd
import nltk

pd.options.mode.chained_assignment = None 

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tag import pos_tag
import os, re
import json
import matplotlib.pyplot as plt

import nltk
from nltk.probability import FreqDist
from wordcloud import WordCloud, STOPWORDS
from nltk.stem import WordNetLemmatizer
  
import seaborn as sns
sns.set_style('whitegrid')
    
from IPython.display import display, clear_output
    
from tqdm import tqdm
import string

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

os.listdir('/kaggle/input/coleridgeinitiative-show-us-the-data/')

tqdm.pandas()

In [None]:
# Share of the rows that goes to the training split (passed as `train_size` to train_test_split).
TRAIN_SPLIT = 0.9
# Passed as `nr_of_sentences` to cut_sentences when the training rows are built.
SENTENCES_TO_EXTRACT = 10
# Fraction of the most frequent label's rows that is dropped before the split.
DROP_FRAC = 0.8

In [None]:
# Load the training metadata and the sample submission from the competition input folder.
train_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')
sample_sub = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')

# Folder with one JSON file per publication; files are read below as <Id>.json.
train_files_path = '../input/coleridgeinitiative-show-us-the-data/train'
# Presumably the equivalent folder for the test publications — not read in this notebook.
test_files_path = '../input/coleridgeinitiative-show-us-the-data/test'

In [None]:
# Reuse the pre-processed text dataset when it is attached to the notebook; otherwise
# fall back to a local file that is generated further down (create_text_ds == True).
create_text_ds = True

try:
    CSV_PATH = "../input/colerigde-processed-text/text.csv"
    # Only readability matters here (the file is loaded in full later on),
    # so a single row is enough to probe it.
    pd.read_csv(CSV_PATH, nrows=1)
    create_text_ds = False
except (OSError, ValueError):
    # OSError covers a missing/unreadable file; pandas' EmptyDataError and ParserError
    # are ValueError subclasses. Anything else is a real problem and should surface.
    print("No text dataset")
    CSV_PATH = 'text.csv'

In [None]:
train_df.head()

In [None]:
# Number of rows per dataset label, most frequent first.
# The column names are set explicitly: the names produced by
# `value_counts().reset_index()` differ between pandas versions, and the cells below
# expect the label in 'index' and its count in 'dataset_label'.
grouped_df = (
    train_df['dataset_label']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='dataset_label')
    .sort_values(by="dataset_label", ascending=False)
)
grouped_df.head()

In [None]:
grouped_df = grouped_df.iloc[0:25]

In [None]:
# Bar chart of the label counts held in grouped_df.
fig, ax = plt.subplots(figsize=(15,5))

ax.bar(grouped_df['index'].values, grouped_df['dataset_label'].values)
# Rotate the tick labels that the bar plot already created. set_xticklabels is only
# meant to be used after the tick positions have been fixed explicitly.
ax.tick_params(axis='x', labelrotation=90)
ax.set_ylabel("Number of occurances")
ax.set_xlabel("Dataset name")

# Render the figure (plt.plot() with no arguments draws nothing).
plt.show()

In [None]:
train_df.info()

In [None]:
sample_sub.head()

In [None]:
print("Nr. of unique labels: {}".format(len(train_df['cleaned_label'].unique())))

In [None]:
# Check for missing values
train_df.isna().sum()

In [None]:
# Count the ids listed in train.csv (all rows and distinct values) and the regular
# files found in the train folder.
nr_train_csv_ids = train_df['Id'].count()
nr_unique_train_csv_ids = train_df['Id'].nunique()
nr_train_files = sum(
    1
    for name in os.listdir(train_files_path)
    if os.path.isfile(os.path.join(train_files_path, name))
)

# Report the three counts and the average number of rows per distinct id.
print("Data in train.csv file: {}".format(nr_train_csv_ids))
print("\n")
print("Unique data in train.csv file: {}".format(nr_unique_train_csv_ids))
print("Data in train folders: {}".format(nr_train_files))
print("\n")
avg_sources = nr_train_csv_ids/nr_unique_train_csv_ids
print("Avg. nr. of sources per publication: {}".format(avg_sources))

# Every distinct id in train.csv must correspond to exactly one file in the train folder.
assert nr_unique_train_csv_ids == nr_train_files

In [None]:
# Load the stopword list: one entry per line of the file.
stopwords = []
with open("../input/stopwords/stopwords.txt", "r") as fd:
    stopwords = fd.read().splitlines()

print(stopwords)

In [None]:
# Tokenise every row of train_df on whitespace and drop the stopwords.
# NOTE(review): each row is stringified as a whole (str() of the row's value array), so
# tokens come from every column and can carry array punctuation such as brackets and
# quotes — confirm this is intended for a plot titled "... in label".

# A set gives constant-time membership tests; the list was scanned once per token.
stopword_set = set(stopwords)

words = list(train_df.values)

# One list of kept tokens per row ...
split_words = [
    [w for w in str(word).split() if w not in stopword_set]
    for word in words
]

# ... and all rows flattened into a single token list.
allwords = [w for wordlist in split_words for w in wordlist]

In [None]:
# Token frequencies, computed once and reused for both figures.
word_freq = FreqDist(allwords)

mostcommon = word_freq.most_common(100)

# Hand the counts to the word cloud directly. Generating from str(mostcommon) makes the
# cloud re-tokenise the printed list, in which every word occurs once, so the counts
# that should drive the word sizes are lost. Stopwords are filtered here because
# generate_from_frequencies does not apply the `stopwords` option.
cloud_frequencies = {w: c for w, c in mostcommon if w.lower() not in STOPWORDS}
wordcloud = WordCloud(width=1000, height=500, background_color='white').generate_from_frequencies(cloud_frequencies)

fig = plt.figure(figsize=(15,10), facecolor='white')
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
# Title first, so tight_layout can take it into account.
plt.title('Top 100 Most Common Words in label')
plt.tight_layout(pad=0)
plt.show()

# Bar chart of the 25 most frequent tokens.
mostcommon_small = word_freq.most_common(25)
x, y = zip(*mostcommon_small)
plt.figure(figsize=(15,5))
plt.margins(0.02)
plt.bar(x, y)
plt.xlabel('Words')
plt.ylabel('Frequency of Words')
plt.xticks(rotation=60)
plt.title('Freq of 25 Most Common Words in label')
plt.tight_layout(pad=0)
plt.show()

In [None]:
train_df.head()

## Retrieve document from random publication Id

In [None]:
# Pick one publication id. The generator is seeded so that the cells below show the
# same publication after every Restart & Run All.
pub_id = np.random.default_rng(42).choice(train_df['Id'].unique())

# An id can occur on several rows of train.csv; take the title from the first one so
# that a plain string is printed instead of a whole Series.
pub_title = train_df.loc[train_df["Id"] == pub_id, "pub_title"].iloc[0]

# The publication's JSON file is named after its id.
pub_df = pd.read_json(os.path.join(train_files_path, str(pub_id)+".json"))

print("Random publication ID: {}".format(pub_id))
print("Random publication name: {}".format(pub_title))
print("\n")

pub_df.head()

In [None]:
train_df[train_df["Id"] == pub_id]

In [None]:
pub_dataset_titles = train_df[train_df["Id"] == pub_id].dataset_title.values

print("Nr. of sections in this publication: {}".format(pub_df["section_title"].count()))
print("Datasets in this publication: {}".format(pub_dataset_titles))

In [None]:
# Add a lowercased copy of each section's text in a new 'lowercase_text' column
pub_df['lowercase_text'] = pub_df["text"].str.lower()

pub_df.head()

In [None]:
# Lowercased dataset titles of the selected publication.
# Built in one go: np.append copies the whole array on every call.
clean_datasets = np.array([dataset.lower() for dataset in pub_dataset_titles])

print(clean_datasets)

In [None]:
def extract_ds_sentence(text, dataset_title):
    """Return every sentence-like span of `text` that mentions `dataset_title`.

    A span consists of characters other than '.', must contain the title verbatim
    (case-sensitive) and ends at the next '.'.

    Args:
        text: String to search.
        dataset_title: Literal title to look for; regex metacharacters in it are escaped.

    Returns:
        List of the matching spans (each ending in '.') in order of appearance;
        empty when the title does not occur in front of a '.'.
    """
    # Escape exactly once. Escaping the already-escaped title makes the pattern look
    # for literal backslashes, so titles with spaces or punctuation never match.
    escaped_ds_title = re.escape(dataset_title)
    return re.findall(r"([^.]*?" + escaped_ds_title + r"[^.]*\.)", text)

## Cleaning the data

In [None]:
# Distinct publication ids found in train.csv.
all_IDs = train_df['Id'].unique()

print(type(all_IDs))
# At most the first 5000 ids; the last line previews the first ten of them.
all_IDs_test = all_IDs[:5000]
all_IDs_test[:10]

In [None]:
def slice_sentence(pub_id):
    """Collect the sentences of one publication that mention one of its dataset labels.

    Reads `<train_files_path>/<pub_id>.json`, splits the text of every section into
    sentences with a regular expression and emits one row per (sentence, label) pair
    in which the label occurs verbatim in the sentence. The labels are the
    `dataset_label` values that `train_df` lists for this publication id.

    Args:
        pub_id: Publication id as a string; also the base name of the JSON file.

    Returns:
        DataFrame with the columns "Section", "Text", "Label" and "Id"
        (no rows when no label is found).
    """
    path_to_pub = os.path.join(train_files_path, (pub_id+'.json'))

    with open(path_to_pub, 'r') as f:
        pub_df = pd.read_json(f)

    # Loop-invariant: look the publication's labels up once instead of once per sentence.
    pub_labels = train_df.loc[train_df['Id'] == pub_id, 'dataset_label'].tolist()

    # Plain lists instead of np.append, which copies the whole array on every call.
    sections_col = []
    sentences_col = []
    labels_col = []

    for _, row in pub_df.iterrows():
        # A sentence starts with a capital letter and runs up to ., ! or ? followed by
        # the next capital letter or the end of the text.
        sentences_in_section = re.findall(r"[A-Z].*?[\.!?]\s?(?=[A-Z]|$)",row["text"])

        for sentence in sentences_in_section:
            for label in pub_labels:
                if label in sentence:
                    sections_col.append(row["section_title"])
                    sentences_col.append(sentence)
                    labels_col.append(label)

    final_df = pd.DataFrame({
        "Section": sections_col,
        "Text": sentences_col,
        "Label": labels_col,
    })
    final_df["Id"] = pub_id

    return final_df

In [None]:
def slice_section(pub_id):
    """Collect the sections of one publication that mention one of its dataset labels.

    Reads `<train_files_path>/<pub_id>.json` and emits one row per (section, label)
    pair in which the label occurs verbatim in the section text. The labels are the
    `dataset_label` values that `train_df` lists for this publication id.

    Args:
        pub_id: Publication id as a string; also the base name of the JSON file.

    Returns:
        DataFrame with the columns "Section", "Text", "Label" and "Id"
        (no rows when no label is found).
    """
    path_to_pub = os.path.join(train_files_path, (pub_id+'.json'))

    with open(path_to_pub, 'r') as f:
        pub_df = pd.read_json(f)

    # Loop-invariant: look the publication's labels up once instead of once per section.
    pub_labels = train_df.loc[train_df['Id'] == pub_id, 'dataset_label'].tolist()

    # Plain lists instead of np.append, which copies the whole array on every call.
    sections_col = []
    text_col = []
    labels_col = []

    for _, row in pub_df.iterrows():
        for label in pub_labels:
            if label in row['text']:
                sections_col.append(row["section_title"])
                text_col.append(row["text"])
                labels_col.append(label)

    final_df = pd.DataFrame({
        "Section": sections_col,
        "Text": text_col,
        "Label": labels_col,
    })
    final_df["Id"] = pub_id

    return final_df

In [None]:
slice_section(all_IDs[13])

In [None]:
# Slice the publication once and reuse the result; each call re-reads its JSON file.
sliced_example = slice_section(all_IDs[13])
print(sliced_example['Text'].values[0])
print(sliced_example['Label'].values[0])

In [None]:
slice_section("83c59eeb-d015-4c4e-8d36-cb2cc6dadbe2")

In [None]:
total_contents = np.array([])

In [None]:
if create_text_ds:
    # Start from a clean file: rows are appended below, so leftovers would be kept
    if os.path.exists(CSV_PATH):
        os.remove(CSV_PATH)

    print("Starting text slicing")

    for i, filename in enumerate(tqdm(all_IDs, desc='Slicing text')):
        file_df = slice_section(filename)

        # Append this publication's rows; only the very first chunk writes the header
        file_df.to_csv(CSV_PATH, mode='a', index=False, header=(i == 0))

In [None]:
clean_df = pd.read_csv(CSV_PATH)

print(clean_df.shape)
clean_df.head(10)

In [None]:
print(clean_df['Text'].values[0])
print(clean_df['Label'].values[0])

In [None]:
def clean_text(txt):
    """Lowercase `txt` and replace every run of non-alphanumeric characters by one space.

    `txt` is converted with str() first, so non-string values are accepted.
    """
    lowered = str(txt).lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered)

In [None]:
# def find_word_in_text(text, word):
#     matches = []
#     if text.find(word) != -1: # When substring is found
#         pattern = re.compile(r"\b(\w*"+re.escape(word)+r"\w*)\b")
        
#         for match in pattern.finditer(text):
#             matches.append((match.start(),match.end()))
            
                           
#     return matches

In [None]:
word = "national education longitudinal study" 
text = "this study used data from. the national education longitudinal studys nels 88 to examine the effects of dual enrollment. programs for high school students on college degree attainment the study also reported whether the impacts of dual enrollment programs were different for first generation college students versus students whose parents had attended at. least national education longitudinal study some college in addition a supplemental analysis reports on the impact of different amounts of dual enrollment course taking and college degree attainment dual enrollment programs offer college level learning experiences for high school students the programs offer college courses and or the opportunity to earn college credits for students while still in high school the intervention group in the study was comprised of nels participants who attended a postsecondary school and who participated in a dual enrollment. program while in high school n 880 the study. author used propensity score mat"

In [None]:
def cut_sentences(text, word, nr_of_sentences):
    """Cut a window of sentences around every sentence of `text` that contains `word`.

    `text` is split with nltk's sentence tokenizer. For each sentence containing `word`
    the window is widened by one sentence on both sides, once for every value the
    countdown from `nr_of_sentences` takes above -1 (i.e. `nr_of_sentences + 1` times
    for a non-negative integer), and clipped to the bounds of the text. The window
    therefore reaches back `nr_of_sentences + 1` sentences and forward
    `nr_of_sentences` sentences from the matching one. The selected sentences are
    joined with single spaces.

    NOTE(review): the window is wider than the parameter name suggests — confirm
    that this is the intended size.

    Args:
        text: Text to search.
        word: Literal substring to look for (case-sensitive).
        nr_of_sentences: Controls the window size as described above.

    Returns:
        dict with two parallel lists:
            "texts":   one joined window per sentence that contains `word`
            "matches": per window, a list of (start, end, "DATASET") tuples with the
                       character offsets, inside that window, of every occurrence of
                       `word` together with the non-whitespace characters attached
                       to it
    """
    sentences = nltk.sent_tokenize(text)
    sentence_count = len(sentences)

    windows = []
    spans_per_window = []

    for position, sentence in enumerate(sentences):
        if sentence.find(word) == -1:
            continue

        # Number of widening steps: one per countdown value greater than -1
        steps = 0
        remaining = nr_of_sentences
        while remaining > -1:
            steps += 1
            remaining -= 1

        # Each step moves both edges outwards by one sentence until they hit a bound
        first = max(position - steps, 0)
        stop = min(position + steps, sentence_count)
        window = ' '.join(sentences[first:stop])

        label_pattern = re.compile(r"\b(\S*"+re.escape(word)+r"\S*)")
        spans = [
            (found.start(), found.end(), "DATASET")
            for found in label_pattern.finditer(window)
        ]

        spans_per_window.append(spans)
        windows.append(window)

    return {"texts": windows, "matches": spans_per_window}

In [None]:
example = cut_sentences(text,word,5)

# Distinct loop names: looping with `text` would overwrite the sample text defined
# above, which a later cell still searches with the same `word`.
for window in example['texts']:
    print(window)

for spans in example['matches']:
    print(spans)

In [None]:
# Expand every cleaned section into one training row per sentence window.
# NOTE: `train_df` is rebound here; from this point on it no longer holds train.csv.
window_records = []

for _, row in tqdm(clean_df.iterrows(), total=len(clean_df)):
    cuts = cut_sentences(row['Text'], row['Label'], SENTENCES_TO_EXTRACT)

    # cuts['texts'] and cuts['matches'] are parallel lists: one entry per window.
    for window, spans in zip(cuts['texts'], cuts['matches']):
        window_records.append({
            'Section': row['Section'],
            'Text': window,
            'Label': row['Label'],
            'Id': row['Id'],
            'dataset_index': spans,
        })

# Build the frame once. DataFrame.append copied the whole frame on every iteration
# (quadratic) and no longer exists in pandas 2.x.
train_df = pd.DataFrame(window_records, columns=list(clean_df.columns) + ['dataset_index'])

In [None]:
train_df.head()

In [None]:
# train_df = train_df.explode('dataset_index')
# train_df = train_df.drop_duplicates()
train_df = train_df.dropna()

In [None]:
train_df = train_df.reset_index(drop=True)
print(train_df.shape)

train_df.head()

In [None]:
# Number of training rows per label, most frequent first.
# The column names are set explicitly: the names produced by
# `value_counts().reset_index()` differ between pandas versions, and the cells below
# read the label from the first column and its count from the second.
df_group = (
    train_df['Label']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='Label')
    .sort_values(by="Label", ascending=False)
)

df_group.head()

In [None]:
print("Dataset with highest number of labels: {}".format(df_group.values[0][0]))

In [None]:
print("Removing {} labels".format(df_group.values[0][1] * DROP_FRAC))

In [None]:
# Thin out the most frequent label to prevent a MemoryError during training:
# the first DROP_FRAC share of its rows is removed.
top_label = df_group.values[0][0]
indices = train_df.index[train_df['Label'] == top_label]
nr_to_drop = int(len(indices) * DROP_FRAC)

train_df.drop(indices[:nr_to_drop], inplace=True)

In [None]:
# Remove texts that are too large for the model to train on.
# A text is removed when its word count exceeds `mean_limit` times the running mean
# word count of all texts visited so far (the current one and removed ones included).
count = 0
mean_limit = 100

words_seen = 0
texts_seen = 0
oversized = []

for idx, txt in train_df['Text'].items():
    nr_words = len(txt.split())  # split once instead of three times per row

    # A running sum replaces the array that was re-copied by np.append for every row.
    words_seen += nr_words
    texts_seen += 1
    limit = (words_seen / texts_seen) * mean_limit

    if nr_words > limit:
        print("Removing row with length {} using limit {}".format(nr_words, limit))
        oversized.append(idx)
        count = count + 1

# Drop everything in one call rather than mutating the frame while iterating over it.
train_df = train_df.drop(index=oversized)

print("Removed {} rows".format(count))

In [None]:
pattern = re.compile(r"\b(\w*"+re.escape(word)+r"\w*)\b")

for match in pattern.finditer(text):
    print(match.start())
    print(match.end())

In [None]:
# Count the spans that equal the placeholder (0, 0, "DATASET") across all rows
missing_indexes = sum(
    1
    for spans in train_df['dataset_index']
    for span in spans
    if span == (0, 0, "DATASET")
)

print("Found "+str(missing_indexes)+" empty indexes")

In [None]:
# Labels that occur on exactly one row of train_df
label_counts = train_df.groupby('Label')['dataset_index'].count().to_frame()
df = label_counts[label_counts['dataset_index'] == 1]

single_labels = df.index.values

print("Found {} single dataset labels".format(len(single_labels)))

df.head()

In [None]:
def duplicate_single_labels(row):
    """Return `row` twice when its label is listed in `single_labels`.

    Args:
        row: Series with a 'Label' entry.

    Returns:
        `row` itself when its label is not in `single_labels`; otherwise a two-column
        DataFrame holding `row` and a copy of it side by side.
    """
    if row['Label'] not in single_labels:
        return row

    twin = row.copy()
    return pd.concat([row, twin], axis=1)

In [None]:
# duplicate_single_labels returns every row as a column (a Series, or a two-column frame
# for duplicated rows); the pieces are placed side by side and transposed so that each
# one becomes a row again, numbered from 0 because of ignore_index=True.
train_df = pd.concat([duplicate_single_labels(row) for _, row in train_df.iterrows()], ignore_index=True, axis=1).T
print(train_df.shape)

train_df.head()

In [None]:
train_df['dataset_index'].values[0][0]

In [None]:
# This code checks whether the dataset label is actually present in the text given the generated dataset indices
# If this is not the case, an assertion error will be raised and the text will not further be processed
for idx, row in tqdm(train_df.iterrows(), total=len(train_df)):
    spans = row['dataset_index']

    # Checked once per row, outside the span loop: inside the loop this check could
    # never fail, because the loop body does not run for an empty list.
    assert len(spans) != 0, "row {} has no dataset indices".format(idx)

    for i in spans:
        # The label must occur between the recorded start (i[0]) and end (i[1]) offsets.
        assert row['Text'].count(row['Label'],
                                 i[0],
                                 i[1]) != 0, "row {}: label not found at {}".format(idx, i)

In [None]:
train_df = train_df.sample(frac=1, random_state=42) # Shuffle the dataset
train_df = train_df.reset_index(drop=True)

In [None]:
train_df.shape

In [None]:
train_df.head()

In [None]:
y = train_df['Label'].to_frame()
X = train_df

In [None]:
X.head()

In [None]:
y.head()

In [None]:
train, validation = train_test_split(X, train_size=TRAIN_SPLIT, random_state=42, stratify=y)
print("Train and validation set generated")

In [None]:
train = train.reset_index(drop=True)

print(train.shape)
train.head()

In [None]:
validation = validation.reset_index(drop=True)

print(validation.shape)
validation.head()

In [None]:
# Share of each label in the training split, largest first (top 25 kept).
# The column names are set explicitly: the names produced by
# `value_counts().reset_index()` differ between pandas versions, and the plot below
# reads the shares from the 'Label' column.
grouped_train = (
    train['Label']
    .value_counts(normalize=True)
    .rename_axis('index')
    .reset_index(name='Label')
    .sort_values(by="Label", ascending=False)
)

grouped_train = grouped_train.iloc[0:25]

grouped_train.head()

In [None]:
# Share of each label in the validation split, largest first (top 25 kept).
# The column names are set explicitly: the names produced by
# `value_counts().reset_index()` differ between pandas versions, and the plot below
# reads the shares from the 'Label' column.
grouped_val = (
    validation['Label']
    .value_counts(normalize=True)
    .rename_axis('index')
    .reset_index(name='Label')
    .sort_values(by="Label", ascending=False)
)

grouped_val = grouped_val.iloc[0:25]

grouped_val.head(25)

In [None]:
# Maximum number of pairs of bars you want
N = 25

# Shares of the N most frequent training labels, and the shares of the *same* labels
# in the validation split. Pairing the two top-N lists by rank can put two different
# labels next to each other; a label absent from validation gets a share of 0.
train_share = train['Label'].value_counts(normalize=True).head(N)
val_share = validation['Label'].value_counts(normalize=True).reindex(train_share.index, fill_value=0)

# Position of bars on x-axis (fewer than N when there are fewer distinct labels)
ind = np.arange(len(train_share))

# Figure size
plt.figure(figsize=(10,5))

# Width of a bar
width = 0.3

# Plotting
plt.bar(ind, train_share.values, width, label='Train distribution')
plt.bar(ind + width, val_share.values, width, label='Validation distribution')

plt.xlabel('Dataset name index')
plt.ylabel('Number of occurances (normalized)')

plt.legend(loc='best')
plt.show()

In [None]:
train.to_csv("./train.csv")

In [None]:
validation.to_csv("./validation.csv")