In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import math, gc, re, warnings
from nltk.tokenize import sent_tokenize
from wordcloud import WordCloud, STOPWORDS
from tqdm.auto import tqdm
from os import listdir
from os.path import isfile, join
warnings.filterwarnings("ignore")

# EDA

In [None]:
# Load the labelled discourse segments. read_csv already returns a DataFrame,
# so the extra pd.DataFrame(...) wrapper was redundant.
df = pd.read_csv('../input/feedback-prize-2021/train.csv')
df

In [None]:
# Rename first so every derived feature below reads the final column names.
# rename() silently ignores keys that no longer exist, so this cell can be
# re-run without failing on the already-renamed 'discourse_text' column.
df = df.rename(columns={'discourse_type':'class',
                        'discourse_text':'text',
                        'discourse_start':'start_loc',
                        'discourse_end':'end_loc'})

# 1-based position of each row within its essay id (in row order)
df['sentence_num'] = df.groupby('id').cumcount() + 1
# number of rows belonging to the same essay id
df['total_sentences'] = df.groupby('id')['sentence_num'].transform('max')
# relative position in the essay, rounded to 2 decimals and scaled by 10
df['sentence_location'] = round(df.sentence_num/df.total_sentences, 2) * 10
# length of each segment in space-separated tokens and in characters
df['words_num'] = df['text'].str.split(' ').str.len()
df['char_num'] = df['text'].str.len()

df.head()

### Discourse by the numbers

In [None]:
# Value counts of the 'class' and 'discourse_type_num' columns, side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
for column, axis in (('class', ax1), ('discourse_type_num', ax2)):
    counts = df[column].value_counts()
    counts.plot.bar(title=column, ax=axis)
plt.tight_layout()

In [None]:
# One total_sentences value per essay id, shown as a histogram
sentences_per_essay = df.groupby('id')['total_sentences'].first()
sentences_per_essay.plot.hist(grid=True, title='Total sentences frequency')
plt.tight_layout()

In [None]:
# Histograms of where segments start and end (character offsets)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 3))
location_panels = (
    ('start_loc', 'start location', ax1),
    ('end_loc', 'end location', ax2),
)
for column, panel_title, axis in location_panels:
    df[column].plot.hist(grid=True, bins=20, title=panel_title, ax=axis)
plt.tight_layout()

In [None]:
# Word-count histograms, one panel per discourse class
fig, ((ax1,ax2),(ax3,ax4),(ax5,ax6),(ax7,ax8)) = plt.subplots(4,2, figsize=(15,10))

# `types` fixes the class -> panel order and is reused by the following cells
types = df['class'].unique().tolist()
# only seven of the eight panels are assigned; ax8 is left empty
axs = [ax1, ax2, ax3, ax4, ax5, ax6, ax7]


def plot_length(discourse):
    """Draw the histogram of space-separated token counts for one class.

    `discourse` is a value of the 'class' column; the panel is picked by the
    position of that value in the module-level `types` list.
    """
    panel = axs[types.index(discourse)]
    class_text = df.loc[df['class'] == discourse, 'text']
    token_counts = class_text.str.split(' ').str.len()
    token_counts.plot.hist(
        grid=True,
        bins=20,
        title=f'{discourse} text length frequency: number of words',
        ax=panel,
    )


for discourse_type in types:
    plot_length(discourse_type)

plt.tight_layout()

In [None]:
# Character-count histograms, one panel per discourse class
fig, ((ax1,ax2),(ax3,ax4),(ax5,ax6),(ax7,ax8)) = plt.subplots(4,2, figsize=(15,10))

# only seven of the eight panels are assigned; ax8 is left empty
axs = [ax1, ax2, ax3, ax4, ax5, ax6, ax7]


def sentence_number(discourse):
    """Draw the histogram of text lengths (in characters) for one class.

    Relies on the module-level `types` list (defined in an earlier cell) to
    choose the panel.
    """
    panel = axs[types.index(discourse)]
    char_counts = df.loc[df['class'] == discourse, 'text'].str.len()
    char_counts.plot.hist(
        grid=True,
        bins=20,
        title=f'{discourse} character number frequency',
        ax=panel,
    )


for discourse_type in types:
    sentence_number(discourse_type)

plt.tight_layout()

In [None]:
# Histograms of the within-essay position (sentence_num), one panel per class
fig, ((ax1,ax2),(ax3,ax4),(ax5,ax6),(ax7,ax8)) = plt.subplots(4,2, figsize=(15,10))

# only seven of the eight panels are assigned; ax8 is left empty
axs = [ax1, ax2, ax3, ax4, ax5, ax6, ax7]


# NOTE: this reuses (and replaces) the name `plot_length` from an earlier cell.
def plot_length(discourse):
    """Draw the histogram of `sentence_num` values for one class.

    Relies on the module-level `types` list (defined in an earlier cell) to
    choose the panel.
    """
    panel = axs[types.index(discourse)]
    positions = df.loc[df['class'] == discourse, 'sentence_num']
    positions.plot.hist(
        grid=True,
        bins=20,
        title=f'{discourse} sentence number frequency',
        ax=panel,
    )


for discourse_type in types:
    plot_length(discourse_type)

plt.tight_layout()

In [None]:
# Histograms of the relative essay position (sentence_location), one panel per class
fig, ((ax1,ax2),(ax3,ax4),(ax5,ax6),(ax7,ax8)) = plt.subplots(4,2, figsize=(15,10))

# only seven of the eight panels are assigned; ax8 is left empty
axs = [ax1, ax2, ax3, ax4, ax5, ax6, ax7]


# NOTE: this reuses (and replaces) the name `sentence_number` from an earlier cell.
def sentence_number(discourse):
    """Draw the histogram of `sentence_location` values for one class.

    Relies on the module-level `types` list (defined in an earlier cell) to
    choose the panel.
    """
    panel = axs[types.index(discourse)]
    locations = df.loc[df['class'] == discourse, 'sentence_location']
    locations.plot.hist(
        grid=True,
        bins=20,
        title=f'{discourse} essay location frequency',
        ax=panel,
    )


for discourse_type in types:
    sentence_number(discourse_type)

plt.tight_layout()

### Word clouds

In [None]:
stopwords = set(STOPWORDS)

# Lower-case every whitespace-separated token of every segment and join them
# once. A single join avoids repeatedly extending one ever-growing string
# inside the loop, and str(val) keeps non-string cells (e.g. NaN) from raising.
words = " ".join(
    token.lower()
    for val in df['text']
    for token in str(val).split()
)

wordcloud = WordCloud(width = 800, height = 800,
                background_color ='white',
                stopwords = stopwords,
                min_font_size = 10).generate(words)

# plot word cloud
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis('off')
plt.title('Text word cloud')
plt.tight_layout(pad = 0)

plt.show()

In [None]:
# Extra stop words picked from the overall word cloud, added to WordCloud's defaults
corpus_stop_words = [
    'student', 'students',
    'school', 'schools',
    'people',
    'teacher', 'teachers',
]
stop_words = corpus_stop_words + list(STOPWORDS)

In [None]:
def wc(discourse):
    """Show a word cloud built from the text of one discourse class.

    Parameters
    ----------
    discourse : str
        Value of the 'class' column whose rows are included.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Notes
    -----
    Reads the module-level ``df`` and ``stop_words``.
    """
    stopwords = set(stop_words)

    # .loc[mask, column] selects rows and column in one step (no chained indexing)
    class_text = df.loc[df['class'] == discourse, 'text']

    # Lower-case every whitespace-separated token and join once, instead of
    # extending one ever-growing string inside the loop.
    words = " ".join(
        token.lower()
        for val in class_text
        for token in str(val).split()
    )

    wordcloud = WordCloud(width = 800, height = 800,
                    background_color ='white',
                    stopwords = stopwords,
                    min_font_size = 10).generate(words)

    # plot word cloud
    plt.figure(figsize = (8, 8), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.title(f'{discourse} word cloud')
    plt.tight_layout(pad = 0)

    plt.show()

wc('Claim')

In [None]:
wc('Evidence')

In [None]:
wc('Position')

In [None]:
wc('Concluding Statement')

In [None]:
wc('Lead')

In [None]:
wc('Counterclaim')

In [None]:
wc('Rebuttal')

# Training/testing basic model

In [None]:
from sklearn.model_selection import train_test_split

# Baseline setup: predict the class from the raw segment text only
features, target = df['text'], df['class']

# default test fraction, fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    features, target, random_state=42
)

print(X_train.shape,X_test.shape)

In [None]:
# create and test model/pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
# from sklearn.linear_model import PassiveAggressiveClassifier
# from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

# unigram + bigram counts fed to a linear model
vectorizer = CountVectorizer(ngram_range=(1,2))
# SGD shuffles the training data; fixing the seed makes the reported scores
# reproducible across runs.
model = SGDClassifier(random_state=42)

pipe = Pipeline([
    ('vectorizer', vectorizer),
    ('model', model)
])

pipe.fit(X_train, y_train)

# pipe.score reports accuracy on the held-out split
test_score = pipe.score(X_test, y_test)
pred = pipe.predict(X_test)
f1 = f1_score(y_test, pred, average='micro')

print('test score:', test_score)
print('F1 score:', f1)

Split-test scores for the classifiers tried in the cell above:

PassiveAggressiveClassifier: .59


RandomForestClassifier: .64


SGDClassifier: .69

## Create test set

In [None]:
# get list of file names
test_folder = '../input/feedback-prize-2021/test'
filenames = [f for f in listdir(test_folder) if isfile(join(test_folder, f))]

# column-wise accumulator: one list entry per tokenised sentence
# ('sentence_num' is never filled here; that column is derived from test_df below)
all_data = {'id':[], 'text':[], 'start_loc':[], 'end_loc':[], 'start_word':[], 
            'end_word':[], 'sentence_num':[], 'predictionstrings':[]}

# loop through file names
for i in tqdm(range(len(filenames))):
    
    path = join(test_folder, filenames[i]) #assemble file path

    # the with-block closes the file on exit, so no explicit close() is needed
    with open(path, 'r') as f:
        text = f.read() #read in text

    essay_id = filenames[i].rsplit('.', 1)[0] #file name without its extension

    split_text = sent_tokenize(text) #split text into sentences
    
    string_word = 0 #index of the next word within this essay
    string_loc = 0 #running character offset within this essay

    # loop through sentences and append data to dict lists
    for sentence in split_text:
        # Split on any run of whitespace (not only single spaces) so that
        # newlines and repeated spaces do not distort the word count.
        n_words = len(sentence.split())

        all_data['id'].append(essay_id)
        all_data['text'].append(sentence)

        # Word span of this sentence. end_word is INCLUSIVE and derived from
        # the word counter; the previous code added the word count to the
        # character offset (string_loc), which produced wrong indices.
        all_data['start_word'].append(string_word)
        all_data['end_word'].append(string_word + n_words - 1)
        string_word += n_words
        
        # Character span. NOTE(review): this assumes exactly one separator
        # character between consecutive sentences, so offsets drift when the
        # essay has blank lines or repeated spaces -- confirm whether exact
        # offsets matter for the start_loc/end_loc features.
        all_data['start_loc'].append(string_loc)
        string_loc += len(sentence) + 1
        all_data['end_loc'].append(string_loc)

# create testing DF
test_df = pd.DataFrame(data={'id':all_data['id'], 
                             'text':all_data['text'], 
                             'start_loc':all_data['start_loc'], 
                             'end_loc':all_data['end_loc'],
                             'start_word':all_data['start_word'],
                             'end_word':all_data['end_word']})

# calculate and add predictionstring column: the space-separated word indices
# start_word..end_word (both inclusive)
all_data['predictionstrings'] = [
    ' '.join(str(word_idx) for word_idx in range(start, end + 1))
    for start, end in zip(all_data['start_word'], all_data['end_word'])
]

test_df['predictionstring'] = all_data['predictionstrings']

# same positional/length features as engineered for the training frame
test_df['sentence_num'] = test_df.groupby('id').cumcount() + 1
test_df['total_sentences'] = test_df.groupby('id')['sentence_num'].transform('max')
test_df['sentence_location'] = round(test_df.sentence_num/test_df.total_sentences, 2) * 10
test_df['words_num'] = test_df['text'].str.split(' ').str.len()
test_df['char_num'] = test_df['text'].str.len()

test_df

# Making predictions

In [None]:
# Classify every test sentence with the fitted pipeline and keep the
# three columns that make up the submission frame
preds = pipe.predict(test_df['text'])
test_df['class'] = preds

submit_columns = ['id', 'class', 'predictionstring']
submit_df = test_df[submit_columns]
# submit_df.to_csv('submission.csv', index=False)
submit_df

**This model scored 0.145**

# Checking predictions 

In [None]:
# Compare class frequencies: submission predictions vs. held-out split
# predictions vs. the training labels
fig, (ax1,ax2,ax3) = plt.subplots(1,3, figsize=(18,4))
submit_df['class'].value_counts().plot.bar(title='Submission prediction classes', ax=ax1)
pd.Series(pred).value_counts().plot.bar(title='Split-test prediction classes', ax=ax2)
df['class'].value_counts().plot.bar(title='Training classes', ax=ax3)
plt.tight_layout()

Okay, so too many "claim" predictions in the submission set. Let's look at the words again.

In [None]:
# # training words

# stop_words.append('')
# def bar_charter(classification, ax):
#     all_text = ' '.join(df.loc[df.discourse_type==classification].discourse_text.str.lower().tolist())
#     all_words = [word for word in all_text.split(' ') if word not in stop_words]
#     pd.Series(all_words).value_counts().head(20).plot.bar(title=f'{classification} word frequency', ax=ax)
    
# fig, (ax1, ax2) = plt.subplots(1,2, figsize=(12,4))
    
# bar_charter('Claim', ax1)
# bar_charter('Evidence', ax2)

In [None]:
# # submission words

# stop_words.append('')
# def bar_charter(classification, ax):
#     all_test_text = ' '.join(test_df.loc[test_df['class']==classification].text.str.lower().tolist())
#     all_test_words = [word for word in all_test_text.split(' ') if word not in stop_words]
#     pd.Series(all_test_words).value_counts().head(20).plot.bar(title=f'{classification} word frequency', ax=ax)

    
# fig, (ax1, ax2) = plt.subplots(1,2, figsize=(12,4))
    
# bar_charter('Claim', ax1)
# bar_charter('Evidence', ax2)

There's a lot of overlap here. "will", "one", "help", and "make" are very present in both the "Claim" and "Evidence" classifications.

# Retesting
From the EDA earlier, some classes seem to occur more frequently at certain locations within an essay. Let's add some location and length features!

In [None]:
from sklearn.model_selection import train_test_split

# Text plus the positional / length columns engineered during EDA
feature_columns = [
    'start_loc',
    'end_loc',
    'text',
    'sentence_num',
    'total_sentences',
    'sentence_location',
    'words_num',
    'char_num',
]
features = df[feature_columns]
target = df['class']

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=.2, random_state=42
)

print(X_train.shape,y_train.shape)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier, LogisticRegression, PassiveAggressiveClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

# numeric columns that are scaled and joined with the bag-of-words features
numeric_columns = [
    'start_loc',
    'end_loc',
    'sentence_num',
    'total_sentences',
    'sentence_location',
    'words_num',
    'char_num'
]

# functions to select numeric and text data by column name
get_text_data = FunctionTransformer(lambda x: x['text'], validate=False)
get_numeric_data = FunctionTransformer(lambda x: x[numeric_columns], validate=False)

scaler = StandardScaler()

vectorizer = CountVectorizer(ngram_range=(1,2))
# NOTE: `transformer` is created but not placed in the pipeline below, so the
# text features are raw n-gram counts, not tf-idf weights.
transformer = TfidfTransformer()
# fixed seed so the reported score is reproducible across runs
model = SGDClassifier(random_state=42)


# create pipeline to process and join features
pipe = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('scaler', scaler)
            ])),
             ('text_features', Pipeline([
                ('selector', get_text_data),
                ('vectorizer', vectorizer),
            ]))
         ])),
    ('clf', model)
])

# train model
pipe.fit(X_train, y_train)

# test model
pred = pipe.predict(X_test)
f1 = f1_score(y_test, pred, average='micro')

print('F1 score:', f1)

Much better! But how well will it work with the actual test data?

In [None]:
# Classify the test sentences using the same columns the pipeline was trained on
test_feature_columns = [
    'start_loc',
    'end_loc',
    'text',
    'sentence_num',
    'total_sentences',
    'sentence_location',
    'words_num',
    'char_num',
]
test_features = test_df[test_feature_columns]

preds = pipe.predict(test_features)
test_df['class'] = preds

submit_columns = ['id', 'class', 'predictionstring']
submit_df = test_df[submit_columns]

# submit_df.to_csv('submission.csv', index=False)
submit_df

In [None]:
# Compare class frequencies: submission predictions vs. held-out split
# predictions vs. the training labels
fig, (ax1,ax2,ax3) = plt.subplots(1,3, figsize=(18,4))
submit_df['class'].value_counts().plot.bar(title='Submission prediction classes', ax=ax1)
pd.Series(pred).value_counts().plot.bar(title='Split-test prediction classes', ax=ax2)
df['class'].value_counts().plot.bar(title='Training classes', ax=ax3)
plt.tight_layout()

Looks a little better, though the model is clearly classifying "Position" sentences as "Rebuttal".


**This model with added features scored .08 (even worse)**


Something is still causing the test set trouble. 

# Unclassified text?

Could there be segments of the training essays that are not classified? That might be throwing things off.

Let's see if we can find out.

In [None]:
# Per essay: end offset of its last row minus the summed length of all its
# rows' text; a positive gap means some characters belong to no segment
last_segment_end = df.groupby('id')['end_loc'].last()
labelled_chars = df.groupby('id')['char_num'].sum()
(last_segment_end - labelled_chars).head(25)

It looks like some characters are unaccounted for: the labelled segments do not cover the full text of every essay.

In [None]:
# filenames = df['id'].unique().tolist()

# train_path = '../input/feedback-prize-2021/train/'

# all_text = {}

# for i in tqdm(range(len(filenames))):
    
#     path = train_path + filenames[i] + '.txt'
    
#     with open(path, 'r') as f:
#         text = f.read()
#     f.close()
    
#     all_text[filenames[i]] = text
             
# len(all_text)

In [None]:
# test_dict = all_text

# unused_sentences = []
# ids = []
# for index, row in df.iterrows():
#     text = test_dict[row['id']][int(row['start_loc']):int(row['end_loc'])]
#     test_dict[row['id']] = test_dict[row['id']].replace(text, '*'*len(text))

#     for x in filter(None, test_dict[row['id']].split('*')): 
#         if x not in unused_sentences:
#             if len(x.split(' ')) > 5:
#                 unused_sentences.append(x)
#                 ids.append(row['id'])

# unused_df = pd.DataFrame({'id':ids,'unused_sentences':unused_sentences})
# unused_df

In [None]:
# unused_df = pd.DataFrame({'id':ids, 'text':unused_sentences, 'class':'no_class'})
# unused_df['text'] = unused_df['text'].replace('\s+', ' ', regex=True)
# unused_df = unused_df.loc[unused_df['text'] != ' ']
# unused_df

In [None]:
# Text that falls outside every labelled segment, prepared offline
# (read_csv already returns a DataFrame, so no wrapper is needed)
unused_df = pd.read_csv('../input/text-with-no-class/no_class.csv')

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way (original index labels kept, columns aligned by name).
all_classes_df = pd.concat([df[['id','text','class']], unused_df])
# all_classes_df.to_csv('no_class.csv')

In [None]:
# select features/target and split data
features = all_classes_df['text']
target = all_classes_df['class']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, 
                                                    target, 
                                                    random_state=42)


# create and test model/pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import PassiveAggressiveClassifier
# from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

# tf-idf weighted unigrams + bigrams
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# the classifier shuffles the training data; fixing the seed makes the
# reported scores reproducible across runs
model = PassiveAggressiveClassifier(random_state=42)

pipe = Pipeline([
    ('vectorizer', vectorizer),
    ('model', model)
])

pipe.fit(X_train, y_train)

# pipe.score reports accuracy on the held-out split
test_score = pipe.score(X_test, y_test)
pred = pipe.predict(X_test)
f1 = f1_score(y_test, pred, average='micro')

print('test score:', test_score)
print('F1 score:', f1)

In [None]:
# Classify the test sentences from their text alone with the current `pipe`
test_features = test_df['text']

preds = pipe.predict(test_features)
test_df['class'] = preds

submit_columns = ['id', 'class', 'predictionstring']
submit_df = test_df[submit_columns]

# submit_df.to_csv('submission.csv', index=False)
submit_df

In [None]:
# Compare class frequencies: submission predictions vs. held-out split
# predictions vs. the training labels.
# NOTE: the third panel reads df, which does not contain the 'no_class' rows
# that were added to all_classes_df.
fig, (ax1,ax2,ax3) = plt.subplots(1,3, figsize=(18,4))
submit_df['class'].value_counts().plot.bar(title='Submission prediction classes', ax=ax1)
pd.Series(pred).value_counts().plot.bar(title='Split-test prediction classes', ax=ax2)
df['class'].value_counts().plot.bar(title='Training classes', ax=ax3)
plt.tight_layout()

In [None]:
# submit_df['class'] = np.where(submit_df['class']=='no_class', 'Evidence', submit_df['class'])
# Drop sentences predicted as 'no_class' so they are left out of the submission
submit_df = submit_df.loc[submit_df['class'] != 'no_class']

# Re-plot the class frequencies after the filter
fig, (ax1,ax2,ax3) = plt.subplots(1,3, figsize=(18,4))
submit_df['class'].value_counts().plot.bar(title='Submission prediction classes', ax=ax1)
pd.Series(pred).value_counts().plot.bar(title='Split-test prediction classes', ax=ax2)
df['class'].value_counts().plot.bar(title='Training classes', ax=ax3)
plt.tight_layout()

In [None]:
# Write the current submit_df to submission.csv, omitting the DataFrame index
submit_df.to_csv('submission.csv', index=False)