The dataset contains argumentative essays written by U.S. students in grades 6-12. The essays were annotated by expert raters for elements commonly found in argumentative writing.

Task: Predict the human annotations. You will first need to segment each essay into discrete rhetorical and argumentative elements (i.e., discourse elements) and then classify each element as one of 7 "discourse types".

For the text EDA, please refer to: https://www.kaggle.com/rachanabisht/evaluatingstudentwriting-complete-text-eda

In [None]:
# import libraries:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


from nltk.corpus import stopwords
from tqdm.notebook import tqdm
#import warnings
#warnings.filterwarnings('ignore')

#lib for model building:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
#from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import xgboost, numpy, textblob, string


#important lib for text processing
import os
from wordcloud import WordCloud, STOPWORDS

import nltk
nltk.download(['punkt', 'wordnet'])
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer

import re
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sqlalchemy import create_engine  
pd.set_option('display.max_colwidth', None)

# A. Getting the DATA:


In [None]:
# Root directory of the competition data; the test-folder path is built from it.
base_path = '/kaggle/input/feedback-prize-2021/'

In [None]:
# Load the annotated training data. The path is derived from base_path (as
# TEST_PATH is) so the data location only has to be changed in one place.
train_df = pd.read_csv(base_path + 'train.csv')

In [None]:
# A look at the annotated training CSV: first three rows, then its (rows, columns) shape.
display(train_df.head(3))
display(train_df.shape)

In [None]:
# Let's define the features and dependent variables.
# Take an explicit copy: a 'cleaned_text' column is added to this frame later,
# and writing into a frame derived from train_df by column selection triggers
# pandas' SettingWithCopyWarning. A copy makes train_text an independent frame.
train_text = train_df[['discourse_text']].copy()
train_text.shape

In [None]:
# Also define the dependent variable as a one-column DataFrame (train_labels)
# and show its shape.
train_labels = train_df[['discourse_type']]
train_labels.shape

In [None]:
# The same target column as a flat NumPy array; this is what the model
# section assigns to y.
train_label = np.array(train_df.discourse_type)
train_label

# B. Data preprocessing:

In [None]:
# define a 'clean_text' function to process the text:
# Cache of stop-word sets keyed by language, filled lazily on first use so the
# NLTK corpus is read once instead of once per cleaned text.
_STOPWORD_CACHE = {}


def _stopword_set(language="english"):
    """Return the NLTK stop-word list for *language* as a cached frozenset."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def clean_text(text, remove_stopwords=True, stem_words=False, lemma=True):
    """Normalise one piece of text for bag-of-words vectorisation.

    Steps, in order:
      1. Convert to ``str``, lower-case and split on whitespace.
      2. Optionally drop NLTK English stop words.
      3. Expand a few contractions ('ve, can't, n't, 're, 'd, 'll).
      4. Replace punctuation, brackets and slashes with spaces and remove
         the literal character sequence ``0x00``.
      5. Optionally stem (Snowball, English) and/or lemmatize each token as
         a verb (WordNet).

    Args:
        text: Input value; anything is accepted and passed through ``str()``.
        remove_stopwords: Drop tokens found in the NLTK English stop-word list.
        stem_words: Apply ``SnowballStemmer('english')`` to every token.
        lemma: Apply ``WordNetLemmatizer().lemmatize(token, "v")`` to every
            token.

    Returns:
        The cleaned tokens joined by single spaces, with no leading or
        trailing whitespace.
    """
    tokens = str(text).lower().split()

    if remove_stopwords:
        stops = _stopword_set("english")
        tokens = [tok for tok in tokens if tok not in stops]

    text = " ".join(tokens)

    # Contractions must be expanded BEFORE punctuation is stripped: the
    # punctuation pattern below removes apostrophes, after which none of
    # these patterns could ever match.
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)

    # Replace punctuation and bracket/slash characters with spaces.
    text = re.sub(r"[-()\"#/<>!@&;*:<>{}`'+=~%|.!?,_]", " ", text)
    text = re.sub(r"\]", " ", text)
    text = re.sub(r"\[", " ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\\", " ", text)
    text = re.sub(r"0x00", "", text)

    # Splitting on whitespace and re-joining at the end collapses every run
    # of spaces, whichever optional steps are enabled.
    tokens = text.split()

    if stem_words:
        # Imported here because the notebook's import cell does not bring
        # SnowballStemmer into scope.
        from nltk.stem import SnowballStemmer

        stemmer = SnowballStemmer('english')
        tokens = [stemmer.stem(tok) for tok in tokens]

    if lemma:
        lem = WordNetLemmatizer()
        tokens = [lem.lemmatize(tok, "v") for tok in tokens]

    return " ".join(tokens)

In [None]:
# Apply clean_text to every discourse segment and store the result in a new
# 'cleaned_text' column:
train_text['cleaned_text'] = train_text.discourse_text.apply(clean_text)

In [None]:
train_text.head(3)

# C. Model 

In [None]:
# Defining X (feature) and y (target variables):
# X is the array of cleaned text strings, y the array of discourse-type labels.
X = train_text['cleaned_text'].values
y =train_label

In [None]:
X

In [None]:
y

In [None]:
# Split the dataset into training and validation sets (default test_size == 0.25).
# A fixed random_state makes the split, and therefore the reported scores,
# reproducible between runs. Stratifying on y keeps the class proportions the
# same in both parts, so every discourse type appears in each of them.
X_train, X_vtest, y_train, y_vtest = train_test_split(
    X, y, random_state=42, stratify=y
)

In [None]:
# Label-encode the target variable.
# The encoder is fitted on the training labels only and that same mapping is
# reused for the validation labels. Refitting on the validation split could
# assign different integers to the same class whenever the two splits do not
# contain an identical set of labels, and it would also change the mapping
# later used by encoder.inverse_transform.
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_vtest = encoder.transform(y_vtest)

In [None]:
y_train

In [None]:
# Instantiate vectorizers and classifier (all with library defaults):
#   vect  - converts each text into token counts
#   tfidf - re-weights those counts with TF-IDF
#   clf   - random forest trained on the TF-IDF features
vect = CountVectorizer()
tfidf = TfidfTransformer()
clf = RandomForestClassifier()

In [None]:
# Fit the CountVectorizer on the training texts and convert them to token counts:
X_train_counts = vect.fit_transform(X_train)

In [None]:
# Fit the TF-IDF transformer on the training counts and transform them:
X_train_tfidf = tfidf.fit_transform(X_train_counts)

In [None]:
# Train the random forest classifier on the TF-IDF features and encoded labels:
model = clf.fit(X_train_tfidf, y_train)

In [None]:
# Predict on validation data. The vectorizer and TF-IDF transformer fitted on
# the training split are reused here with transform() only (no refitting).
X_vtest_counts = vect.transform(X_vtest)
X_vtest_tfidf = tfidf.transform(X_vtest_counts)
y_pred = model.predict(X_vtest_tfidf)

In [None]:
# define a function to display the results:
def display_results(y_vtest, y_vpred):
    """Print the label set, confusion matrix and accuracy of a validation run.

    Args:
        y_vtest: True labels of the validation samples (array-like).
        y_vpred: Predicted labels, same length and order as ``y_vtest``.

    Returns:
        None. The results are printed.
    """
    y_true = np.asarray(y_vtest)
    y_hat = np.asarray(y_vpred)

    # Build the label set from BOTH arrays. Using only the predicted labels
    # would drop every class the model never predicts, and confusion_matrix
    # ignores samples whose true label is not in `labels`.
    labels = np.union1d(y_true, y_hat)
    confusion_mat = confusion_matrix(y_true, y_hat, labels=labels)
    accuracy = (y_hat == y_true).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)

In [None]:
# Report the validation results for the random forest predictions:
display_results(y_vtest, y_pred)

# Preprocess the TEST data: 
We have the following zip files:

test.zip - folder of individual .txt files, with each file containing the full text of an essay response in the test set

sample_submission.csv - file in the required format for making predictions - note that if you are making multiple predictions for a document, submit multiple rows

credits: https://www.kaggle.com/nehapawar/tfidf-random-forest-classifier/comments

In [None]:
# Process the test data into sentence strings paired with their word indices.
# TEST_PATH is the folder that holds one '<essay id>.txt' file per test essay.
TEST_PATH = base_path + 'test/'

def get_test_text(a_id):
    """Return the full text of the test essay with id *a_id*.

    Args:
        a_id: Essay id, i.e. the file name inside TEST_PATH without '.txt'.

    Returns:
        The whole file content as one string.

    Raises:
        FileNotFoundError: If TEST_PATH contains no '<a_id>.txt'
            (propagated from ``open``).
    """
    # os.path.join avoids the doubled '/' that plain string formatting
    # produces, since TEST_PATH already ends with a separator.
    a_file = os.path.join(TEST_PATH, f"{a_id}.txt")
    # Decode explicitly so the result does not depend on the platform's
    # default encoding. NOTE(review): assumes the essay files are UTF-8.
    with open(a_file, "r", encoding="utf-8") as fp:
        return fp.read()

def create_df_test():
    """Build a sentence-level DataFrame from every essay file in TEST_PATH.

    Each essay is split into sentences with ``nltk.sent_tokenize``. Every
    sentence becomes one row carrying the essay id, the sentence text and the
    list of "word indices" that the sentence covers within its essay.

    Returns:
        DataFrame with columns ``id`` (essay id), ``discourse_text`` (one
        sentence) and ``ids`` (list of consecutive ints). Rows are ordered by
        essay id, then by sentence position.
    """
    # Keep only '.txt' entries and strip the extension to get the essay id
    # (e.g. '0FB0700DAF44.txt' -> '0FB0700DAF44'). Sorting makes the row
    # order deterministic; os.listdir returns entries in arbitrary order.
    test_ids = sorted(
        os.path.splitext(fname)[0]
        for fname in os.listdir(TEST_PATH)
        if fname.endswith('.txt')
    )

    test_data = []
    for test_id in test_ids:
        text = get_test_text(test_id)
        sentences = nltk.sent_tokenize(text)
        idx = 0  # running word index within the current essay
        for sentence in sentences:
            # Heuristic for mapping words in sentences to "word indices":
            # words are counted by whitespace splitting and numbered
            # consecutively across the essay's sentences. This is not
            # definitive and might have strong drawbacks and problems.
            n_words = len(sentence.split())
            test_data.append(
                (test_id, sentence, list(range(idx, idx + n_words)))
            )
            idx += n_words

    return pd.DataFrame(test_data, columns=['id', 'discourse_text', 'ids'])

In [None]:
# Build the sentence-level test frame and preview its first rows:
df_test = create_df_test()
df_test.head()

In [None]:
# Join each sentence's word indices into one space-separated string
# (the 'predictionstring' column that goes into the submission):
df_test['predictionstring'] = df_test['ids'].apply(lambda x: ' '.join([str(i) for i in x]))
df_test.head()

In [None]:
# The index lists are now captured in 'predictionstring'; drop the 'ids' column.
df_test = df_test.drop('ids', axis=1)

In [None]:
df_test.head()

In [None]:
# Apply the same clean_text preprocessing that was used on the training text:
df_test['cleaned_text'] = df_test.discourse_text.apply(clean_text)

In [None]:
df_test.head(3)

In [None]:
# Cleaned test sentences as an array, ready for the vectorizer:
test = df_test['cleaned_text'].values

In [None]:
test

In [None]:
# Apply the CountVectorizer fitted on the training data (transform only, no refit):
test_count = vect.transform(test)


In [None]:
# Apply the TF-IDF weighting fitted on the training data (transform only, no refit):
test_tfidf_vec = tfidf.transform(test_count)

In [None]:
# Apply the trained classifier to predict on test data.
# The model was trained on label-encoded targets, so these predictions are
# encoded integers until encoder.inverse_transform is applied.
y_final_pred = model.predict(test_tfidf_vec)

# submission:

In [None]:
# Assemble the submission frame in one step: essay id, predicted class
# (still label-encoded at this point) and the word-index string per sentence.
submission_df = pd.DataFrame({
    'id': df_test['id'],
    'class': y_final_pred,  # label of y_final_predict
    'predictionstring': df_test['predictionstring'],
})

In [None]:
submission_df.head()

In [None]:
# Work on a copy so submission_df keeps the encoded predictions:
submission_df_2 = submission_df.copy()


In [None]:
# Reverse the LabelEncoder mapping so 'class' holds the original
# discourse-type labels instead of the encoded integers:
submission_df_2['class'] = encoder.inverse_transform(submission_df['class'])

In [None]:
submission_df_2.head()

In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
submission_df_2.to_csv("submission.csv", index=False)