In [None]:
# import the libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

In [None]:
# Load train.csv into a DataFrame and preview the first few rows.
train_data = pd.read_csv("../input/feedback-prize-2021/train.csv")

train_data.head()

In [None]:
# Summarise column dtypes, non-null counts and memory usage.
train_data.info()

# Preprocessing

In [None]:
# Plot how many rows belong to each discourse class. This shows the total
# number of classes and whether the classes are imbalanced.
fig, ax = plt.subplots(figsize=(12, 5))
sns.histplot(train_data['discourse_type'], ax=ax)
ax.set_title('Number of rows per discourse type')
ax.set_xlabel('discourse_type')
ax.set_ylabel('count');

In [None]:
# The class counts differ widely, so the dataset is highly imbalanced.
# The helper below returns the exact number of rows for each label.

def count_target(target_list, data=None):
    """Count how many rows carry each requested ``discourse_type`` label.

    Parameters
    ----------
    target_list : iterable of str
        Labels to count.
    data : pandas.DataFrame, optional
        Frame with a ``discourse_type`` column. Defaults to the notebook-level
        ``train_data`` when omitted.

    Returns
    -------
    dict
        ``{label: count}`` in the order of ``target_list``; a label that does
        not occur in the data maps to 0.
    """
    if data is None:
        data = train_data
    # Tally all labels in one pass instead of filtering the frame per label.
    label_counts = data['discourse_type'].value_counts()
    return {label: int(label_counts.get(label, 0)) for label in target_list}

# The seven discourse labels to tally.
target_list = [
    'Lead',
    'Position',
    'Evidence',
    'Claim',
    'Concluding Statement',
    'Counterclaim',
    'Rebuttal',
]
count_target(target_list)

In [None]:
# Keep only the columns used in the rest of the notebook. `.copy()` makes the
# result an independent frame, so adding columns to it later does not raise
# pandas' SettingWithCopyWarning.
train_data = train_data[['id','discourse_text','discourse_type','predictionstring']].copy()
train_data.head()

1. From the table we can see that this is a classification problem: based on the text, the model has to assign each segment to one of the 'discourse_type' classes.
2. We need to build a classifier that distinguishes among these classes.
3. Before that we need to preprocess the text: convert it to lower case and remove the punctuation.
4. Initially we proceed without removing the stopwords, as they may play an important role in identifying the context of a sentence.
5. We are going to use a TF-IDF vectorizer to convert the text documents into vectors.
6. Steps to be involved.
    1. remove punctuations and change to lower case
    2. tokenize the document
    3. find the tfidf vectors
    4. split dataset into train and validation
    5. train the model on the train dataset so that it can classify among the various classes, using a few classification algorithms.
    6. make prediction on validation dataset and check accuracy of model.
    7. preprocess test dataset
    8. make prediction on test dataset.
    
    

In [None]:
import re
import string
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

In [None]:
# Helper that replaces everything except ASCII letters with spaces and
# lower-cases the text. Stop words are intentionally left in place.
def cleanup_text(text):
    """Return ``text`` with every non-letter replaced by a space, lower-cased.

    Each character outside ``a-z``/``A-Z`` becomes a single space (runs of
    spaces are not collapsed). Stop words are kept; filter them out of the
    returned string if stop-word removal is required.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    return letters_only.lower()

In [None]:
# Clean every discourse segment with the helper and show the resulting Series.
text_processed = train_data['discourse_text'].map(cleanup_text)
text_processed

In [None]:
# Attach the cleaned text to the training frame as a new column.
train_data['text_processed'] = text_processed

In [None]:
# Preview the frame, now including the 'text_processed' column.
train_data.head()

In [None]:
# Label encoding is intentionally disabled: the string labels in 'discourse_type' are used directly as the target. The numeric mapping below is kept for reference only.
#train_data['discourse_type'] = train_data['discourse_type'].map({'Lead':0,'Position':1, 'Evidence':2, 
#                                                 'Claim':3, 'Concluding Statement':4,
#                                    'Counterclaim':5,'Rebuttal':6 })
#
#train_data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the cleaned text (features) from the discourse label (target).
X_features = train_data['text_processed']
Y_target = train_data['discourse_type']
print('feature and target length: ', len(X_features), len(Y_target))

# Hold out 20% of the rows for validation. The classes are heavily imbalanced,
# so stratify on the label to give both splits the same class proportions.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_features,
    Y_target,
    test_size=0.2,
    random_state=123,
    shuffle=True,
    stratify=Y_target,
)
len(X_train), len(X_val), len(Y_train), len(Y_val)

In [None]:
# Display the target Series (the 'discourse_type' labels).
Y_target

In [None]:
# Check that the train and validation splits follow the same label
# distribution as the full dataset. Every panel gets its own title so the
# three histograms can be told apart.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
panels = [
    (train_data['discourse_type'], "value counts of full dataset"),
    (Y_train, "value counts of train dataset "),
    (Y_val, "value counts of validation dataset"),
]
for ax, (labels, title) in zip(axes, panels):
    sns.histplot(labels, bins=15, ax=ax)
    ax.set_title(title)
    # Rotate the class names so the longer ones stay readable.
    ax.tick_params(axis='x', rotation=90)
fig.tight_layout();

All three label distributions (the original dataset, the train split and the validation split) are similar, so we can go ahead with training.

In [None]:
# Vectorise the text so it can be used for training and validation.
# TF-IDF weighting is applied on top of the bag-of-words vocabulary.
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
tfidf = TfidfVectorizer(max_features=None)

# The vocabulary is learned from the training split only. fit() returns the
# vectoriser itself, so `bagofwords` and `tfidf` name the same fitted object.
bagofwords = tfidf.fit(X_train)

# Preview a handful of (term -> column index) entries instead of dumping the
# whole vocabulary into the cell output.
dict(list(bagofwords.vocabulary_.items())[:10])

In [None]:
# Turn both splits into TF-IDF matrices using the vectoriser fitted on the
# training text.
bagofwords_tfm = bagofwords.transform(X_train)
bagofwords_tfm_val = tfidf.transform(X_val)

print(len(bagofwords.vocabulary_))   # number of terms in the vocabulary
print(bagofwords_tfm.shape)          # matrix shape for the train split
print(bagofwords_tfm_val.shape)      # matrix shape for the validation split

In [None]:
# Print the text representation of the transformed training matrix.
print(bagofwords_tfm)

# Build Classification model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

In [None]:
# Generalise model training behind a helper, `many_model`, so that different
# classification algorithms can be tried with the same code.
import time
import datetime
def many_model(X, Y, classifier=None):
    """Fit a classifier on ``X``/``Y`` and report how long training took.

    Parameters
    ----------
    X : feature matrix accepted by the classifier's ``fit`` method.
    Y : target labels aligned with the rows of ``X``.
    classifier : estimator with a ``fit(X, Y)`` method, optional
        Model to train, e.g. ``LogisticRegression(max_iter=1000)``,
        ``BernoulliNB()`` or ``MultinomialNB()``. When omitted, a
        ``RandomForestClassifier(n_estimators=100)`` is used.

    Returns
    -------
    models : dict
        ``{'model': fitted_classifier}``.
    training_time : dict
        ``{fitted_classifier: elapsed_seconds}`` where ``elapsed_seconds`` is
        the wall-clock duration of ``fit`` as a float.
    """
    models = {}
    training_time = {}

    if classifier is None:
        classifier = RandomForestClassifier(n_estimators = 100)
    start_time = time.time()
    classifier.fit(X,Y)
    # timedelta's first positional argument is *days*, so the elapsed time
    # must be passed explicitly as seconds.
    elapsed_time = datetime.timedelta(seconds=time.time() - start_time)
    models.update({'model': classifier})
    training_time.update({classifier: elapsed_time.total_seconds()})

    return models, training_time
     

In [None]:
# Score fitted models on the validation split.
def evaluate_models(models, X_val, Y_val):
    """Compute validation metrics for every model in ``models``.

    Each model's name is printed, its ``predict`` is called on ``X_val``, and
    the predictions are compared with ``Y_val``.

    Returns a dict mapping each model name to
    ``[accuracy, macro F1, macro precision, macro recall]``.
    """
    results = {}
    for name, estimator in models.items():
        print(name)
        predicted = estimator.predict(X_val)

        accuracy = accuracy_score(Y_val, predicted)
        macro_metrics = [
            metric(Y_val, predicted, average='macro')
            for metric in (f1_score, precision_score, recall_score)
        ]
        results[name] = [accuracy, *macro_metrics]
    return results
        

In [None]:
# Train on the TF-IDF matrix of the training split, then display the returned
# model dict and timing dict.
models, training_time = many_model(bagofwords_tfm,Y_train)

models, training_time

In [None]:
# Show the fitted classifier stored under the 'model' key.
models['model']

In [None]:
# Score the trained model on the validation split.
evaluate_models(models,bagofwords_tfm_val, Y_val )

# Test data processing and making predictions

1. We have to make a prediction for each sentence.
2. The important steps are therefore splitting each essay into sentences and making a prediction on each of them.
3. We also need to build the 'predictionstring', which is the sequence of word indices covered by each sentence.

In [None]:
# Directory that holds the test essays; the files are read into a DataFrame below.
test_file_path = "../input/feedback-prize-2021/test"


In [None]:
# Build the test DataFrame: one row per sentence of every test essay, together
# with the indices of the words that sentence covers.

# Only consider .txt files and strip the extension properly. Sorting makes the
# row order independent of the arbitrary order returned by os.listdir.
test_ids = sorted(
    os.path.splitext(file_name)[0]
    for file_name in os.listdir(test_file_path)
    if file_name.endswith('.txt')
)

test_data = []
for test_id in test_ids:
    path = os.path.join(test_file_path, test_id + '.txt')
    # An explicit encoding keeps the read independent of the platform default.
    with open(path, 'r', encoding='utf-8') as txt:
        text = txt.read()

    # Word indices run continuously across all sentences of one essay.
    next_word_index = 0
    for sentence in nltk.sent_tokenize(text):
        n_words = len(sentence.split())
        word_ids = list(range(next_word_index, next_word_index + n_words))
        next_word_index += n_words
        test_data.append((test_id, sentence, word_ids))

test_df = pd.DataFrame(test_data, columns=['id', 'Discourse_text', 'prediction_str'])

In [None]:
# Preview the sentence-level test frame.
test_df.head()

In [None]:
# Store the word indices as one space-separated string, the same format as
# the train dataset's predictionstring, then drop the intermediate list column.
test_df['predictionstring'] = test_df['prediction_str'].apply(
    lambda word_ids: " ".join(map(str, word_ids))
)
test_df.drop(columns='prediction_str', inplace=True)
test_df.head()

In [None]:
# Preprocess the test dataset: clean the sentences with the cleanup_text
# helper and add the cleaned text as a new column.
test_processed = test_df['Discourse_text'].map(cleanup_text)
test_df['text_processed'] = test_processed
test_df.head()

In [None]:
# Select the cleaned test text and transform it with the fitted `tfidf`
# vectoriser so it can be passed to the model.
test_feature = test_df['text_processed']
bagofwords_tfm_test = tfidf.transform(test_feature)

In [None]:
# Predict a discourse class for every test sentence with the fitted model.
test_predictions = models['model'].predict(bagofwords_tfm_test)

# No decoding step is needed here: the target passed to training was the
# 'discourse_type' column itself (the numeric label mapping is disabled).
test_predictions

In [None]:
# Wrap the predictions in a one-column DataFrame and show the first 20 rows.
test_pred= pd.DataFrame(test_predictions,columns=['prediction'])
test_pred[0:20]

In [None]:
# form the dataframe including id, class and prediction string
#test_df['prediction'] = pd.DataFrame([test_predictions]) # added as a column to test_df 
#test_df.drop(['text_processed','Discourse_text'],axis=1, inplace=True) # unrequired columns are removed
#test_df.head()

In [None]:
# Assemble the submission frame with the id, class and predictionstring columns.
submission = pd.DataFrame({
    'id': test_df['id'],
    'class': test_pred['prediction'],
    'predictionstring': test_df['predictionstring'],
})

submission

In [None]:
# Write the submission frame to submission.csv without the index column.
submission.to_csv('submission.csv',index=False)