In [None]:
# libraries
import os
import numpy as np 
import pandas as pd 
pd.options.mode.chained_assignment = None
from tqdm.notebook import tqdm
tqdm.pandas()
import seaborn as sns
import matplotlib.pyplot as plt

# sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# nltk
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer, LancasterStemmer

In [None]:
# files
# List the entries of the input data directory so the available files are visible.
os.listdir("../input/feedback-prize-2021")

In [None]:
# Load the sample submission file and preview its first rows.
submission = pd.read_csv("../input/feedback-prize-2021/sample_submission.csv")
submission.head()

In [None]:
# Load train.csv and preview it; later cells use its id, discourse_text and
# discourse_type columns.
train_df = pd.read_csv('../input/feedback-prize-2021/train.csv')
train_df.head()

In [None]:
# Number of rows per discourse_type (normalize=False keeps raw counts, not fractions).
train_df.discourse_type.value_counts(normalize=False)

In [None]:
# Directories that create_full_text_dataframe reads the raw .txt documents from.
train_dir = "../input/feedback-prize-2021/train"
test_dir = "../input/feedback-prize-2021/test"

In [None]:
def create_full_text_dataframe(train=True) -> pd.DataFrame:
    """Load the full text of every document into a DataFrame.

    Parameters
    ----------
    train : bool, default True
        If True, read ``<id>.txt`` from ``train_dir`` for every distinct id in
        ``train_df``. If False, read every file found in ``test_dir`` and derive
        the id by stripping the last four characters (the extension) from the
        filename.

    Returns
    -------
    pd.DataFrame
        Columns ``id`` and ``text``, one row per document.
    """
    if train:
        # train_df can list the same id on many rows. Reading each document once
        # keeps this frame at one row per id, so a later merge on "id" does not
        # multiply rows, and avoids re-reading the same file over and over.
        doc_ids = list(pd.unique(train_df.id))
        filepaths = [os.path.join(train_dir, f"{doc_id}.txt") for doc_id in doc_ids]
    else:
        filenames = os.listdir(test_dir)
        doc_ids = [str(filename).strip()[:-4] for filename in filenames]
        filepaths = [os.path.join(test_dir, filename) for filename in filenames]

    texts = []
    for filepath in tqdm(filepaths):
        # Context manager closes the handle even if read() fails.
        with open(filepath, 'r') as handle:
            texts.append(handle.read())

    return pd.DataFrame(data={"id": doc_ids, "text": texts})

In [None]:
# Read the training documents referenced by train_df into an id/text DataFrame.
df_train = create_full_text_dataframe()

In [None]:
# Read every file in test_dir into an id/text DataFrame.
df_test = create_full_text_dataframe(train=False)

In [None]:
# Attach each document's full text to every train_df row that carries its id.
# Only id/text are taken from the left side and ids are de-duplicated first, so
# the join is strictly one-to-many: no row of train_df is emitted more than once,
# and re-running this cell gives the same frame. validate= raises if that
# assumption is ever violated.
df_train = (
    df_train[["id", "text"]]
    .drop_duplicates(subset="id")
    .merge(train_df, on="id", how="inner", validate="one_to_many")
)

#### Building a baseline model to classify the discourse_type of each of the sentences (discourse_text) in the complete text.

A complete solution will also require segmenting the text into sentences/discourses, computing the indices of the words inside each discourse text (relative to the full text), and then classifying each discourse_text (using this baseline model).

In [None]:
# Keep only the columns the classifier needs. The explicit copy makes df an
# independent frame, so adding columns to it later never writes into (or warns
# about) a view of df_train.
df = df_train[["id", "discourse_text", "discourse_type"]].copy()

In [None]:
# lemmatize (or stem?)

#lemmatizer = WordNetLemmatizer()
#df['discourse_text'] = df['discourse_text'].progress_apply(lambda text: ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(text)]))

In [None]:
# Encode the string discourse types as integer class ids
encoder = LabelEncoder()
encoder.fit(df["discourse_type"])
labels = encoder.transform(df["discourse_type"])

# store the integer codes next to the text they belong to
df.loc[:, "label"] = labels

In [None]:
# train/test splitting
# A fixed random_state makes the split (and every metric computed from it)
# reproducible across kernel restarts. Stratifying on the label keeps the class
# proportions of the full data in the small 2% hold-out.
X_train, X_test, y_train, y_test = train_test_split(
    df.discourse_text,
    df.label,
    test_size=0.02,
    random_state=42,
    stratify=df.label,
)

In [None]:
# Text classifier: binary 1-2 gram counts -> TF-IDF weighting -> Multinomial Naive Bayes

bow_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2), binary=True)
tfidf_weighting = TfidfTransformer(norm='l2', use_idf=True)
nb_classifier = MultinomialNB(alpha=0.1)

multinomialNB = Pipeline(steps=[
    ('vect', bow_vectorizer),
    ('tfidf', tfidf_weighting),
    ('clf', nb_classifier),
])

# fit all three stages on the training split
multinomialNB.fit(X_train, y_train)

In [None]:
# prediction on test data
# Predicted integer label codes for the held-out split (same encoding as y_test).
y_test_pred = multinomialNB.predict(X_test)

In [None]:
# Results on test set
# NOTE: with exactly one label per sample, micro-averaged precision and recall
# are both equal to plain accuracy, so the two numbers below always coincide.
print("\nTest Precision:", metrics.precision_score(y_test, y_test_pred, average='micro'))
print("\nTest Recall:", metrics.recall_score(y_test, y_test_pred, average='micro'))

# Pass the encoder's classes so the report rows show discourse type names
# instead of integer codes; `labels` pins the row order to the encoder's order
# even if a class happens to be missing from the hold-out split.
print(
    "\nClassification Report:\n",
    metrics.classification_report(
        y_test,
        y_test_pred,
        labels=np.arange(len(encoder.classes_)),
        target_names=list(encoder.classes_),
    ),
)

In [None]:
def plot_confusion_matrix(y_test, y_scores, class_names):
    """Plot a row-normalised confusion matrix as an annotated heatmap.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_scores : array-like
        Predicted labels (passed to ``metrics.confusion_matrix`` as predictions).
    class_names : sequence
        Tick labels, one per class, in the row/column order of the matrix.

    Notes
    -----
    Each row is divided by its total, so cell (i, j) is the fraction of samples
    whose true class is i that were predicted as j. Rows with no samples are
    shown as zeros instead of NaN.
    """
    num_class = len(class_names)
    cm = metrics.confusion_matrix(y_test, y_scores)
    if cm.shape[0] < num_class:
        # At least one class occurs in neither y_test nor y_scores, so the matrix
        # came out smaller than the tick labels. Rebuild it with an explicit label
        # list; this assumes integer-encoded labels 0..num_class-1 (as produced by
        # LabelEncoder).
        cm = metrics.confusion_matrix(y_test, y_scores, labels=np.arange(num_class))

    # normalize each row by its total; a zero total is replaced by 1 so an empty
    # row yields 0/1 = 0 rather than a division-by-zero NaN
    row_totals = cm.sum(axis=1, keepdims=True)
    con = cm / np.where(row_totals == 0, 1, row_totals)
    con = con[:num_class, :num_class]

    plt.figure(figsize=(10,8))
    sns.set(font_scale=1.0) # for label size
    ax = sns.heatmap(con, annot=True, fmt=".2", cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    # confusion_matrix puts true labels on rows and predictions on columns
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    ax.set_title("Confusion matrix (row-normalised)")
    plt.show()

In [None]:
# np.unique returns the label codes in sorted order, so inverse_transform yields
# the class names in the same order as the encoded labels.
plot_confusion_matrix(y_test, y_test_pred, encoder.inverse_transform(np.unique(labels)))

In [None]:
# create df_test: a record from each discourse within test text docs
def expand_df_test(df: pd.DataFrame = None) -> pd.DataFrame:
    """Split every document into sentences, producing one row per sentence.

    Parameters
    ----------
    df : pd.DataFrame, optional
        Frame with ``id`` and ``text`` columns. Defaults to the notebook-level
        ``df_test``, looked up when the function is called.

    Returns
    -------
    pd.DataFrame
        Columns ``id`` (document id), ``text`` (one sentence from
        ``nltk.sent_tokenize``) and ``predictionstring`` (list of consecutive
        integer word positions; the count restarts at 0 for each document and
        continues across that document's sentences).
    """
    if df is None:
        # Resolve the default at call time: a default bound in the signature is
        # frozen when the cell defining this function runs and goes stale if
        # df_test is rebuilt afterwards.
        df = df_test

    records = []
    for doc_id, text in zip(df.id, df.text):
        next_word_idx = 0  # running word position within the current document
        for sentence in nltk.sent_tokenize(text):
            # NOTE(review): positions come from whitespace-splitting each sentence;
            # this presumably lines up with text.split() only when sentence
            # boundaries fall on whitespace -- confirm against the tokenizer.
            n_words = len(sentence.split())
            word_ids = list(range(next_word_idx, next_word_idx + n_words))
            next_word_idx += n_words
            records.append((doc_id, sentence, word_ids))

    return pd.DataFrame(records, columns=['id', 'text', 'predictionstring'])

In [None]:
# One row per sentence of every test document (uses the function's default frame).
test = expand_df_test()

In [None]:
# prediction on test data
# Classify each sentence; "class" holds the encoded integer label at this point.

test["class"] = multinomialNB.predict(test.text)

In [None]:
# Map the integer codes back to discourse type names. The column is replaced
# outright rather than written through .loc[:, ...]: that form tries to store
# the strings inside the existing integer column, which recent pandas releases
# flag as an incompatible-dtype assignment.
test["class"] = encoder.inverse_transform(test["class"].values)

In [None]:
# submission
# Work on an explicit copy so the column rewrite below never touches (or warns
# about) a view of `test`, whose predictionstring column keeps its lists.
result = test[["id", "class", "predictionstring"]].copy()
# Serialise each list of word positions as a space-separated string.
result['predictionstring'] = result['predictionstring'].apply(
    lambda word_ids: ' '.join(str(i) for i in word_ids)
)

In [None]:
# Preview the first rows of the submission frame before writing it out.
result.head()

In [None]:
# Write the submission file to the working directory, without the index column.
result.to_csv("submission.csv", index=False)