In [None]:
import pandas as pd
import numpy as np
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
from nltk.stem.wordnet import WordNetLemmatizer
from tqdm import tqdm

%matplotlib inline

In [None]:
# Read train.csv into a DataFrame; the trailing expression displays its
# (rows, columns) shape as the cell output.
train = pd.read_csv('../input/feedback-prize-2021/train.csv')
train.shape

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Bar chart of how many training rows carry each `discourse_type` label,
# drawn on an explicitly created 12x8 axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=train, x='discourse_type', ax=ax)

In [None]:
TEST_PATH = "../input/feedback-prize-2021/test"

def get_test_text(a_id, test_dir=None):
    """Return the full text of one test essay.

    Parameters
    ----------
    a_id : str
        Essay id, i.e. the file name without its ``.txt`` extension.
    test_dir : str, optional
        Directory that holds the essay files. Defaults to ``TEST_PATH``.

    Returns
    -------
    str
        The entire contents of ``<test_dir>/<a_id>.txt``.

    Raises
    ------
    OSError
        If the file does not exist or cannot be read.
    """
    if test_dir is None:
        test_dir = TEST_PATH
    a_file = os.path.join(test_dir, f"{a_id}.txt")
    # Decode explicitly so the result does not depend on the platform's default
    # locale encoding (assumes the essay files are UTF-8 -- TODO confirm).
    with open(a_file, "r", encoding="utf-8") as fp:
        return fp.read()

def create_df_test():
    """Build a sentence-level DataFrame from the essays in ``TEST_PATH``.

    Every ``.txt`` file in ``TEST_PATH`` is read with ``get_test_text`` and
    split into sentences with ``nltk.sent_tokenize``. Each sentence becomes
    one row.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` (essay id, the file name without ``.txt``),
        ``discourse_text`` (the sentence) and ``ids`` (list of consecutive
        integer word positions assigned to that sentence within its essay).
        Empty, with the same columns, when no ``.txt`` file is found.
    """
    # Only real essay files are considered (the old `f[:-4]` also mangled the
    # names of non-.txt entries), and the ids are sorted because os.listdir
    # returns entries in arbitrary order.
    test_ids = sorted(
        os.path.splitext(fname)[0]
        for fname in os.listdir(TEST_PATH)
        if fname.endswith('.txt')
    )

    test_data = []
    for test_id in test_ids:
        text = get_test_text(test_id)
        sentences = nltk.sent_tokenize(text)
        idx = 0  # running word position inside the current essay
        for sentence in sentences:
            # Heuristic for mapping words in sentences to "word indices":
            # whitespace-separated tokens are numbered consecutively across
            # the essay. This is not definitive and might have strong
            # drawbacks and problems.
            n_words = len(sentence.split())
            id_sentence = list(range(idx, idx + n_words))
            idx += n_words
            test_data.append((test_id, sentence, id_sentence))

    df_test = pd.DataFrame(test_data, columns=['id', 'discourse_text', 'ids'])
    return df_test

In [None]:
# Build the sentence-level test frame (columns: id, discourse_text, ids)
# and preview its first rows.
df_test = create_df_test()
df_test.head()

In [None]:
# Serialise each row's list of word indices as one space-separated string.
df_test['predictionstring'] = df_test['ids'].apply(
    lambda word_ids: ' '.join(map(str, word_ids))
)
df_test.head()

In [None]:
# The list-valued `ids` column is no longer needed once `predictionstring` exists.
df_test = df_test.drop(columns=['ids'])

In [None]:
# Preview the test frame after dropping `ids`.
df_test.head()

In [None]:
# Stack the training rows on top of the test rows. `DataFrame.append` was
# deprecated in pandas 1.4 and removed in 2.0, so use `pd.concat` instead.
# ignore_index=True yields a clean 0..n-1 index instead of duplicated labels;
# the cells below only slice this frame positionally.
df = pd.concat([train, df_test], ignore_index=True)

## Preprocessing

In [None]:
_STOPWORD_CACHE = {}  # language name -> frozenset of stop words, filled lazily


def _stopword_set(language="english"):
    """Return NLTK's stop-word list for `language` as a cached frozenset."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def clean_text(text, remove_stopwords=True, stem_words=False, lemma=True):
    """Normalise a piece of text for bag-of-words style vectorisation.

    Steps, in order: lower-case and split on whitespace; optionally drop
    English stop words; expand a few contractions ("can't", "n't", "'ve",
    "'re", "'d", "'ll"); replace punctuation with spaces; remove the literal
    string "0x00"; optionally stem and/or lemmatize each token.

    Parameters
    ----------
    text : object
        Input text; it is passed through ``str()`` first.
    remove_stopwords : bool, default True
        Drop tokens found in NLTK's English stop-word list. The check is done
        on the raw whitespace tokens, before punctuation is stripped.
    stem_words : bool, default False
        Stem every token with ``SnowballStemmer('english')``.
    lemma : bool, default True
        Lemmatize every token as a verb with ``WordNetLemmatizer``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces, without leading or
        trailing whitespace.
    """
    tokens = str(text).lower().split()

    if remove_stopwords:
        # Set lookup, loaded once, instead of re-reading a list on every call.
        stops = _stopword_set("english")
        tokens = [tok for tok in tokens if tok not in stops]

    text = ' '.join(tokens)

    # Contractions must be expanded BEFORE punctuation is stripped: once the
    # apostrophe is gone these patterns can never match. The trailing \b keeps
    # an opening quote ('dog') from being read as a contraction.
    text = re.sub(r"'ve\b", " have ", text)
    text = re.sub(r"\bcan't\b", "cannot ", text)
    text = re.sub(r"n't\b", " not ", text)
    text = re.sub(r"'re\b", " are ", text)
    text = re.sub(r"'d\b", " would ", text)
    text = re.sub(r"'ll\b", " will ", text)

    # Replace punctuation (including brackets and backslash) with a space.
    text = re.sub(r"[-()\"#/<>!@&;*:{}`'+=~%|.?,_\[\]\\]", " ", text)
    text = re.sub(r"0x00", "", text)

    # split() also collapses every run of whitespace left by the substitutions.
    tokens = text.split()

    if stem_words:
        stemmer = SnowballStemmer('english')
        # Keep the stemmed tokens (previously computed and then thrown away).
        tokens = [stemmer.stem(tok) for tok in tokens]

    if lemma:
        lem = WordNetLemmatizer()
        # Keep the lemmatized tokens (previously computed and then thrown away).
        tokens = [lem.lemmatize(tok, "v") for tok in tokens]

    return " ".join(tokens)

In [None]:
%%time
# Run clean_text (default options) over every discourse text, train and test alike.
df['cleaned_text'] = df['discourse_text'].apply(clean_text)

In [None]:
# Raw text of the first row, for comparison with its cleaned version.
df.iloc[0]['discourse_text']

In [None]:
# Output of clean_text for that same first row.
df.iloc[0]['cleaned_text']

## TFIDF Vectorizer

In [None]:
# Word-level TF-IDF limited to 1000 features; the token pattern keeps
# single-character tokens.
# The vectorizer is intentionally NOT fitted here: the following cell calls
# `fit_transform` on the training rows, which re-fits from scratch, so the
# earlier fit on train+test was discarded work and wrongly suggested that
# test text shaped the vocabulary.
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=1000)

In [None]:
# `df` holds the training rows first and the test rows after them, so split
# the cleaned text positionally: fit + transform on train, transform only on test.
n_train = train.shape[0]
cleaned = df['cleaned_text']
train_tfidf = tfidf_vect.fit_transform(cleaned.iloc[:n_train])
test_tfidf = tfidf_vect.transform(cleaned.iloc[n_train:])


In [None]:
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out` and fall back only on older installs.
# Either way `features` is a plain list of the vocabulary terms.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    features = tfidf_vect.get_feature_names_out().tolist()
else:
    features = tfidf_vect.get_feature_names()
features[:20]

## Passive Aggressive Classifier


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier


In [None]:
# 10-fold shuffled cross-validation of a Passive Aggressive classifier on the
# training TF-IDF matrix, scored by accuracy.
cv = KFold(n_splits=10, shuffle=True, random_state=1)
pac = PassiveAggressiveClassifier(n_jobs=-1, random_state=42)
scores = cross_val_score(
    estimator=pac,
    X=train_tfidf,
    y=train['discourse_type'],
    cv=cv,
    scoring='accuracy',
)

In [None]:
# Mean and standard deviation of the per-fold accuracies.
mean_acc = np.mean(scores)
std_acc = np.std(scores)
print('Accuracy: %.3f (%.3f)' % (mean_acc, std_acc))

In [None]:
%%time
# Refit the classifier on the full training matrix for the final predictions.
pac.fit(train_tfidf, train['discourse_type'])

In [None]:
# Predict a discourse type for every test row.
pac_pred = pac.predict(test_tfidf) 

## Submission

In [None]:
# Assemble the submission: essay id, predicted class and word-index string.
# `pac_pred` is attached positionally, one prediction per `df_test` row.
submission_df = pd.DataFrame({
    'id': df_test['id'],
    'class': pac_pred,  # label of y_predict
    'predictionstring': df_test['predictionstring'],
})


In [None]:
# Preview the submission frame before writing it out.
submission_df.head()

In [None]:
# Write the submission to the working directory, without the index column.
submission_df.to_csv('submission.csv', index=False)