In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Turn off Jedi-backed tab completion for this IPython session.
%config Completer.use_jedi = False

In [None]:
import os
import shutil

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
# Register tqdm's pandas integration; `progress_apply` is used later in this notebook.
tqdm.pandas()

In [None]:
# File names (not full paths) found in the train and test directories.
# os.listdir returns entries in arbitrary order, so sort them to make every
# loop over these lists -- and the row order of the submission -- reproducible.
files_train = sorted(os.listdir('/kaggle/input/feedback-prize-2021/train'))
files_test = sorted(os.listdir('/kaggle/input/feedback-prize-2021/test'))

In [None]:
# Load the training annotations. Columns used later in this notebook:
# id, discourse_type, discourse_start, discourse_end, predictionstring.
df = pd.read_csv('/kaggle/input/feedback-prize-2021/train.csv')

In [None]:
# Discourse types ordered from least to most frequent: value_counts() sorts by
# descending count by default and the resulting list is then reversed.
# NOTE(review): this list is not referenced anywhere else in the visible notebook.
label_ordered_list = df['discourse_type'].value_counts().index.tolist()[::-1]

In [None]:
# char_len: discourse_end minus discourse_start for each annotation, as int.
span_chars = df['discourse_end'] - df['discourse_start']
df['char_len'] = span_chars.astype(int)

# word_len: number of whitespace-separated tokens in predictionstring.
tokens_per_row = df['predictionstring'].str.split()
df['word_len'] = tokens_per_row.apply(len)

In [None]:
# For every annotated text, compute the fraction of its characters covered by
# annotations: sum of the char_len of its rows divided by the length of the
# raw text file.
# Rows are collected as plain dicts and turned into a DataFrame once, instead
# of building and concatenating one single-row DataFrame per text.
ratio_records = []
for txt_id, txt in tqdm(df.groupby('id')):

    # The groupby key is the text id, which is also the file stem.
    txt_file = f"/kaggle/input/feedback-prize-2021/train/{txt_id}.txt"

    with open(txt_file, 'r') as file:
        txt_data = file.read()

    len_lbls = txt['char_len'].sum()
    len_txt = len(txt_data)
    # An empty file would make the division meaningless; score it 0.0 so it
    # can never pass a positive coverage threshold.
    ratio = len_lbls / len_txt if len_txt else 0.0

    ratio_records.append({'id': txt_id, 'ratio': ratio})

# Explicit columns keep the frame well-formed even when there are no groups.
label_ratio = pd.DataFrame(ratio_records, columns = ['id', 'ratio'])

In [None]:
# Keep only the texts whose annotation coverage ratio exceeds 0.80.
is_well_covered = label_ratio['ratio'] > 0.80
files_to_keep = label_ratio.loc[is_well_covered, 'id']

# Select the rows of the kept ids (in the order of files_to_keep) and restore
# 'id' as a regular column.
df_clean = (
    df.set_index('id')
      .loc[files_to_keep]
      .reset_index()
      .copy()
)

In [None]:
# Attach to every row the number of whitespace-separated words in its text.
# Each file is read once into an id -> word-count dict and the column is then
# filled with a single map, instead of one label-based `.loc` write per id on
# a non-unique index.
txt_len_by_id = {}
for idx in tqdm(df_clean['id'].unique()):
    txt_file_path = f'/kaggle/input/feedback-prize-2021/train/{idx}.txt'
    with open(txt_file_path, 'r') as file:
        text_data = file.read()
    txt_len_by_id[idx] = len(text_data.split())

# Every id in the column has an entry in the dict, so the result is a plain
# integer column with no missing values.
df_clean['txt_len'] = df_clean['id'].map(txt_len_by_id)

In [None]:
def _relative_positions(row):
    """Return the row's word indices divided by the word count of its text.

    `predictionstring` is split on whitespace and parsed as integers;
    `txt_len` is the per-row word count stored on the frame.
    """
    word_indices = np.array(row['predictionstring'].split(), dtype = int)
    return word_indices / row['txt_len']


df_clean['predictionarray'] = df_clean.progress_apply(_relative_positions, axis = 1)

In [None]:
# One row per word: explode the per-annotation arrays of relative positions.
df_clean = df_clean[['predictionarray', 'discourse_type']].explode('predictionarray')
# explode() leaves the exploded column with object dtype; cast it to float so
# the model receives a genuinely numeric feature column.
df_clean['predictionarray'] = df_clean['predictionarray'].astype(float)
df_clean['discourse_type'] = df_clean['discourse_type'].astype('category')

In [None]:
# Feature matrix: a one-column frame holding the relative word position.
X = df_clean.loc[:, ['predictionarray']]
# Target: the discourse type attached to each row.
y = df_clean.loc[:, 'discourse_type']

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the single position feature, solved with lbfgs.
# `multi_class` is deliberately not passed: it is deprecated in recent
# scikit-learn releases, and with a non-liblinear solver and more than two
# classes the default ('auto', scikit-learn >= 0.22) already fits the
# multinomial model.
# NOTE(review): assumes discourse_type has more than two classes -- confirm.
model = LogisticRegression(solver='lbfgs', verbose = 1, n_jobs = 4)

In [None]:
# Fit the classifier on X (the 'predictionarray' column) against y (the 'discourse_type' column).
model.fit(X, y)

In [None]:
import matplotlib.pyplot as plt

# Sweep the relative position over [0, 1] in 1001 steps and plot, for each
# point, the index of the most probable class.
X_test = np.linspace(0, 1, num = 1001)[:, np.newaxis]
class_probs = model.predict_proba(X_test)
y_test = np.argmax(class_probs, axis = -1)

# Tick labels come from the categories of the training target.
# NOTE(review): assumes the column order of predict_proba matches
# y.cat.categories -- confirm against model.classes_.
labels = y.cat.categories
plt.plot(X_test, y_test)
plt.yticks(range(len(labels)), labels);

In [None]:
# Build the submission: for every test text, predict a class for each word
# from its relative position and emit one row per class holding all word
# indices assigned to that class (space-separated).
sub = []
for txt_file in files_test:

    txt_file_path = f'/kaggle/input/feedback-prize-2021/test/{txt_file}'
    with open(txt_file_path, 'r') as file:
        text_data = file.read()
    txt_len = len(text_data.split())

    # A text without words has nothing to predict and an undefined position
    # feature, so skip it.
    if txt_len == 0:
        continue

    # Use the same feature as in training: zero-based word index / word count.
    # Using `(index + 1) / txt_len` here would shift every test position by
    # one word relative to the feature the model was fitted on.
    X_test = np.expand_dims(np.arange(txt_len) / txt_len, axis = 1)
    y_test = model.predict_proba(X_test).argmax(axis = -1)

    # splitext removes only the final extension, so an id that itself
    # contains a dot is not truncated.
    txt_id = os.path.splitext(txt_file)[0]

    # Collect plain records; the frame is built once after the loop.
    for i, label in enumerate(labels):
        sub.append({
            'id': txt_id,
            'class': label,
            'predictionstring': ' '.join(np.where(y_test == i)[0].astype(str)),
        })


# Explicit columns keep the frame well-formed even when no rows were produced.
submission = pd.DataFrame(sub, columns = ['id', 'class', 'predictionstring'])
# Drop classes that received no words in a given text.
submission = submission[submission['predictionstring'] != '']

In [None]:
# Write the submission to submission.csv without the index column, then
# display the frame as the cell output.
submission.to_csv('submission.csv', index = False)
submission