In [138]:
import pandas as pd
import ast

import sklearn
from sklearn.model_selection import train_test_split

import random

## 1. Loading the data

In [2]:
data = pd.read_csv('goodnotes_submission.csv')

In [21]:
print("TotalCount:",len(data))
data.head(5)

TotalCount: 679


Unnamed: 0,submission_id,submission_title,submission_selftext,submission_link_flair_text,reply_body,all_text,reply_body_list
0,aglcrj,Goodnotes 4 vs. Goodnotes 5 right now,I have used Goodnotes 4 for work a ton. And I...,,"[""I'm getting a ton of bugs with 5 as well (sn...",Goodnotes 4 vs. Goodnotes 5 right nowI have us...,[I'm getting a ton of bugs with 5 as well (sna...
1,agoowm,The bundle is available !,,,['Thank you. I have been waiting!\n\n&amp;#x20...,The bundle is available !The bundle is availab...,[Thank you. I have been waiting!\n\n&amp;#x200...
2,agprta,Finally... GoodNotes 5! Did I miss anything in...,,Review,['Man i imagined you would be a lot more popul...,Finally... GoodNotes 5! Did I miss anything in...,[Man i imagined you would be a lot more popula...
3,agpzxb,What happened to the pen (Goodnotes 5)?,I just got Goodnotes 5 and I was so excited fo...,,"[""Have you tried the ball pen? That was the cl...",What happened to the pen (Goodnotes 5)?I just ...,[Have you tried the ball pen? That was the clo...
4,agq8qv,Non Apple Pencil styluses on GOodnotes 5?,I've been using a Wacom Bamboo stylus with Goo...,,['According to the [review at Macstories](http...,Non Apple Pencil styluses on GOodnotes 5?I've ...,[According to the [review at Macstories](https...


## 2. Converting reply_body

In [18]:
data['reply_body_list']=data['reply_body'].apply(ast.literal_eval)

In [19]:
data['reply_body_list']

0      [I'm getting a ton of bugs with 5 as well (sna...
1      [Thank you. I have been waiting!\n\n&amp;#x200...
2      [Man i imagined you would be a lot more popula...
3      [Have you tried the ball pen? That was the clo...
4      [According to the [review at Macstories](https...
                             ...                        
674    [&gt;if she is dishonest about being a virgin ...
675    [I updated to beta 2 yesterday and have been u...
676    [Sex, Upvoted you. Please upvote me back, Ive ...
677    [Something I also wish to know. I usually put ...
678    [I’m intrigued., I've kinda given up on buying...
Name: reply_body_list, Length: 679, dtype: object

In [47]:
data['submission_selftext'] = data['submission_selftext'].fillna("")

## 3. Combining text in all_text

In [26]:
def firstNComments(commentList, n=3):
    m = len(commentList)
    if m==0:
        return ""
    
    return " ".join(commentList[:min(n,m)])



In [60]:
data['all_text'] = data.apply(lambda x: " ".join([ \
     x['submission_title'], \
     x['submission_selftext'], \
     firstNComments(x['reply_body_list'], n=4) \
    ] ),axis=1)

In [64]:
data['all_text'][0]

"Goodnotes 4 vs. Goodnotes 5 right now I have used Goodnotes 4 for work a ton.  And I do mean a ton.  I bought and downloaded 5 yesterday, transfer was easy.  I realized I can't use the Mac App with 5.  Right?  Changes I made in 5 aren't synced to the desktop app.  Also I kept getting a syncing error in 5.  What was it syncing with?  I am going to continue to play with it but I am not sure I feel comfortable diving in yet. I'm getting a ton of bugs with 5 as well (snappy lines, no response at times) and for some reason the ability to sync to google drive and to download multiple files at once from google drive is gone.\n\n I think they need some time to cope with the new launch. Goodnotes 5 is not yet compatible with the desktop app as it says in the release notes. There will be more features added in the near future.  I haven't downloaded GN5 yet but watched a walkthrough and I did see that there's a snap option to check and uncheck, I think somewhere in pen options. Hope that helps! 

In [65]:
data['all_text'][2]

'Finally... GoodNotes 5! Did I miss anything in this video? Tried to point out all the "new" features.  Man i imagined you would be a lot more popular in this sub. Love your channel :)  Thank you :)'

## 4. Split into labeled and unlabeled dataframes

In [70]:
labelled_df = data[data['submission_link_flair_text'].notna()]
unlabelled_df = data[data['submission_link_flair_text'].isna()]

In [72]:
print("Labeled size:",len(labelled_df))
print("Unlabeled size:",len(unlabelled_df))

Labeled size: 241
Unlabeled size: 438


## 5. Combining all Question tags

In [81]:
labelled_df['submission_link_flair_text'].unique()

array(['Review', 'Question - iPad', 'Question - Other', 'Question - Mac',
       'Stylus problems', 'Templates', 'Question - iPhone'], dtype=object)

In [84]:
labelled_df['submission_link_flair_text'] = labelled_df['submission_link_flair_text'].apply(lambda x: "Question" if "Question" in x else x)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labelled_df['submission_link_flair_text'] = labelled_df['submission_link_flair_text'].apply(lambda x: "Question" if "Question" in x else x)


In [85]:
labelled_df['submission_link_flair_text'].unique()

array(['Review', 'Question', 'Stylus problems', 'Templates'], dtype=object)

## 7. Split the dataset into Training and TestSet

In [99]:
all_tag_df = labelled_df[['all_text','submission_link_flair_text']] \
    .rename(columns={"all_text":"input_text","submission_link_flair_text":"target_text"})

In [127]:
train_tag_df,val_tag_df=train_test_split(all_tag_df, test_size=0.2,random_state=42)

In [119]:
print("Training size",len(train_tag_df))
print("Validation size",len(val_tag_df))

Training size 192
Validation size 49


## 8. Exporting Training Data

In [120]:
train_tag_df.to_csv('singletask_noupsampling_train.csv',index=False)
val_tag_df.to_csv('singlatask_noupsampling_val.csv',index=False)
unlabelled_df.to_csv('unlabelled_df.csv',index=False)

## 9. Upsample

In [197]:
train_tag_df.groupby(by='target_text').count().reset_index()

Unnamed: 0,target_text,input_text
0,Question,141
1,Review,15
2,Stylus problems,3
3,Templates,33


In [199]:
review_size = sum(train_tag_df['target_text']=='Review')
stylus_size = sum(train_tag_df['target_text']=='Stylus problems')
template_size = sum(train_tag_df['target_text']=='Templates')

Randomly sampling desired number of samples

In [201]:
random_state = 12321

upsampled_reviews = train_tag_df[train_tag_df['target_text']=='Review'] \
    .sample(review_size*3,replace=True, random_state=random_state)

upsampled_problems = train_tag_df[train_tag_df['target_text']=='Stylus problems'] \
    .sample(stylus_size*3,replace=True, random_state=random_state)

upsampled_templates = train_tag_df[train_tag_df['target_text']=='Templates'] \
    .sample(template_size*2,replace=True, random_state=random_state)


Concatinating upsampled data with original train_tag_df, and reshuffling

In [202]:
train_tag_df_extended = pd.concat([train_tag_df,upsampled_reviews,upsampled_problems, upsampled_templates]) \
    .sample(frac=1, random_state=random_state).reset_index(drop=True)

Verifying upsampled results

In [203]:
train_tag_df_extended.groupby(by='target_text').count().reset_index()

Unnamed: 0,target_text,input_text
0,Question,141
1,Review,60
2,Stylus problems,12
3,Templates,99


## 10. Exporing the expanded df

In [204]:
train_tag_df_extended.to_csv('singletask_train.csv', index=False)