## Dividing the dataset into training and test sets (splitting by user, not by post)

In [1]:
import pandas as pd
import numpy as np
# Seed NumPy's global random state so the random user sampling done later in
# the notebook is repeatable from a fresh kernel.
seed = 23
np.random.seed(seed)

# Load the tab-separated token table.
df = pd.read_csv("../tokens.csv", sep='\t')

# Bare expression: lets the notebook render the frame as the cell output.
df

Unnamed: 0,User,Post_Nr,Raw,Stemmed,Lemmatized,Label
0,3450,0,"sports betting number k in debt, feeling very ...","sport bet number k in debt , feel veri depress...","sports betting number k in debt , feeling very...",1
1,3450,1,finally accepted that you cannot win gambling ...,final accept that you can not win gambl relaps...,finally accepted that you can not win gambling...,1
2,3450,2,blocking software betfilter has anybody used t...,"block softwar betfilt has anybodi use this , w...",blocking software betfilter has anybody used t...,1
3,3450,3,prone to relapse when in debt? i find that whe...,prone to relaps when in debt ? i find that whe...,prone to relapse when in debt ? i find that wh...,1
4,3450,4,down to my last number on credit card i am num...,down to my last number on credit card i am num...,down to my last number on credit card i am num...,1
...,...,...,...,...,...,...
1071857,6377,22,moltres raid number people number number numbe...,moltr raid number peopl number number number a...,moltres raid number people number number numbe...,0
1071858,6377,23,number number number lv number,number number number lv number,number number number lv number,0
1071859,6377,24,sudowoodo number number number,sudowoodo number number number,sudowoodo number number number,0
1071860,6377,25,i like it shiny tho,i like it shini tho,i like it shiny tho,0


In [2]:
# Percentage of posts by class.
print(df['Label'].value_counts(normalize=True) * 100)

# Percentage of users by class. A user's label is the mean of that user's
# post labels. Only the `Label` column is aggregated: averaging the whole
# frame would also try to average the text columns, which raises a TypeError
# on pandas >= 2.0 and is wasted work on older versions.
user_labels = df.groupby("User")["Label"].mean()
print(user_labels.value_counts(normalize=True) * 100)

0    94.876673
1     5.123327
Name: Label, dtype: float64
0.0    93.015332
1.0     6.984668
Name: Label, dtype: float64


In [3]:
# Number of posts by class (absolute counts, not percentages).
print(df['Label'].value_counts())

# Number of users by class. A user's label is the mean of that user's post
# labels. Only the `Label` column is aggregated: averaging the whole frame
# would also try to average the text columns, which raises a TypeError on
# pandas >= 2.0 and is wasted work on older versions.
user_labels = df.groupby("User")["Label"].mean()
print(user_labels.value_counts())

0    1016947
1      54915
Name: Label, dtype: int64
0.0    2184
1.0     164
Name: Label, dtype: int64


In [4]:
def undersampleUsers(df):
    """Balance the two classes at user level by undersampling control users.

    Every user that has at least one row with ``Label == 1`` is kept. From the
    users that have at least one row with ``Label == 0``, a random subset of
    the same size is drawn without replacement using NumPy's global random
    state.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with (at least) a ``User`` and a ``Label`` column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(pos_df, ctrl_df)``: all rows of the positive users, and all rows of
        the sampled control users. Row order and index of ``df`` are kept.
    """
    user_col = df['User']
    label_col = df['Label']

    positive_ids = user_col[label_col == 1].unique()
    control_ids = user_col[label_col == 0].unique()

    # Draw exactly as many control users as there are positive users.
    sampled_control_ids = np.random.choice(
        control_ids,
        size=len(positive_ids),
        replace=False,
    )

    positive_rows = df[user_col.isin(positive_ids)]
    control_rows = df[user_col.isin(sampled_control_ids)]
    return positive_rows, control_rows



In [5]:
# Since the end goal of this task is to classify users and not posts,
# the training and testing data are split by user, ensuring that all of a
# user's posts fall within a single set (training or test).
def divideByUsers(df,percentage):
    """Randomly split ``df`` into two groups that share no users.

    ``int(n_unique_users * percentage)`` users are drawn without replacement
    using NumPy's global random state; all of their rows form the first
    group, and every remaining row forms the second group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with (at least) a ``User`` column.
    percentage : float
        Fraction of the unique users to place in the first group, in [0, 1].

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(first_group, second_group)``. Row order and index of ``df`` are
        preserved within each group.

    Raises
    ------
    ValueError
        If ``percentage`` is outside the interval [0, 1].
    """
    if not 0 <= percentage <= 1:
        raise ValueError(f"percentage must be between 0 and 1, got {percentage!r}")

    users = df['User'].unique()
    n_users = int(len(users)*percentage)
    first_group_users = np.random.choice(users,size=n_users, replace=False)

    # One vectorised membership test; the second group is simply its
    # complement, so the two groups can never overlap and no per-user Python
    # loop over the array is needed.
    in_first_group = df['User'].isin(first_group_users)
    first_group = df[in_first_group]
    second_group = df[~in_first_group]
    return first_group,second_group

In [6]:
# Rejection sampling: redraw the undersampled, user-level train/test split
# until positive posts make up at least MIN_POSITIVE_SHARE of the posts in
# BOTH sets. Users are balanced 50/50 by construction, but posts per user
# vary, so the post-level balance has to be checked after each draw.
# NOTE: only a lower bound on the positive share is enforced.
TRAIN_FRACTION = 0.8
MIN_POSITIVE_SHARE = 0.45

balanced=False
while not balanced:
    positive,control = undersampleUsers(df)
    positive_train,positive_test = divideByUsers(positive,TRAIN_FRACTION)
    control_train,control_test = divideByUsers(control,TRAIN_FRACTION)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
    # row order and index are the same as with append.
    train_df = pd.concat([positive_train, control_train])
    test_df = pd.concat([positive_test, control_test])
    # Rows are deliberately left unshuffled (grouped by class, in file order).
    # A row-level shuffle via .sample(frac=1, random_state=seed) was
    # considered; reordering by user might be the better option.
    train_stats = train_df['Label'].value_counts(normalize=True)
    test_stats = test_df['Label'].value_counts(normalize=True)
    # .get(..., 0.0) keeps the check well defined even if a set ends up with
    # no positive posts at all (plain [1] would raise a KeyError).
    train_pos_share = train_stats.get(1, 0.0)
    test_pos_share = test_stats.get(1, 0.0)
    if MIN_POSITIVE_SHARE<=train_pos_share and MIN_POSITIVE_SHARE<=test_pos_share:
        balanced=True
    else:
        # Log the rejected draw's positive-post shares (train, test).
        print(train_pos_share,test_pos_share)



0.43124975575442576 0.3878190129944926
0.4433835689101064 0.41458558315334776
0.40831339769471525 0.4517209517209517
0.41870894107285683 0.3349563397851193
0.39278277763741387 0.3453229828936738
0.4432287425149701 0.33555667584441773
0.3909222191406138 0.5033884327154076
0.4224241136942921 0.3556362709488307
0.41673334666933387 0.4091021365938002
0.4636284931948136 0.4029653886387411
0.40222133706335167 0.4122225011655848
0.43931211344094134 0.4426927321456724
0.40329251334499616 0.4113299475618316
0.41138056139010887 0.5445713233262731
0.4197943848221468 0.5637090773809523
0.43556398623279097 0.4943268497330282
0.4701859195550291 0.41880959913228183
0.4048504286904559 0.435089916793415
0.5124400444202916 0.3904510540385071
0.43623814157139384 0.40518682818631635
0.4341102553708958 0.3942737749338905
0.4481160626465915 0.30236011880870195
0.4923391006992293 0.3282900230878813
0.39976317673624495 0.42523574315222273
0.3973346495557749 0.4231725853254111
0.40780297732041165 0.43564784771

In [7]:
# Class balance of the training set, reported per post and per user.
# A user's label is the mean of that user's post labels. Only the `Label`
# column is aggregated: averaging the whole frame would also try to average
# the text columns, which raises a TypeError on pandas >= 2.0.
train_user_labels = train_df.groupby("User")["Label"].mean()

print("training set")
# percentage of posts by class
print("percentage of posts per class")
print(train_df['Label'].value_counts(normalize=True) * 100)
# percentage of users by class
print("percentage of users per class")
print(train_user_labels.value_counts(normalize=True)*100)

# posts by class
print("total posts per class")
print(train_df['Label'].value_counts())
# users by class
print("total users per class")
print(train_user_labels.value_counts())

training set
percentage of posts per class
0    54.21427
1    45.78573
Name: Label, dtype: float64
percentage of users per class
1.0    50.0
0.0    50.0
Name: Label, dtype: float64
total posts per class
0    53053
1    44805
Name: Label, dtype: int64
total users per class
1.0    131
0.0    131
Name: Label, dtype: int64


In [8]:
# Class balance of the testing set, reported per post and per user.
# A user's label is the mean of that user's post labels. Only the `Label`
# column is aggregated: averaging the whole frame would also try to average
# the text columns, which raises a TypeError on pandas >= 2.0.
test_user_labels = test_df.groupby("User")["Label"].mean()

print("testing set")
# percentage of posts by class
print("percentage of posts per class")
print(test_df['Label'].value_counts(normalize=True) * 100)
# percentage of users by class
print("percentage of users per class")
print(test_user_labels.value_counts(normalize=True)*100)

# posts by class
print("total posts per class")
print(test_df['Label'].value_counts())
# users by class
print("total users per class")
print(test_user_labels.value_counts())

testing set
percentage of posts per class
0    54.710388
1    45.289612
Name: Label, dtype: float64
percentage of users per class
1.0    50.0
0.0    50.0
Name: Label, dtype: float64
total posts per class
0    12213
1    10110
Name: Label, dtype: int64
total users per class
1.0    33
0.0    33
Name: Label, dtype: int64


In [9]:

# Persist both splits as tab-separated files, without writing the index.
for split_df, out_path in (
    (train_df, "../train_df.csv"),
    (test_df, "../test_df.csv"),
):
    split_df.to_csv(out_path, sep='\t', index=False)