In [365]:
# Imports for common tasks, models imported later
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import fbeta_score
from bs4 import BeautifulSoup

%matplotlib inline

In [366]:
df = pd.read_csv("./data/formspring.csv")
df.shape

(12901, 14)

In [367]:
# See first few rows
df.head()

Unnamed: 0,userid,post,ques,ans,asker,ans1,severity1,bully1,ans2,severity2,bully2,ans3,severity3,bully3
0,aguitarplayer94,Q: what&#039;s your favorite song? :D<br>A: I ...,what&#039;s your favorite song? :D<br>,I like too many songs to have a favorite,,No,0,,No,0,,No,0,
1,aprilpooh15,Q: <3<br>A: </3 ? haha jk! <33,<3,</3 ? haha jk! <33,,No,0,,No,0,,No,0,
2,aprilpooh15,Q: &quot;hey angel you duh sexy&quot;<br>A: R...,&quot;hey angel you duh sexy&quot;,Really?!?! Thanks?! haha,,No,0,,No,0,,No,0,
3,aprilpooh15,Q: (:<br>A: ;(,(:,;(,,No,0,,No,0,,No,0,
4,aprilpooh15,Q: ******************MEOWWW*******************...,******************MEOWWW*************************,*RAWR*?,,No,0,,No,0,,No,0,


### Insights about the data set  
* userid - userid of a person giving the answer to a post  
* post - The post and it's reply. Separated by Q: and A.  
* ques and ans - The question and answer.  
  The same information is available in post, so we drop post  
* asker - id of the person asking the question.  
* ans #, severity #, bully # - answer by mechanical turk,   
  severity score assigned, bully word/phrase

In [368]:
df.drop(['post', 'asker'], axis = 1, inplace = True)

In [369]:
#df.sample(10)

In [370]:
df = df.replace('None', np.NaN)
#df.sample(10)

In [371]:
df['bully1']=df['bully1'].replace('N/A.','N/A')
df['bully1']=df['bully1'].replace('Na','N/A')
df['bully1']=df['bully1'].replace('N/a','N/A')
df['bully1']=df['bully1'].replace('na','N/A')
df['bully1']=df['bully1'].replace('no','No')

df['bully2']=df['bully2'].replace('N/A.','N/A')
df['bully2']=df['bully2'].replace('Na','N/A')
df['bully2']=df['bully2'].replace('N/a','N/A')
df['bully2']=df['bully2'].replace('na','N/A')
df['bully2']=df['bully2'].replace('no','No')

df['bully3']=df['bully3'].replace('N/A.','N/A')
df['bully3']=df['bully3'].replace('Na','N/A')
df['bully3']=df['bully3'].replace('N/a','N/A')
df['bully3']=df['bully3'].replace('na','N/A')
df['bully3']=df['bully3'].replace('no','No')

#Replace N/a means 'Neutral'
df['bully1']=df['bully1'].replace('N/A',0)
df['bully2']=df['bully2'].replace('N/A',0)
df['bully3']=df['bully3'].replace('N/A',0)

df['bully1']=df['bully1'].fillna(1)
df['bully2']=df['bully2'].fillna(1)
df['bully3']=df['bully3'].fillna(1)
#df['bully3'].value_counts()

In [372]:
df.head()

Unnamed: 0,userid,ques,ans,ans1,severity1,bully1,ans2,severity2,bully2,ans3,severity3,bully3
0,aguitarplayer94,what&#039;s your favorite song? :D<br>,I like too many songs to have a favorite,No,0,1,No,0,1,No,0,1
1,aprilpooh15,<3,</3 ? haha jk! <33,No,0,1,No,0,1,No,0,1
2,aprilpooh15,&quot;hey angel you duh sexy&quot;,Really?!?! Thanks?! haha,No,0,1,No,0,1,No,0,1
3,aprilpooh15,(:,;(,No,0,1,No,0,1,No,0,1
4,aprilpooh15,******************MEOWWW*************************,*RAWR*?,No,0,1,No,0,1,No,0,1


**Userid**
* User id has some corrupted rows. Some misplaced values.

* We find that for counts greater than 3, the rows are not corrupted.
  Drop rows which are truly corrupted

In [373]:
def impute_ans_columns(value):
    if value == 'Yes':
        return -1
    if value == 'No':
        return 1
    return -2

In [374]:
# Threshold value
userid_count_threshold = 1 #3

# Get row indexes which we will drop
#drop_indexes = df[df['userid'].map(df['userid'].value_counts()) <= userid_count_threshold].index

#df_dropped = df.drop(drop_indexes)

### Imputing Missing Values and Replacing values
1. ans columns
   NaN for ans columns indicates No Bully found. We will map every Yes to    1 and every No or NaN to 0

In [375]:
for col in ['ans1', 'ans2', 'ans3']:
    df[col] = df[col].apply(impute_ans_columns)

2. severity columns
* NaN/None was associated with a No(0) that is 0 severity mostly.
  So we will replace all NaN/None with 0.

In [376]:
def impute_severity_columns(value):
    '''Value will be a string. We need to convert it to int'''
    try:
        return int(value)
    except ValueError as e:
        return -2

In [377]:
for col in ['severity1', 'severity2', 'severity3']:
    df[col] = df[col].apply(impute_severity_columns)

In [378]:
df.head(10)

Unnamed: 0,userid,ques,ans,ans1,severity1,bully1,ans2,severity2,bully2,ans3,severity3,bully3
0,aguitarplayer94,what&#039;s your favorite song? :D<br>,I like too many songs to have a favorite,1,0,1,1,0,1,1,0,1
1,aprilpooh15,<3,</3 ? haha jk! <33,1,0,1,1,0,1,1,0,1
2,aprilpooh15,&quot;hey angel you duh sexy&quot;,Really?!?! Thanks?! haha,1,0,1,1,0,1,1,0,1
3,aprilpooh15,(:,;(,1,0,1,1,0,1,1,0,1
4,aprilpooh15,******************MEOWWW*************************,*RAWR*?,1,0,1,1,0,1,1,0,1
5,aprilpooh15,any makeup tips? i suck at doing my makeup lol,Sure! Like tell me wht u wnna know?! Like wht...,1,0,1,1,0,1,1,0,1
6,aprilpooh15,Apriiiiiiiiiiiill!!! I miss uuuu! It&#039;s Em...,EMMA hahahahah :D I MISSSSSeddd YUHHHHh 22222...,1,0,1,1,0,1,1,0,1
7,aprilpooh15,Are you a morning or night person?,Night 4shuree!!,1,0,1,1,0,1,1,-2,1
8,aprilpooh15,are you a trusting person?,alreadi answrd,1,0,1,1,0,1,1,0,1
9,aprilpooh15,are you a trusting person?,Yes veryy trustin person!!! May i help yuh:!,1,0,1,1,0,1,1,0,1


In [379]:
#sevirety range
for col in ['severity1', 'severity2', 'severity3']:
    print(col,":","min:", df_dropped[col].min(),"max:",df_dropped[col].max())

severity1 : min: -2 max: 10
severity2 : min: -2 max: 10
severity3 : min: -2 max: 10


In [380]:
for col in ['severity1', 'severity2', 'severity3']:
    print(df_dropped[col].value_counts())

 0     11468
-2       238
 1       199
 2       173
 3       140
 5       131
 6       104
 4        98
 7        76
 8        70
 10       52
 9        28
Name: severity1, dtype: int64
 0     11495
-2       261
 1       200
 2       173
 3       140
 5       106
 4        99
 10       70
 6        70
 7        67
 8        61
 9        35
Name: severity2, dtype: int64
 0     11468
-2       308
 1       208
 2       147
 3       135
 5       107
 4       101
 6        86
 7        80
 8        60
 10       47
 9        30
Name: severity3, dtype: int64


In [381]:
for col in ['bully1', 'bully2', 'bully3']:
    print(df_dropped[col].value_counts())

-2    12503
 0      223
 1       51
Name: bully1, dtype: int64
-2    12525
 0      196
 1       56
Name: bully2, dtype: int64
-2    12508
 0      200
 1       69
Name: bully3, dtype: int64


### Creating the target feature
Since in this dataset, we have not been given any target feature, we will create our own target feature.

Neutral = 0  ( where bully(i) == N/A )  
Bullying = -1 
Not Bulling = 1  
Any errors = -2

In [382]:
def create_target(value):
    if value == 0:
        return value
    if df['ans1'].equals(df['ans2']) and df['severity1']>0 and df['severity2']>0:
        return 'Negative'
    if df['ans1'].equals(df['ans3']) and df['severity1']>0 and df['severity3']>0:
        return 'Negative'
    if df['ans2'].equals(df['ans3']) and df['severity2']>0 and df['severity3']>0:
        return 'Negative'
    return 'Positive'

In [383]:
for col in ['bully1','bully2','bully3']:
    df['isBully'] = ['Neutral' if x == 0 else -3 for x in df[col]]
#    df['color'] = ['red' if x == 'Z' else 'green' for x in df['Set']]
    #df1['A'] = df1['A'].apply(lambda x: [y if y <= 9 else 11 for y in x])
#df.loc[df['age1'] - df['age2'] > 0, 'diff'] = df['age1'] - df['age2']

In [384]:
df['isBully'] = df['isBully'].apply(create_target)

In [385]:
df.head()


Unnamed: 0,userid,ques,ans,ans1,severity1,bully1,ans2,severity2,bully2,ans3,severity3,bully3,isBully
0,aguitarplayer94,what&#039;s your favorite song? :D<br>,I like too many songs to have a favorite,1,0,1,1,0,1,1,0,1,Positive
1,aprilpooh15,<3,</3 ? haha jk! <33,1,0,1,1,0,1,1,0,1,Positive
2,aprilpooh15,&quot;hey angel you duh sexy&quot;,Really?!?! Thanks?! haha,1,0,1,1,0,1,1,0,1,Positive
3,aprilpooh15,(:,;(,1,0,1,1,0,1,1,0,1,Positive
4,aprilpooh15,******************MEOWWW*************************,*RAWR*?,1,0,1,1,0,1,1,0,1,Positive


In [386]:
def clean_text(value):
    '''Value will be a string.'''
    txt = None
    try:
        soup = BeautifulSoup(value, 'html.parser')
        txt = soup.get_text()
        txt = txt.replace('"', '')
    except Exception as e:
        pass
    return txt

In [387]:
import string
def remove_punct(text):
    if not text:
        return text
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    return text


In [388]:
clean_text("&quot;hey angel you duh sexy&quot;")

'hey angel you duh sexy'

### Text Cleaning

In [389]:
for col in ['ques', 'ans']:
    df[col] = df[col].apply(clean_text)
    df[col] = df[col].apply(lambda x: remove_punct(x))






In [390]:
df.head(20)

Unnamed: 0,userid,ques,ans,ans1,severity1,bully1,ans2,severity2,bully2,ans3,severity3,bully3,isBully
0,aguitarplayer94,whats your favorite song D,I like too many songs to have a favorite,1,0,1,1,0,1,1,0,1,Positive
1,aprilpooh15,,haha jk,1,0,1,1,0,1,1,0,1,Positive
2,aprilpooh15,hey angel you duh sexy,Really Thanks haha,1,0,1,1,0,1,1,0,1,Positive
3,aprilpooh15,,,1,0,1,1,0,1,1,0,1,Positive
4,aprilpooh15,MEOWWW,RAWR,1,0,1,1,0,1,1,0,1,Positive
5,aprilpooh15,any makeup tips i suck at doing my makeup lol,Sure Like tell me wht u wnna know Like wht do...,1,0,1,1,0,1,1,0,1,Positive
6,aprilpooh15,Apriiiiiiiiiiiill I miss uuuu Its Emma btw haha,EMMA hahahahah D I MISSSSSeddd YUHHHHh,1,0,1,1,0,1,1,0,1,Positive
7,aprilpooh15,Are you a morning or night person,Night shuree,1,0,1,1,0,1,1,-2,1,Positive
8,aprilpooh15,are you a trusting person,alreadi answrd,1,0,1,1,0,1,1,0,1,Positive
9,aprilpooh15,are you a trusting person,Yes veryy trustin person May i help yuh,1,0,1,1,0,1,1,0,1,Positive


In [392]:
df = df.drop(['userid','ans1','severity1','bully1','ans2','severity2','bully2','ans3','severity3','bully3'], axis=1)

In [394]:
df.sample(10)

Unnamed: 0,ques,ans,isBully
11571,Have you ever seen someone fire a gun in person,Myself my Dad and a number of People in the...,Positive
2826,do yew kno yew uqlyy as fukc,do yew kno dat yo mom iz stank ass hoe lmao,Positive
9974,why would i spy on you well the reasons are en...,Its such a shame I thought I was special kno...,Positive
12287,ur a boy,a man yes,Positive
2505,would you rather be kingor queen of the univer...,Queen being that im a lady and Miss Universe ...,Positive
5820,How do you think of these questions Lol,i dont i steal them lol,Positive
8807,Do you ever think why bother,Frequently,Positive
12681,When you go to bed what do you sleep in Do yo...,I usally sleep in the Nude but I live with m...,Positive
5502,Does having a crack in my zz mean I need sawn up,thats is indeed what it means DD,Positive
5399,Do you believe in Fate,yes i do,Positive


In [395]:
df.to_csv('./data/processed_formspring.csv')