# Data Cleaning


<hr>

### Load Library

In [4]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
import re 
from bs4 import BeautifulSoup
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
import nltk

### Read Data

In [5]:
df_dom_main = pd.read_csv('data/domesticviolence_subs-pushshift.csv')
df_sw_main = pd.read_csv('data/suicidewatch_subs-pushshift.csv')

In [6]:
# Check shape of each dataset
dom_shape = df_dom_main.shape
sw_shape = df_sw_main.shape

print('Domestic violence total rows and columns: {}'.format(dom_shape))
print('Suicide watch total rows and columns: {}'.format(sw_shape))

Domestic violence total rows and columns: (975, 13)
Suicide watch total rows and columns: (1000, 13)


Domestic violence and Suicide watch has the same number of columns, hence we can append them into one dataset.
<hr>

### Merge Data

In [7]:
df = pd.concat([df_dom_main[['title', 'selftext', 'subreddit']],
                df_sw_main[['title', 'selftext', 'subreddit']]], ignore_index=True)
df.to_csv('data/final_sub_merge.csv', index=False)

In [8]:
df.head()

Unnamed: 0,title,selftext,subreddit
0,Domestic Violence - how to navigate this when ...,Hello - I'm seeking out advice/support. I'm no...,domesticviolence
1,"Please give me advice, I honestly don't know t...",\nThe person in question isn't my ex rather th...,domesticviolence
2,"I Finally Escaped, Again","I feel stupid for going back to my ex, but I h...",domesticviolence
3,When the man who mentally and physically abuse...,,domesticviolence
4,I feel bad for dobbing him in...,"Hi all,\n\nSo today I got so upset that I call...",domesticviolence


### Read Merge Data 

In [9]:
# Read in merged csv
df = pd.read_csv('data/final_sub_merge.csv')

In [10]:
# Replacing nulls with emptry string in selftexts column
df.replace(np.nan, '', inplace=True)

In [11]:
# Changing my target to be numerical 
df['target'] = df['subreddit'].map({'domesticviolence': 1, 'SuicideWatch': 0})

In [12]:
df.drop('subreddit', axis=1, inplace=True)

In [13]:
# Creating a mask to get rid of '[removed]'
df = df[df.selftext != '[removed]']

In [14]:
# Combining selftexts to titles
df['selftext'] = df['title'].map(str) + df['selftext']

In [15]:
# Dropping selftexts column
df = df.drop('title', axis=1)

In [16]:
# Renumbering my index properly
df.reset_index(inplace=True,drop=True)

In [17]:
# Double checking on index numbering
df.tail()

Unnamed: 0,selftext,target
1868,Feel like I can’t express my suicidal thoughts...,0
1869,The thought about having a normal life. \n\nI ...,0
1870,A shotgun would really come in handy right now.,0
1871,Ambulance workers and doctors keep mistreating...,0
1872,"I used to say that I didn’t want to die, just ...",0


In [18]:
df.shape

(1873, 2)

In [19]:
# Ensuring all missing values are being handled
df.isnull().sum()

selftext    0
target      0
dtype: int64

### Mirror Duplicate Post

In [20]:
#Finding out number of duplicated rows 
print(df['selftext'].duplicated)

<bound method Series.duplicated of 0       Domestic Violence - how to navigate this when ...
1       Please give me advice, I honestly don't know t...
2       I Finally Escaped, AgainI feel stupid for goin...
3       When the man who mentally and physically abuse...
4       I feel bad for dobbing him in...Hi all,\n\nSo ...
                              ...                        
1868    Feel like I can’t express my suicidal thoughts...
1869    The thought about having a normal life. \n\nI ...
1870      A shotgun would really come in handy right now.
1871    Ambulance workers and doctors keep mistreating...
1872    I used to say that I didn’t want to die, just ...
Name: selftext, Length: 1873, dtype: object>


In [21]:
#Removal of duplicate rows
df = df.drop_duplicates('selftext')

In [22]:
df.shape

(1349, 2)

### Export clean dataset

In [23]:
df.to_csv('data/data_modelling.csv')