## Dataset Preparation

In [22]:
import pandas as pd
import numpy as np
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics

In [23]:
## load the cleaned Quora questions from disk
df = pd.read_csv(filepath_or_buffer='cleaned_quora_data.csv')

## report the (rows, columns) of the freshly loaded frame
print(df.shape)

(1306122, 3)


In [24]:
## drop every row that contains at least one null value
## (row labels are not reset, so the original index labels are kept)
df = df.dropna(axis=0, how='any')

In [25]:
## (rows, columns) of df at this point, i.e. after the null rows were dropped
print(df.shape)

(1305904, 3)


In [26]:
## preview the first five rows of the data
df.head(n=5)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,quebec nationalist see province nation s,0
1,000032939017120e6e44,adopted dog would encourage people adopt shop,0
2,0000412ca6e4628ce2cf,velocity affect time velocity affect space geo...,0
3,000042bf85aa498cd78e,otto von guericke used magdeburg hemisphere,0
4,0000455dfa3e01eae3af,convert montra helicon mountain bike changing ...,0


#### Random sampling (without replacement) of insincere questions into training and validation sets

In [27]:
## class balance: number of questions per target label
df.loc[:, 'target'].value_counts()

0    1225105
1      80799
Name: target, dtype: int64

In [28]:
## keep only the questions labelled insincere (target equal to 1)
insincere_df = df.loc[df['target'].eq(1)]

In [29]:
## (rows, columns) of the insincere subset
insincere_df.shape

(80799, 3)

In [30]:
## display the insincere subset; its row labels are the original labels from df
insincere_df

Unnamed: 0,qid,question_text,target
22,0000e91571b60c2fb487,united state become largest dictatorship world,1
30,00013ceca3f624b09f42,baby sweeter parent dark skin baby light skin ...,1
110,0004a7fcb2bf73076489,black support school choice mandatory sentenci...,1
114,00052793eaa287aff1e1,gay boy love cousin boy sexy dont know hot wan...,1
115,000537213b01fd77b58a,race smallest penis,1
...,...,...,...
1306093,fffeba722d9b371bd1b9,intimate relation cousin,1
1306094,fffee269360dd0d3947a,singer lyric voice head religious people say h...,1
1306099,ffff0e4ea1bb6e16feec,pakis smell curry shit,1
1306103,ffff3f0a2449ffe4b9ff,trump right usa benevolent towards neighbor me...,1


In [31]:
## row labels of all insincere questions, as a plain Python list
insincere_total_index = insincere_df.index.to_list()

In [32]:
## number of insincere row labels collected
len(insincere_total_index)

80799

In [33]:
## draw half of the insincere questions for training, without replacement;
## the fixed random_state makes the draw repeatable
insincere_train = insincere_df.sample(frac=0.5, replace=False, random_state=1)

In [12]:
## size of the insincere training sample, computed from the frame itself
## instead of echoing a typed-in literal
len(insincere_train)

40400

In [34]:
## display the sampled insincere training rows (rows appear in sampled order)
insincere_train

Unnamed: 0,qid,question_text,target
237769,2e82342f91ea5a3c5795,best tip covince mom sex,1
1030932,ca044c186093e0e255d3,lend book indian spy,1
361589,46df8d430d3f7bf6f9ef,become ready sister sex,1
530402,67d7d1f32b7b309b0560,many congresspeople bad self serving,1
1036800,cb27a3a7eb1e6c3bcd30,wow iv shocked fondation human watching panora...,1
...,...,...,...
778988,989895b40fe73a6c7d12,muslim vehemently protesting bad incident towa...,1
767738,9669bc6961965174619f,muslim know west want,1
727064,8e5c7b983ea31efaec66,turk turkey support much terrorism middle east...,1
42905,0864d1ee406af642bd6f,american associate gun violence mental illness,1


In [35]:
## row labels of the insincere questions that were drawn for training
insincere_train_index = insincere_train.index.to_list()

In [37]:
## number of insincere training row labels
len(insincere_train_index)

40400

In [15]:
## spot-check: look up two sampled rows by their original row labels
a = [237769, 361589]
insincere_train.loc[a, :]

Unnamed: 0,qid,question_text,target
237769,2e82342f91ea5a3c5795,best tip covince mom sex,1
361589,46df8d430d3f7bf6f9ef,become ready sister sex,1


In [38]:
## insincere questions that were not drawn for training form the validation pool.
## Membership is tested against a set: `not in` on a list rescans the whole list
## for every label, which is quadratic at this size. The result is the same list,
## in the same order as insincere_total_index.
_insincere_train_lookup = set(insincere_train_index)
insincere_valid_index = [index for index in insincere_total_index if index not in _insincere_train_lookup]

In [39]:
## number of insincere row labels left for validation
len(insincere_valid_index)

40399

In [40]:
## select the insincere validation rows by their row labels
insincere_valid = insincere_df.loc[insincere_valid_index, :]

In [41]:
## display the insincere validation rows
insincere_valid

Unnamed: 0,qid,question_text,target
22,0000e91571b60c2fb487,united state become largest dictatorship world,1
115,000537213b01fd77b58a,race smallest penis,1
119,00056d45a1ce63856fc6,female find penis ugly,1
127,0005de07b07a17046e27,marry american woman green card much charge,1
144,00068875d7c82a5bcf88,european say superior race fact took year mid...,1
...,...,...,...
1306031,fffbada9affd3fc98e82,jew want endless immigration u israel wall sah...,1
1306093,fffeba722d9b371bd1b9,intimate relation cousin,1
1306099,ffff0e4ea1bb6e16feec,pakis smell curry shit,1
1306103,ffff3f0a2449ffe4b9ff,trump right usa benevolent towards neighbor me...,1


#### Random sampling (without replacement) of sincere questions into training and validation sets

In [42]:
## keep only the questions labelled sincere (target equal to 0)
sincere_df = df.loc[df['target'].eq(0)]

In [44]:
## (rows, columns) of the sincere subset
sincere_df.shape

(1225105, 3)

In [49]:
## row labels of all sincere questions, as a plain Python list
sincere_total_index = sincere_df.index.to_list()

In [45]:
## draw 40400 sincere questions for training, without replacement;
## the fixed random_state makes the draw repeatable
sincere_train = sincere_df.sample(n=40400, replace=False, random_state=1)

In [46]:
## display the sampled sincere training rows (rows appear in sampled order)
sincere_train

Unnamed: 0,qid,question_text,target
143806,1c26c9d5ba735163d9ae,dna test ancestrycom found year old daughter ...,0
787770,9a5545bf4a7c12976cff,thing republican democrat done american people...,0
1267118,f8520f8ce604009b1f2b,need math uceed examination,0
1056110,cef17a1b66ee35c577ca,intel sell mobile chip,0
120827,17a6aae43708ba30c8f6,factor police officer consider letting someone...,0
...,...,...,...
978327,bfa98e1a58f3b3a379a6,pg diploma course project management useful ci...,0
1174688,e633a123f7c88da46d9a,ph value curd,0
1303885,ff8f32e45639a9bbaf67,woman get pregnant period,0
1305669,ffe998c9b202c9c5f126,nativity studied upto telangana remaining stu...,0


In [48]:
## row labels of the sincere questions that were drawn for training
sincere_train_index = sincere_train.index.to_list()

In [50]:
## sincere questions that were not drawn for training.
## Membership is tested against a set: `not in` on a list rescans the whole list
## for every label, which is quadratic and extremely slow with over a million
## labels. The result is the same list, in the same order as sincere_total_index.
_sincere_train_lookup = set(sincere_train_index)
sincere_nontrain_index = [index for index in sincere_total_index if index not in _sincere_train_lookup]

In [51]:
## select the sincere rows that were not used for training, by row label
sincere_nontrain_df = sincere_df.loc[sincere_nontrain_index, :]

In [52]:
## display the sincere rows that remain after removing the training sample
sincere_nontrain_df

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,quebec nationalist see province nation s,0
1,000032939017120e6e44,adopted dog would encourage people adopt shop,0
2,0000412ca6e4628ce2cf,velocity affect time velocity affect space geo...,0
3,000042bf85aa498cd78e,otto von guericke used magdeburg hemisphere,0
4,0000455dfa3e01eae3af,convert montra helicon mountain bike changing ...,0
...,...,...,...
1306117,ffffcc4e2331aaf1e41e,technical skill need computer science undergra...,0
1306118,ffffd431801e5a2f4861,m ece good job prospect usa like india job pre...,0
1306119,ffffd48fb36b63db010c,foam insulation toxic,0
1306120,ffffec519fa37cf60c78,one start research project based biochemistry ...,0


In [55]:
## sample the sincere validation questions from the rows not used for training
## (sample() draws without replacement by default; random_state fixes the draw)
## NOTE(review): 632917 looks chosen so that the 40399 insincere validation rows make up
## ~6% of the validation set (40399 / 673316) — confirm, and consider deriving it in code
sincere_valid = sincere_nontrain_df.sample(n = 632917, random_state=1)

In [56]:
## display the sampled sincere validation rows (rows appear in sampled order)
sincere_valid

Unnamed: 0,qid,question_text,target
401216,4e9ba5b8ca78e5a184a7,scared people reason,0
678257,84d5a69a6d9375c122bd,among major orchestra better orchestra u europe,0
536338,690e732343225b18209a,greatest brightness possible,0
1019169,c7b74b34d011e1f00126,well known south park israel,0
1078398,d35396b2e1ac8b5e849f,enlarge vocal range even possible,0
...,...,...,...
281333,37116145f7ac78d3bcb8,billionaire uneducated,0
1005091,c4f786637ea5c4bb3b2b,europe center left party right wing center rig...,0
642397,7dd407c48328d12a1085,treat thick greenish discharge pregnancy,0
1773,00587c0b433e662772c9,next big impact digital india according nostra...,0


#### Combine sincere_train and insincere_train to form the training set

In [57]:
## stack the insincere and sincere training rows (insincere first) and
## renumber the rows from 0, discarding the original row labels
train_df = pd.concat(objs=[insincere_train, sincere_train], axis=0, ignore_index=True)

#### Combine sincere_valid and insincere_valid to form the validation set

In [58]:
## stack the insincere and sincere validation rows (insincere first) and
## renumber the rows from 0, discarding the original row labels
valid_df = pd.concat(objs=[insincere_valid, sincere_valid], axis=0, ignore_index=True)

In [62]:
## training set summary: overall shape, then the number of rows per target label
print(train_df.shape, train_df['target'].value_counts(), sep='\n')

(80800, 3)
1    40400
0    40400
Name: target, dtype: int64


In [63]:
## validation set summary: overall shape, then the number of rows per target label
print(valid_df.shape, valid_df['target'].value_counts(), sep='\n')

(673316, 3)
0    632917
1     40399
Name: target, dtype: int64


In [64]:
## write the training and validation sets to CSV files in the working directory;
## index=False leaves the row index out of the files
train_df.to_csv(path_or_buf='training_data.csv', index=False)
valid_df.to_csv(path_or_buf='validation_data.csv', index=False)