# **Loading Packages and Data**

In [None]:
import numpy as np 
import pandas as pd 
import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        print(os.path.join(folder, name))

In [None]:
import matplotlib.pyplot as plt
import math
from wordcloud import WordCloud # for words statistics

In [None]:
# Locations of the competition CSV files, relative to the notebook's working directory.
train_csv_path = "../input/quora-insincere-questions-classification/train.csv"
test_csv_path = "../input/quora-insincere-questions-classification/test.csv"

# Read both files into DataFrames: training data first, then testing data.
train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

# **Data Exploration**

In [None]:
# Print the DataFrame.info() summary for the training set, then for the test set.
for frame in (train_data, test_data):
    frame.info()

In [None]:
# Preview the first 10 rows of the training DataFrame (shown as the cell's output).
train_data.head(10)

In [None]:
# Preview the first 10 rows of the testing DataFrame (shown as the cell's output).
test_data.head(10)

In [None]:
# Split the training set by label: rows with target == 0 are kept as sincere
# questions, rows with target == 1 as insincere questions.
sincere_questions = train_data[train_data['target'] == 0]
insincere_questions = train_data[train_data['target'] == 1]
num_of_sinc = sincere_questions.shape[0]
num_of_insinc = insincere_questions.shape[0]

# Share of each class among the labelled rows, expressed as a percentage.
total_questions = num_of_sinc + num_of_insinc
percentage_of_sincere = (num_of_sinc / total_questions) * 100
percentage_of_insincere = (num_of_insinc / total_questions) * 100

# Report the shares rounded to two decimals. Collapsing them to whole percents
# would noticeably distort the small minority-class share.
print("No. of sincere questions", num_of_sinc, "Percentage:", round(percentage_of_sincere, 2), "%")
print("No. of Insincere questions", num_of_insinc, "Percentage:", round(percentage_of_insincere, 2), "%")

# Bar chart of the raw class counts.
q = [num_of_sinc, num_of_insinc]
labels = ['Sincere Questions', 'Insincere Questions']
plt.bar(labels, q)
plt.title("Target Distribution")
plt.show()

# **Word Cloud**

In [None]:
def black_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return the same near-black HSL color string for every word.

    All arguments are accepted and ignored; the signature matches how the
    function is passed as ``color_func`` to ``WordCloud.recolor`` below.
    """
    near_black = "hsl(0,100%, 1%)"
    return near_black
# Join every sincere question into one space-separated string for the word cloud.
sincere_text = " ".join(sincere_questions.question_text)

# Configure the cloud (white background, 3000x2000, at most 500 words) and generate it.
cloud_builder = WordCloud(
    background_color="white",
    width=3000,
    height=2000,
    max_words=500,
)
wordcloud = cloud_builder.generate(sincere_text)

# Recolor all words using black_color_func.
wordcloud.recolor(color_func=black_color_func)

# Draw the cloud on a 15x10 figure without axes, then save it to disk.
plt.figure(figsize=[15, 10])
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.savefig('wordcloud.png')

In [None]:
# 4/5 of the questions will be used for training;
# the rest of them are used for validation.
train_ratio = 0.8
num_of_train = int(train_ratio * (num_of_sinc + num_of_insinc))

# Convert each column to a plain Python list once and split it by position.
# This avoids a per-row label lookup in a Python loop (slow on large frames)
# and does not depend on the DataFrame having a default 0..n-1 index.
all_questions = train_data['question_text'].tolist()
train_sen = all_questions[:num_of_train]  # list of training questions
val_sen = all_questions[num_of_train:]  # list of validation questions
test_sen = test_data['question_text'].tolist()  # list of testing questions

# Labels are stored as floats and split at the same position as the questions,
# so train_label[i] / val_label[i] line up with train_sen[i] / val_sen[i].
all_labels = train_data['target'].astype(float).tolist()
train_label = all_labels[:num_of_train]  # list of training questions' labels
val_label = all_labels[num_of_train:]  # list of validation questions' labels

<h3>Feature Engineering</h3>