In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # data visualization like distribution charts, matrix plots, heat maps
import sklearn # scikit library for machine learning

!pip install altair
!pip install datapane
import altair as alt # declarative statistical visualization library for Python, based on Vega and Vega-Lite.
import datapane as dp # open source framework which makes it easy to build and share reports

# 1. Data set import

In [None]:

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 2. Data Preprocessing

In [None]:
# All three competition CSVs sit in the same input folder; naming it once keeps
# the loads consistent and gives a single place to change the location.
DATA_DIR = '../input/quora-insincere-questions-classification'

training_data = pd.read_csv(DATA_DIR + '/train.csv')  # training split
test_data = pd.read_csv(DATA_DIR + '/test.csv')  # test split
submission_data = pd.read_csv(DATA_DIR + '/sample_submission.csv')  # sample submission file

In [None]:
# Overview of the training split: schema, column names, shape and example rows.
# DataFrame.info() prints its own report and returns None, so it is called on its
# own line; passing it to print() appended a stray "None" after the label.
# The bare '\n' expressions did nothing, so print() now emits the blank lines.
print('Training Data')
training_data.info()  # find records and attributes
print()
print('Training data column headings:', training_data.columns)  # column headings
print()
print('Training data shape:', training_data.shape)  # rows & columns
print()
print('Training Data', training_data.head(), sep='\n')  # top 5 records
print()
print('Sample Training data', training_data.sample(10), sep='\n')  # sample 10 records

In [None]:
# Overview of the test split: schema, column names, shape and example rows.
# DataFrame.info() prints its own report and returns None, so it is called on its
# own line; passing it to print() appended a stray "None" after the label.
# The bare '\n' expressions did nothing, so print() now emits the blank lines.
print('Test Data information')
test_data.info()  # find records and attributes
print()
print('Test data column headings:', test_data.columns)  # column headings
print()
print('Test data shape:', test_data.shape)  # rows & columns
print()
print('Test Data', test_data.head(), sep='\n')  # top 5 records
print()
print('Sample Test data', test_data.sample(10), sep='\n')  # sample 10 records

### Checking insincere and sincere message content

In [None]:
training_data.loc[training_data['target'] == 1].head(5)  # first 5 rows labeled insincere (target == 1)

In [None]:
training_data.loc[training_data['target'] == 0].head(5)  # first 5 rows labeled sincere (target == 0)

In [None]:
# How many rows carry each target value, printed and then drawn as a horizontal count plot.
count = training_data['target'].value_counts()
print('Total Counts of both sets', count)

plt.figure(figsize=(15, 5))
sns.countplot(
    data=training_data,
    y="target",
    palette=['green', 'red'],
)
plt.suptitle("Frequency of Sincere Questions (0) & Insincere Questions (1)")
plt.show()

## The training dataset is imbalanced: sincere questions far outnumber insincere ones

# 3. Exploratory data analysis

In [None]:
# NOTE(review): the input here is the pair of per-class counts returned by
# value_counts(), not the target column itself — confirm this is the intended plot.
sns.distplot(a=training_data['target'].value_counts(), kde=True)

In [None]:
# Question text of every row whose target is 1 (insincere).
insincere_label = training_data.loc[training_data['target'] == 1, 'question_text']
print(insincere_label.head())  # first insincere questions

In [None]:
# Question text of every row whose target is 0 (sincere).
sincere_label = training_data.loc[training_data['target'] == 0, 'question_text']
print(sincere_label.head())  # first sincere questions

In [None]:
#Frequency of insincere and sincere questions
#Function for checking word length
def freq_len(data):
    """Return the number of items in ``data``.

    The notebook applies this to whitespace-split questions, so the result
    there is the number of words in a question.
    """
    size = len(data)
    return size

In [None]:
# Words per sincere question: split on whitespace, then count the tokens.
freq_sincere = sincere_label.str.split().apply(freq_len)
print("Sincere Questions Length:", freq_sincere, sep="")

In [None]:
# Words per insincere question: split on whitespace, then count the tokens.
freq_insincere = insincere_label.str.split().apply(freq_len)
print("Insincere Questions Length:", freq_insincere, sep="")

In [None]:
# Word-count distributions side by side: insincere (left, red), sincere (right, green).
fig, axes = plt.subplots(nrows=1, ncols=2)

sns.distplot(a=freq_insincere, color='red', ax=axes[0])
sns.distplot(a=freq_sincere, color='green', ax=axes[1])

plt.show()

### Clearly we see people using more words in insincere questions than in sincere questions

# Frequency count of Punctuations

In [None]:
import string

In [None]:
# Number of punctuation characters (per string.punctuation) in each question.
freq_sincere_punctuations = sincere_label.apply(
    lambda z: len([c for c in str(z) if c in string.punctuation]))  # punctuation in sincere questions

freq_insincere_punctuations = insincere_label.apply(
    lambda z: len([c for c in str(z) if c in string.punctuation]))  # punctuation in insincere questions

# Distribution plot of punctuation counts: insincere on the left, sincere on the right.
fig, axes = plt.subplots(1, 2)

sns.distplot(freq_insincere_punctuations, ax=axes[0], color='red')

# The sincere distribution goes on the second axis. It was previously drawn on
# axes[0] as well, which overlaid both curves and left the right panel empty.
sns.distplot(freq_sincere_punctuations, ax=axes[1], color='green')

plt.show()

# Frequency Count of Stopwords

### Importing nltk library and stopwords

In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
stop_words = set(stopwords.words('english'))  # designating stopwords

# Number of stop words in each question. The previous expression,
# np.mean([len(z) for w in str(z).split()]), averaged the question's character
# length once per word and never consulted stop_words. Words are lower-cased
# before the lookup so that capitalised stop words are counted too.
freq_insincere_stops = insincere_label.apply(
    lambda z: len([w for w in str(z).lower().split() if w in stop_words]))  # stopwords in insincere questions

freq_sincere_stops = sincere_label.apply(
    lambda z: len([w for w in str(z).lower().split() if w in stop_words]))  # stopwords in sincere questions

# Distribution plot for stopwords in insincere (left) and sincere (right) questions
fig, axes = plt.subplots(1, 2)

sns.distplot(freq_insincere_stops, ax=axes[0], color='red')

sns.distplot(freq_sincere_stops, ax=axes[1], color='green')

plt.show()

# Visualizing occurrence of words through word clouds

In [None]:
!pip install wordcloud
import wordcloud
from wordcloud import WordCloud, STOPWORDS 

In [None]:
def display_cloud(data,color,figsize):
    """Draw a word cloud for a collection of text snippets.

    Args:
        data: Iterable of strings; joined with single spaces into one document.
        color: Forwarded to WordCloud as ``contour_color``.
        figsize: Figure size forwarded to ``plt.subplots``.
    """
    plt.subplots(figsize=figsize)
    cloud_options = dict(
        stopwords=STOPWORDS,
        background_color="white",
        contour_width=2,
        contour_color=color,
        max_words=2000,
        max_font_size=256,
        random_state=42,  # fixed seed so the layout is reproducible
    )
    cloud = WordCloud(**cloud_options)
    document = ' '.join(data)
    cloud.generate(document)
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis('off')  # hide the axes around the rendered image
    plt.show()
    
# Word cloud over every question in the training data
display_cloud(data=training_data['question_text'], color='red', figsize=(15, 15))

In [None]:
display_cloud(data=insincere_label, color='red', figsize=(5, 5))  # Word cloud for insincere questions

In [None]:
display_cloud(data=sincere_label, color='green', figsize=(5, 5))  # Word cloud for sincere questions

# Most frequently occurring words in descending order

In [None]:
from collections import Counter

In [None]:
#Simplified counter function
def create_corpus(x=0):
    """Collect every whitespace-separated token of the questions labelled ``x``.

    Args:
        x: Value of the ``target`` column selecting which rows of the global
            ``training_data`` frame are used (default 0).

    Returns:
        A flat list of tokens, in row order and then in order within each question.
    """
    selected = training_data[training_data['target'] == x]['question_text']
    corpus = []
    for tokens in selected.str.split():
        corpus.extend(tokens)
    return corpus

In [None]:
# Most frequent tokens among the questions with target == 0, stop words excluded.
corpus = create_corpus(x=0)
counter = Counter(corpus)
most = counter.most_common()  # (token, occurrences) pairs, most frequent first
x = []
y = []
# The loop variable is named `freq` so it does not overwrite the `count` Series
# created by the class-frequency cell earlier in the notebook.
for word, freq in most[:100]:
    # The corpus keeps its original casing while NLTK's English stop-word list is
    # lowercase, so compare case-insensitively; otherwise capitalised stop words
    # such as "What" or "The" pass the filter and dominate the chart.
    if word.lower() not in stop_words:
        x.append(word)
        y.append(freq)

plt.figure(figsize=(15, 10))
sns.barplot(x=y, y=x)
plt.title("Most frequent words in descending order")
plt.xlabel("frequency")
plt.show()

# n-gram analysis

In [None]:
from collections import defaultdict

In [None]:
#Gram analysis on Training set- Bigram and Trigram

# English stop words dropped from the token stream before n-grams are formed.
stopword = set(stopwords.words('english'))


def gram_analysis(data,gram):
    """Return the n-grams of one sentence as space-joined strings.

    Args:
        data: The sentence. It is lower-cased and split on single spaces;
            empty pieces and stop words are discarded.
        gram: Number of consecutive tokens per n-gram (2 = bigram, 3 = trigram).

    Returns:
        List of n-gram strings in sentence order.
    """
    words = []
    for piece in data.lower().split(" "):
        if piece == "" or piece in stopword:
            continue
        words.append(piece)
    # Zipping the token list against itself shifted by 0..gram-1 positions
    # yields every window of `gram` consecutive tokens.
    shifted = [words[offset:] for offset in range(gram)]
    return [" ".join(window) for window in zip(*shifted)]

In [None]:
#Create frequency grams for analysis
    
def create_dict(data,grams):
    """Count how often each n-gram occurs across a collection of sentences.

    Args:
        data: Iterable of sentences (strings).
        grams: n-gram size forwarded to ``gram_analysis``.

    Returns:
        ``defaultdict(int)`` mapping n-gram string to its number of occurrences.
    """
    counts = defaultdict(int)
    all_ngrams = (ngram for text in data for ngram in gram_analysis(text, grams))
    for ngram in all_ngrams:
        counts[ngram] = counts[ngram] + 1
    return counts

In [None]:
def horizontal_bar_chart(df, color):
    """Build a horizontal plotly bar trace from an n-gram frequency table.

    Args:
        df: DataFrame with columns ``n_gram_words`` and ``n_gram_frequency``.
        color: Marker colour of the bars.

    Returns:
        A ``go.Bar`` trace; the rows are reversed before being handed to plotly.
    """
    # `go` is referenced but never imported anywhere in the notebook, so the
    # function raised NameError on a fresh kernel. Importing here keeps the
    # function self-contained.
    import plotly.graph_objs as go

    trace = go.Bar(
        y=df["n_gram_words"].values[::-1],
        x=df["n_gram_frequency"].values[::-1],
        showlegend=False,
        orientation='h',
        marker=dict(
            color=color,
        ),
    )
    return trace

In [None]:
def create_new_df(freq_dict):
    """Turn an n-gram frequency mapping into a bar trace of its 20 most frequent entries.

    Args:
        freq_dict: Mapping of n-gram string to number of occurrences.

    Returns:
        The trace returned by ``horizontal_bar_chart`` for the top 20 rows,
        drawn in orange.
    """
    # Ascending sort followed by a reversal, so the most frequent n-gram comes first.
    ranked = sorted(freq_dict.items(), key=lambda z: z[1])[::-1]
    # Passing the column names to the constructor also works for an empty mapping;
    # assigning two names to the column-less frame built from [] raised ValueError.
    freq_df = pd.DataFrame(ranked, columns=['n_gram_words', 'n_gram_frequency'])
    trace = horizontal_bar_chart(freq_df[:20], 'orange')
    return trace
    

In [None]:
def plot_grams(trace_zero,trace_one):
    """Show two bar traces side by side in a single plotly figure.

    Args:
        trace_zero: Trace drawn in the left subplot.
        trace_one: Trace drawn in the right subplot.
    """
    # Neither `subplots` nor `py` is imported anywhere in the notebook, so this
    # function raised NameError on a fresh kernel.
    # NOTE(review): `py` is taken to be plotly.offline, which provides
    # iplot(fig, filename=...) — confirm against the intended environment.
    from plotly import subplots
    import plotly.offline as py

    fig = subplots.make_subplots(rows=1, cols=2, vertical_spacing=0.04,
                          subplot_titles=["Frequent words of positive Questions",
                                          "Frequent words of negative Questions"])
    fig.append_trace(trace_zero, 1, 1)
    # Use the parameter. The original referenced `trace_ones`, which only worked
    # when a global of that name happened to exist in the kernel.
    fig.append_trace(trace_one, 1, 2)
    fig['layout'].update(height=1200, width=900, paper_bgcolor='rgb(233,233,233)', title="Word Count Plots")
    py.iplot(fig, filename='word-plots')
    
    

In [None]:
# Question text split by label: target 0 and target 1.
train_df_zero = training_data.loc[training_data['target'] == 0, 'question_text']

train_df_ones = training_data.loc[training_data['target'] == 1, 'question_text']

In [None]:
train_df_ones.head(n=5)  # first five questions with target == 1

In [None]:
print("Bi-gram analysis")
# Bigram counts over the first 200 questions with target == 0.
freq_train_df_zero = create_dict(data=train_df_zero[:200], grams=2)
print(freq_train_df_zero)

In [None]:
# Bar trace for the target == 0 bigram counts computed in the previous cell.
trace_zero = create_new_df(freq_train_df_zero)

# Bigram counts over the first 200 questions with target == 1.
freq_train_df_ones = create_dict(train_df_ones[:200], 2)

# Print the counts that were just computed; the original printed
# freq_train_df_zero again, which the previous cell had already shown.
print(freq_train_df_ones)

In [None]:
# Bar trace for the target == 1 bigrams, then both traces plotted side by side.
trace_ones = create_new_df(freq_dict=freq_train_df_ones)
plot_grams(trace_zero=trace_zero, trace_one=trace_ones)
print("Tri-gram analysis")

In [None]:
# Trigram counts over the first 200 questions with target == 0.
freq_train_df_zero = create_dict(data=train_df_zero[:200], grams=3)
#print(freq_train_df_zero)

In [None]:
# Trace for the target == 0 trigrams, then trigram counts for the first 200 target == 1 questions.
trace_zero = create_new_df(freq_dict=freq_train_df_zero)
freq_train_df_ones = create_dict(data=train_df_ones[:200], grams=3)
#print(freq_train_df_zero)

In [None]:
# Trace for the target == 1 trigrams, then both traces plotted side by side.
trace_ones = create_new_df(freq_dict=freq_train_df_ones)
plot_grams(trace_zero=trace_zero, trace_one=trace_ones)

In [None]:
#sincere_wordcloud = display_cloud(sincere_label,'green',figsize=(15,15))
#insincere_wordcloud = display_cloud(insincere_label,'green',figsize=(15,15))

In [None]:
#Publishing descriptive analysis results

#dp.Report(
#    dp.Plot(plot), 
#    dp.Table(df)
#).publish(name='Covid Report', open=True) 

#dp.Report(
#    dp.Plot(sincere_wordcloud),
#    dp.Plot(insincere_wordcloud)
#).publish(name='Quora Report', open=True) 

# Predictive analysis

In [None]:
#Libraries
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [None]:
from nltk.stem.porter import PorterStemmer

# Built once, outside the loop. The original rebuilt the stop-word set for every
# word and created a new stemmer for every row, which dominates the run time.
ps = PorterStemmer()
english_stop_words = set(stopwords.words('english'))

# Clean every question: keep letters only, lower-case, drop stop words, stem.
# Iterating over the column replaces the hard-coded row count (1306122), so the
# cell also works on a sample or a differently sized file.
corpus = []
for question in training_data['question_text']:
    review = re.sub('[^a-zA-Z]', ' ', question)  # every non-letter becomes a space
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in english_stop_words]
    review = ' '.join(review)
    corpus.append(review)

In [None]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features limited to 1500 vocabulary terms, converted to a dense array.
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(raw_documents=corpus).toarray()
# Labels are read from the last column of the training frame.
y = training_data.iloc[:, -1].values

In [None]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The target is heavily unbalanced (see
# the class-frequency plot above), so the split is stratified on y to keep the
# same class proportions in both parts; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0, stratify=y)

In [None]:
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes fitted on the training part of the split.
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

In [None]:
# Predicting the Test set results
# Predicted labels for the held-out rows; the bare name displays them as the cell output.
y_pred = classifier.predict(X=X_test)
y_pred

In [None]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix

# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [None]:
#Publishing output of analysis
#dp.Report(
#    dp.Plot(plot), 
#    dp.Table(df)
#).publish(name='Covid Report', open=True) 