In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette
%matplotlib inline

from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

from sklearn import model_selection, metrics, linear_model

from sklearn.feature_extraction.text import TfidfVectorizer

pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999

In [None]:
ls ../input/quora-insincere-questions-classification

In [None]:
# Locations of the competition CSV files, relative to the notebook's working directory.
train_path = "../input/quora-insincere-questions-classification/train.csv"
test_path = "../input/quora-insincere-questions-classification/test.csv"

# Read both files into DataFrames.
train_data, test_data = pd.read_csv(train_path), pd.read_csv(test_path)

In [None]:
# Report the dimensions of both frames; each row is one question.
train_rows, train_cols = train_data.shape
test_rows, test_cols = test_data.shape

print(f"There are {train_rows} Rows and {train_cols} Columns inside train data")
print(f"There are {train_rows} questions in total in the training dataset")
print(f"There are {test_rows} Rows and {test_cols} Columns inside test data")
print(f"There are {test_rows} questions in total in the test dataset")

In [None]:
# Preview the first 30 rows of the training frame.
train_data.head(30)

In [None]:
# Preview the first rows of the test frame.
test_data.head()

In [None]:
# Number of rows per value of the `target` column; reused by the charts below.
target_count = train_data['target'].value_counts()
print(target_count)

In [None]:
# Bar chart of the class counts: one bar per target value, coloured by its
# count on the reversed 'Picnic' colour scale.
bar_marker = dict(
    color=target_count.values,
    colorscale='Picnic',
    reversescale=True,
)
barchart_data = go.Bar(x=target_count.index, y=target_count.values, marker=bar_marker)

# Figure layout: title and base font size.
layout = go.Layout(title='Target Count', font=dict(size=18))

fig = go.Figure(data=[barchart_data], layout=layout)
py.iplot(fig, filename="TargetCount")

In [None]:
# Pie chart of the class distribution: labels are the target values, slice
# sizes are each class's share of all rows, expressed in percent.
labels = np.array(target_count.index)
sizes = np.array(target_count / target_count.sum() * 100)

piechart_trace = go.Pie(labels=labels, values=sizes)
layout = go.Layout(title='Target Distribution', font=dict(size=18), width=600, height=600)

data = [piechart_trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="target_distribution")

### Inference
#### From the chart above, only 6.19% of the questions are insincere.
#### This tells us that the insincere (positive) class has very few samples compared with the sincere class, i.e. the dataset is heavily imbalanced.


In [None]:
from sklearn.utils import resample

# Split the training frame by class.
sincere_data = train_data[train_data["target"] == 0]
insincere_data = train_data[train_data["target"] == 1]

# Down-sample the majority (sincere) class to four times the size of the
# insincere class, keeping every insincere row.
n_sincere_wanted = len(insincere_data) * 4
sincere_sampled = resample(
    sincere_data,
    # Draw without replacement whenever enough rows exist: sampling with
    # replacement would duplicate questions, and duplicates can end up on both
    # sides of the later train/validation split.
    replace=n_sincere_wanted > len(sincere_data),
    n_samples=n_sincere_wanted,
    # Fixed seed so Restart & Run All reproduces the same sample.
    random_state=42,
)
train_sampled = pd.concat([sincere_sampled, insincere_data])
train_sampled

In [None]:
# Labels of the resampled training set, plus a bar chart of the class counts
# after resampling.
y = train_sampled['target']
y.value_counts().plot.bar(rot=0)

In [None]:
# Word cloud
from wordcloud import WordCloud, STOPWORDS

def plot_wordcloud(text, mask=None, max_words=200, max_font_size=100, figure_size=(24.0,16.0),
                   title = None, title_size=40, image_color=False):
    """Draw a word cloud for `text` on a new matplotlib figure.

    Parameters
    ----------
    text : str or iterable
        Either a single string, or an iterable of entries (e.g. a pandas
        Series of questions). The entries of an iterable are converted to
        strings and joined with spaces before the cloud is generated.
    mask : optional
        Passed through to ``WordCloud(mask=...)``.
    max_words, max_font_size : int
        Passed through to ``WordCloud``.
    figure_size : tuple
        Size of the matplotlib figure.
    title : str or None
        Figure title.
    title_size : int
        Font size of the title.
    image_color : bool
        Accepted for interface compatibility; not used by this function.
    """
    stopwords = set(STOPWORDS) | {'one', 'br', 'Po', 'th', 'sayi', 'fo', 'Unknown'}

    # Build the corpus from the actual entries. Calling str() on a long pandas
    # Series returns its truncated display repr (a few head/tail rows plus the
    # "Name: ..., dtype: ..." footer), so the cloud would be built from a
    # handful of rows and repr noise instead of the full text.
    if isinstance(text, (str, bytes)):
        corpus = str(text)
    else:
        try:
            corpus = " ".join(str(entry) for entry in text)
        except TypeError:
            # Not iterable: fall back to the plain string form.
            corpus = str(text)

    wordcloud = WordCloud(
        background_color='black',
        stopwords=stopwords,
        max_words=max_words,
        max_font_size=max_font_size,
        random_state=42,  # fixed seed so the layout is reproducible
        width=800,
        height=400,
        mask=mask
    )
    wordcloud.generate(corpus)

    plt.figure(figsize=figure_size)
    plt.imshow(wordcloud)
    plt.title(title, fontdict={'size': title_size, 'color': 'black',
                                  'verticalalignment': 'bottom'})
    plt.axis('off');
    plt.tight_layout()
    
# Word cloud for the question_text column of the whole training frame.
plot_wordcloud(train_data["question_text"], title="Word Cloud of Questions")

In [None]:
# Word cloud restricted to the rows labelled sincere (target == 0)
plot_wordcloud(train_data[train_data["target"] == 0]["question_text"], title="Word Cloud of Sincere Questions")

In [None]:
# Word cloud restricted to the rows labelled insincere (target == 1)
plot_wordcloud(train_data[train_data["target"] == 1]["question_text"], title="Word Cloud of Insincere Questions")

## Preprocessing
Cleaning the questions


In [None]:
import re

def clean_text(text):
  """Normalise one question string before tokenisation.

  Steps, in order: strip HTML tags; drop backslashes and quote characters;
  mask runs of two or more digits with '#'; remove Roman-numeral tokens;
  strip and lowercase; replace the listed punctuation characters with spaces.

  Notes:
    * '#' is itself in the punctuation list, so the digit masks end up as
      spaces in the returned text.
    * strip() runs before the punctuation replacement, so the result can
      still start or end with spaces.
    * Roman numerals are matched on upper-case tokens only (the check runs
      before lowercasing); tokens such as "I", "MIX" or "DC" are valid
      numerals and are removed as well.

  Args:
    text: the raw question (str).

  Returns:
    The cleaned, lower-cased string.
  """
  # Remove HTML tags (non-greedy, so each tag is removed on its own)
  text = re.sub(r'<.*?>', '', text)

  # Remove [\], ['], ["]
  text = re.sub(r'\\', '', text)
  text = re.sub(r'\"', '', text)
  text = re.sub(r'\'', '', text)

  # Mask numbers: longest runs first, capped at five '#'. Single digits are kept.
  text = re.sub('[0-9]{5,}','#####', text)
  text = re.sub('[0-9]{4,}','####', text)
  text = re.sub('[0-9]{3,}','###', text)
  text = re.sub('[0-9]{2,}','##', text)

  # Remove Roman-numeral tokens anywhere in the text. The lookarounds confine a
  # match to a whole token, and the (?=[MDCLXVI]) lookahead rejects empty
  # matches. (Anchoring with ^...$ would only fire when the entire question
  # is a numeral.)
  roman = re.compile(
      r'(?<!\w)(?=[MDCLXVI])'
      r'M{0,4}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})'
      r'(?!\w)')
  text = roman.sub(r'', text)

  # Convert all text to lowercase
  text = text.strip().lower()

  # Replace punctuation chars with spaces
  filters = '!"\'#$%@&*()+_-;:<=>.?{}|`\\^\t\n'
  translate_map = str.maketrans({char: " " for char in filters})
  text = text.translate(translate_map)

  return text

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features over 1- to 3-grams, with clean_text as the per-document
# preprocessor and scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words="english",
                             preprocessor=clean_text,
                             ngram_range=(1, 3))

# The vectorizer is fitted on the resampled training questions only; the
# competition test questions are projected with transform().
# NOTE(review): `X` (upper case) is the training matrix and `x` (lower case) is
# the competition test matrix - the two names differ only by case.
# NOTE(review): fitting happens before the train/validation split performed
# later in the notebook, so validation rows also contribute to the fitted vectorizer.
X = vectorizer.fit_transform(train_sampled['question_text'])
x = vectorizer.transform(test_data['question_text'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the resampled data for validation. The split is stratified on
# the label so both parts keep the same sincere/insincere ratio, and seeded so
# it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Baseline classifier: logistic regression with the 'sag' solver. The solver
# is stochastic, so the seed is fixed to make the fitted model reproducible.
logistic = linear_model.LogisticRegression(solver='sag', random_state=42)
logistic.fit(X_train, y_train)

In [None]:
from sklearn.metrics import f1_score, accuracy_score, classification_report

def get_f1(model, name):
  """Print the classification report and F1 score of `model` on the hold-out split.

  Reads the notebook-level `X_test` and `y_test`.

  Args:
    model: a fitted estimator exposing `predict`.
    name: label used in the printed F1 line.

  Returns:
    None; the results are printed.
  """
  # Only the hold-out predictions are reported, so only X_test is scored.
  y_pred = model.predict(X_test)
  print(classification_report(y_test, y_pred), '\n')

  print('{} model with F1 score = {}'.format(name, f1_score(y_test, y_pred)))

# Evaluate the logistic-regression model on the hold-out split.
get_f1(logistic, 'LogisticRegression')


In [None]:
# Prediction on the competition test data (`x` holds its TF-IDF features)
test_preds = logistic.predict(x)

In [None]:
# xgboost
import xgboost

# The module is imported under its own name so that `xgb` refers to one thing
# only: the classifier instance used by the following cells.
xgb = xgboost.XGBClassifier()
xgb.fit(X_train, y_train)


In [None]:
# Evaluate the XGBoost classifier on the hold-out split.
get_f1(xgb, 'XGBClassifier')

In [None]:
# Use the XGBoost classifier for the final predictions. This rebinds
# `test_preds`, replacing the logistic-regression predictions computed earlier.
model = xgb
test_preds = model.predict(x)

## Result

In [None]:
# Assemble the submission: one row per test question id with its predicted
# label, written to submission.csv without the index column.
submission_columns = {"qid": test_data["qid"].values, "prediction": test_preds}
output = pd.DataFrame(submission_columns)
output.to_csv("submission.csv", index=False)