In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Sanity check: show which files are available in the input directory.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# Read In Data

In [None]:
# Load the labelled training data.
train = pd.read_csv('../input/train.csv')
# Keep an untouched copy: `train.question_text` is overwritten with token lists
# during preprocessing and is later restored from `df.question_text`.
df = train.copy()
train.head()

In [None]:
# Load the sample submission file and preview it.
sample = pd.read_csv('../input/sample_submission.csv')
sample.head()

# Understanding the Data

In [None]:
# Number of non-null values in each column, per target class.
train.groupby('target').count()

In [None]:
# Same per-class counts, expressed as a percentage of all training rows.
train.groupby('target').count().div(train.shape[0]).mul(100)

This is a very imbalanced dataset, with only 6% of the records labelled as insincere.

In [None]:
# First few questions labelled target == 1.
train.loc[train.target == 1, 'question_text'].head().tolist()

In [None]:
# Reproducible random sample of ten questions labelled target == 1.
train.loc[train.target == 1, 'question_text'].sample(10, random_state=504).tolist()

It looks to me like "insincere" is, from Quora's perspective, questions highly lacking in quality. Some of these look like something a kid would post to try to be funny, but others look like they could be sincere musings of the misinformed. For example, religious riots in India are a very complicated topic, and to assume that a question about any given religious sect being involved in one of them is a conspiracy theory question rather than a question about the complexities of Indian pluralism is not the best idea in my opinion. Also, in Buddhism there actually are questions around whether a female can attain Nirvana. Sexist? Yes. Question rooted in real people's real beliefs over the last few thousand years? Also yes. "Sincerity" for Quora's purposes is quite different from "sincerity" for, say, Reddit. This should be kept in mind in our feature selection process.

In [None]:
# Reproducible random sample of fifteen questions labelled target == 0.
train.loc[train.target == 0, 'question_text'].sample(15, random_state=405).tolist()

"My desk turned into a Viking ship after listening to Amon Amarth. What should I do with it?" is obviously an insincere question; I don't understand these labels at all. I think this data is poorly labelled.

# What makes a question "insincere"? Hypotheses...

_In general, not always_

* Simpler vocabulary
    * capture variety of words?
* Sweeping generalizations
    * occurrence of an ethnic group and a political group in one question, for example. Not sure how to capture this.
* Poor grammar
    * bigrams and/or trigrams can capture this?
* Profanity and inflammatory words
    * collisions with urban dictionary or something like that?
* Emotional words?
    * no evidence for this yet, just a thought

# Text Feature Extraction

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from stop_words import get_stop_words
import re

stops = get_stop_words('english')
# Frozen set copy of the stop words: membership tests are O(1) instead of a
# linear scan of the list for every token of every question.
_stop_set = frozenset(stops)


def clean_list(x):
    """Strip each token and drop the ones that are empty or English stop words.

    Parameters
    ----------
    x : iterable of str
        Raw tokens, e.g. the result of ``text.split(' ')``.

    Returns
    -------
    list of str
        The stripped tokens that survive filtering, in their original order.
    """
    kept = []
    for raw in x:
        token = raw.strip()  # strip once instead of three times per token
        if token != '' and token not in _stop_set:
            kept.append(token)
    return kept
    
# Tokenise every question in a single pass: lower-case and trim, replace the
# listed punctuation characters with spaces, split on single spaces, then drop
# blanks and stop words via clean_list. question_text now holds token lists.
train.question_text = train.question_text.apply(
    lambda text: clean_list(
        re.sub(r'[?,\.!\"\']', ' ', text.lower().strip()).split(' ')
    )
)

In [None]:
# Preview: question_text now holds lists of cleaned tokens rather than strings.
train.head()

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# Lemmatise each token and store the space-joined result as the text that is
# fed to the vectoriser.
train['preprocessed_text'] = train.question_text.apply(
    lambda tokens: ' '.join([lemmatizer.lemmatize(tok) for tok in tokens])
)
# Put the raw question strings back (from the untouched copy `df`).
train.question_text = df.question_text
train.head()

In [None]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load spaCy's small English pipeline; `nlp(text).ents` is used below to
# extract named entities from each question.
nlp = en_core_web_sm.load()

In [None]:
# Build a balanced 20,000-row working set: 5,000 rows with target == 1 and
# 5,000 with target == 0 for training, plus another 5,000 + 5,000 rows for
# evaluation that are disjoint from the training samples (excluded by qid).
train_pos = train.loc[train.target == 1].sample(5000,random_state=5094)
train_neg = train.loc[train.target == 0].sample(5000,random_state=5094)
test_pos = train.loc[(train.target == 1) & ~train.qid.isin(train_pos.qid)].sample(5000,random_state=5094)
test_neg = train.loc[(train.target == 0) & ~train.qid.isin(train_neg.qid)].sample(5000,random_state=5094)
# Order matters: the model cell later splits this frame by position (first
# 10,000 rows = train, remainder = held-out), so the training samples come first.
train_subset = pd.concat([train_pos,train_neg,test_pos,test_neg])
# Run the spaCy pipeline on the raw question text and keep the entity spans.
train_subset['ner_data'] = train_subset.question_text.apply(lambda x: nlp(x).ents)

In [None]:
def get_ne_counts(tuples):
    """Tally how many entities of each label occur in one question.

    Parameters
    ----------
    tuples : iterable
        Entity objects exposing a ``label_`` attribute (here: the ``.ents``
        of a spaCy document).

    Returns
    -------
    dict
        Maps entity label -> number of occurrences, in first-seen order.
    """
    label_counts = {}
    for entity in tuples:
        label = entity.label_
        label_counts[label] = label_counts.get(label, 0) + 1
    return label_counts

# Count entity labels per question, then expand the per-row dicts into one
# numeric column per entity label (0 where a label does not occur).
train_subset['ne_types'] = train_subset.ner_data.apply(get_ne_counts)
ne_df = pd.DataFrame(list(train_subset['ne_types']),index=train_subset.index)
ne_df = ne_df.fillna(0)
# ne_df was built on train_subset's own index, so a plain column-wise concat
# already aligns row for row. The former `join_axes=` argument was redundant
# and no longer exists in pandas >= 1.0.
train_subset = pd.concat([train_subset,ne_df],axis=1)
train_subset.head()

In [None]:
# Bag-of-words term counts over the lemmatised text of the working set.
# NOTE(review): the vectoriser is fitted on every row of train_subset,
# including the rows later held out for evaluation, so the vocabulary is not
# learned from the training rows alone — consider fitting on the training
# slice only.
cvec = CountVectorizer()
counts = cvec.fit_transform(train_subset.preprocessed_text)

In [None]:
# Drop rare terms (fewer than 3 occurrences in the working set) while the
# matrix is still sparse, and only then densify. Densifying the full
# vocabulary first allocates rows x vocabulary cells, and the old per-column
# Python loop re-summed every column one at a time.
term_totals = np.asarray(counts.sum(axis=0)).ravel()
frequent_idx = np.flatnonzero(term_totals >= 3)
all_terms = cvec.get_feature_names()
counts = pd.DataFrame(
    counts[:, frequent_idx].toarray(),
    index=train_subset.index,
    columns=[all_terms[j] for j in frequent_idx],
)
counts.head()

In [None]:
# Persist the filtered term-count table (with index and header) to the
# current directory.
counts.to_csv('counts.csv',index=True,header=True)

In [None]:
# Model input: the columns of train_subset from position 6 onward (the
# per-label entity counts appended earlier) next to the term counts, with the
# label attached as the `target` column.
feature_df = pd.concat(
    [train_subset.iloc[:, 6:], counts],
    axis=1,
).assign(target=train_subset['target'].tolist())
feature_df.head()

# Train Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Default-parameter random forest as a first baseline.
rfc = RandomForestClassifier(random_state=40938)

# Label vector, and every other column as the feature matrix.
y = feature_df.target
X = feature_df.loc[:, feature_df.columns != 'target']

# Positional split: the first 10,000 rows are the training samples and the
# remaining rows are the held-out samples.
X_train, X_test = X.iloc[:10000, :], X.iloc[10000:, :]
y_train, y_test = y.iloc[:10000], y.iloc[10000:]

predictions = rfc.fit(X_train, y_train).predict(X_test)

In [None]:
# Side-by-side table of true and predicted labels; `incorrect` is 1 where
# they differ (labels are 0/1), so one minus its mean is the accuracy.
eval_df = pd.DataFrame({'actual': list(y_test), 'predicted': predictions})
eval_df['incorrect'] = (eval_df['actual'] - eval_df['predicted']).abs()
1 - eval_df.incorrect.sum() / eval_df.shape[0]

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

# Candidate hyper-parameter values for the randomised search.

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 100)]
# Number of features to consider at every split.
# 'auto' was an alias of 'sqrt' for classifiers (so the old grid contained the
# same option twice) and is rejected by scikit-learn >= 1.3; 'log2' gives the
# search a genuinely different second choice that is valid in every version.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree (None = grow until leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 100, num = 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

In [None]:
# Randomised hyper-parameter search over `random_grid`, starting from the
# baseline estimator `rfc`, scored with 3-fold cross-validation and using all
# available cores (n_jobs=-1).
# NOTE(review): n_iter=1 samples a single parameter combination, so this does
# not yet explore the grid; raise n_iter (e.g. to 100) for a real search.
rf_random = RandomizedSearchCV(estimator = rfc, n_iter=1,param_distributions = random_grid, 
                               cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the randomised search.
rf_random.best_params_

In [None]:
def evaluate(model, test_features, test_labels):
    """Print and return a classifier's accuracy, in percent, on held-out data.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(test_features)``.
    test_features : array-like
        Feature matrix passed straight to ``model.predict``.
    test_labels : array-like
        True labels, one per row of ``test_features``.

    Returns
    -------
    float
        Percentage of rows whose prediction equals the true label (0-100).

    Raises
    ------
    ZeroDivisionError
        If ``test_labels`` is empty.
    """
    predictions = model.predict(test_features)
    # Compare positionally on plain arrays so a pandas index on the labels
    # cannot trigger label-based alignment against the prediction array.
    mismatches = int(np.sum(np.asarray(predictions) != np.asarray(test_labels)))
    error_rate = mismatches / len(test_labels)
    # error_rate is a fraction in [0, 1]. The previous `100 - errors`
    # subtracted that fraction from 100 and therefore always reported 99-100%.
    accuracy = 100 * (1 - error_rate)
    print('Model Performance')
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy
# Reference model for comparison: a 10-tree forest, evaluated on the held-out rows.
base_model = RandomForestClassifier(n_estimators = 10, random_state = 42)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

In [None]:
# Evaluate the estimator chosen by the randomised search and report its
# relative change over the 10-tree reference model, in percent.
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)
print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

In [None]:
# Re-score the held-out rows with the tuned model and rebuild the comparison
# table; `incorrect` is 1 where the 0/1 labels differ.
predictions = best_random.predict(X_test)
eval_df = pd.DataFrame({'actual': list(y_test), 'predicted': predictions})
eval_df['incorrect'] = (eval_df['actual'] - eval_df['predicted']).abs()
1 - eval_df.incorrect.sum() / eval_df.shape[0]

In [None]:
# Precision: of the rows predicted as class 1, the share that truly are class 1.
precision = (
    int(((eval_df.actual == 1) & (eval_df.predicted == 1)).sum())
    / int((eval_df.predicted == 1).sum())
)
precision

In [None]:
# Recall: of the rows that truly are class 1, the share predicted as class 1.
recall = (
    int(((eval_df.actual == 1) & (eval_df.predicted == 1)).sum())
    / int((eval_df.actual == 1).sum())
)
recall

In [None]:
# Load the unlabelled test questions that predictions are generated for.
test_data = pd.read_csv('../input/test.csv')

# Make Predictions

In [None]:
# Preview the raw test data.
test_data.head()

In [None]:
# Remember the raw questions so they can be restored after preprocessing.
original = test_data['question_text'].tolist()
# Same tokenisation as for the training text, in a single pass: lower-case and
# trim, replace the listed punctuation with spaces, split on single spaces,
# then drop blanks and stop words via clean_list.
test_data.question_text = test_data.question_text.apply(
    lambda text: clean_list(
        re.sub(r'[?,\.!\"\']', ' ', text.lower().strip()).split(' ')
    )
)

In [None]:
# Lemmatise each token and store the space-joined text for the vectoriser,
# then put the raw question strings back.
test_data['preprocessed_text'] = test_data.question_text.apply(
    lambda tokens: ' '.join([lemmatizer.lemmatize(tok) for tok in tokens])
)
test_data.question_text = original
test_data.head()

In [None]:
# Run the spaCy pipeline on every raw test question and keep the entity spans.
test_data['ner_data'] = test_data.question_text.apply(lambda x: nlp(x).ents)

In [None]:
# Count entity labels per test question, then expand the per-row dicts into
# one numeric column per entity label (0 where a label does not occur).
test_data['ne_types'] = test_data.ner_data.apply(get_ne_counts)
ne_df = pd.DataFrame(list(test_data['ne_types']),index=test_data.index)
ne_df = ne_df.fillna(0)
# ne_df was built on test_data's own index, so a plain column-wise concat
# already aligns row for row. The former `join_axes=` argument was redundant
# and no longer exists in pandas >= 1.0.
test_data = pd.concat([test_data,ne_df],axis=1)
test_data.head()

In [None]:
# Term counts for the test text using the vocabulary fitted earlier.
# Note: this rebinds `counts`, replacing the training count table.
counts = cvec.transform(test_data.preprocessed_text)

In [None]:
import gc

# Predict the test set in fixed-size row chunks so that only one chunk of the
# term-count matrix is ever held in dense form.
CHUNK_SIZE = 5000
# Non-feature columns of test_data. Everything else was appended by the
# entity-count expansion. Selecting by name avoids the earlier positional
# `iloc[:, 6:]`, which silently skipped the first entity column because the
# test frame has no `target` column.
META_COLS = {'qid', 'question_text', 'preprocessed_text', 'ner_data', 'ne_types'}

model_cols = list(X_train.columns)
model_col_set = set(model_cols)

# Hoisted out of the loop: the vocabulary and the subset of it the model uses.
vocab = cvec.get_feature_names()
vocab_idx = [j for j, term in enumerate(vocab) if term in model_col_set]
vocab_cols = [vocab[j] for j in vocab_idx]
# Entity-count columns that the model was trained on.
ne_cols = [c for c in test_data.columns
           if c not in META_COLS and c in model_col_set]

predictions = []
n_rows = counts.shape[0]
# range(0, n_rows, CHUNK_SIZE) covers every row, including a final partial
# chunk; the previous stop value dropped the last row when
# n_rows % 5000 == 1.
for start in range(0, n_rows, CHUNK_SIZE):
    gc.collect()
    stop = min(start + CHUNK_SIZE, n_rows)
    rows = test_data.index[start:stop]
    # Slice the needed term columns while still sparse, then densify.
    subset = pd.DataFrame(counts[start:stop][:, vocab_idx].toarray(),
                          index=rows, columns=vocab_cols)
    chunk_features = pd.concat([test_data.iloc[start:stop][ne_cols], subset], axis=1)
    # Put the columns in exactly the training order and add any feature that
    # is absent from the test data as zeros.
    chunk_features = chunk_features.reindex(columns=model_cols, fill_value=0)
    predictions += list(best_random.predict(chunk_features))

assert len(predictions) == n_rows, 'expected one prediction per test row'

In [None]:
# Reload the raw test file (discarding the feature columns added above) and
# attach the predictions; the assignment needs exactly one prediction per row.
test_data = pd.read_csv('../input/test.csv')
test_data['prediction'] = predictions

In [None]:
# Preview the test rows with their predicted label.
test_data.head()

In [None]:
# How many test questions fall into each predicted class.
test_data.groupby('prediction').count()

In [None]:
# Keep only the id and the predicted label, and write them without the
# DataFrame index.
test_data = test_data.loc[:, ['qid', 'prediction']]
test_data.to_csv('submission.csv', index=False)