## Quora Insincere Questions Classification
#### Aleix Casellas Comas, Rubén Barco Terrones, Andreu Masdeu Ninot, Pablo Lázaro Terrones, Marco Gani Remane

### Libraries

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

### ETL
#### Split data into train and test

In [10]:
#dir_data = 'D:/Data_Master/Natural Language Processing/Project1/quora/'
#train_data = pd.read_csv(dir_data+'train.csv')
train_data = pd.read_csv('train.csv')
train_data.head(10)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0
5,00004f9a462a357c33be,"Is Gaza slowly becoming Auschwitz, Dachau or T...",0
6,00005059a06ee19e11ad,Why does Quora automatically ban conservative ...,0
7,0000559f875832745e2e,Is it crazy if I wash or wipe my groceries off...,0
8,00005bd3426b2d0c8305,"Is there such a thing as dressing moderately, ...",0
9,00006e6928c5df60eacb,Is it just me or have you ever been in this ph...,0


In [11]:
X_train, X_test = model_selection.train_test_split(train_data, test_size=0.2, stratify=train_data['target'], random_state=123)

In [12]:
X_train.shape, X_test.shape

((1044897, 3), (261225, 3))

In [13]:
y_train =  X_train['target'].values
y_train.shape

(1044897,)

In [14]:
y_test = X_test['target'].values
y_test.shape

(261225,)

In [15]:
x_train = X_train['question_text'].values
x_train, len(x_train)

(array(['How will the United States deal with record low unemployment?',
        'How long have the moderators on Quora been deciding that comments don\'t meet the "Be Nice" policy simply because they disagree with the political opinion, apparently?',
        'When does the learning curve in C++ go steep?', ...,
        'Is the discount rate of buying one share of a stock equal to the discount rate of buying ten shares?',
        'What is the best way to get a personal loan in Kenya?',
        'Do you think a piloted airplane could fly under the Deception Pass Bridge?'],
       dtype=object), 1044897)

In [23]:
x_test = X_test['question_text'].values
x_test, len(x_test)

(array(['What is the minimum salary required for American Express Card?',
        'Can you make French fries only out of russet potatoes?',
        'How is the mark vs relative grade at NITC? What would be the pass mark for maths 1 usually? No one has answered this type of question on Quora . How much marks required for each grade?',
        ...,
        'What is the maximum size Transmission/Front sprocket that can be used for a Bajaj Avenger 220?',
        'How do liberals feel about Mark Dice absolutely destroying their ideology?',
        'In which direction does spiders make its web?'], dtype=object),
 261225)

#### Pipeline

##### Countvectorizer

In [16]:
#Change the parameters inside the CountVectorizer, for example removing ngram_range or adding stop-words
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1,2))
x_train_vec = count_vectorizer.fit_transform(x_train)
n_features = x_train_vec.shape[1]
min_k = n_features//2
# max_k = n_features
max_k = 154278 

Lemma

In [17]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(tokenizer=LemmaTokenizer(),ngram_range=(1,2)) 
x_train_vec = count_vectorizer.fit_transform(x_train)
n_features = x_train_vec.shape[1]
min_k = n_features//2
# max_k = n_features
max_k = 154278 

Stem

In [16]:
from nltk.stem.porter import *

stemmer = PorterStemmer()
analyzer = CountVectorizer().build_analyzer()

def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))


count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(analyzer=stemmed_words,ngram_range=(1,2), stop_words='english')
x_train_vec = count_vectorizer.fit_transform(x_train)
n_features = x_train_vec.shape[1]
min_k = n_features//2
# max_k = n_features
max_k = 100000

##### Tfidfvectorizer

Normal

In [55]:
tfidf_vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(ngram_range=(1,2)) 
x_train_vec = tfidf_vectorizer.fit_transform(x_train)
n_features = x_train_vec.shape[1]
min_k = n_features//2
# max_k = n_features
max_k = 140280

Stem

In [16]:
from nltk.stem.porter import *

stemmer = PorterStemmer()
analyzer = CountVectorizer().build_analyzer()

def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

tfidf_vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(analyzer=stemmed_words,ngram_range=(1,2), stop_words='english') # Try with 2-gram
x_train_vec = tfidf_vectorizer.fit_transform(x_train)
n_features = x_train_vec.shape[1]
min_k = n_features//2
# max_k = n_features
max_k = 100000

Lemma

In [11]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

tfidf_vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(tokenizer=LemmaTokenizer(),ngram_range=(1,2), stop_words='english') # Try with 2-gram
x_train_vec = tfidf_vectorizer.fit_transform(x_train)
n_features = x_train_vec.shape[1]
min_k = n_features//2
# max_k = n_features
max_k = 100000

In [19]:
feature_selector = SelectKBest(chi2)
# logistic = sklearn.linear_model.LogisticRegression(C=0.1) #Comment one line or the other depending on the classifier
mnb = MultinomialNB()                                       #you want to use
pipeline_1 = Pipeline([
    ("count_vectorizer", count_vectorizer),
    ("feature_selector", feature_selector),
    ("Naive_Bayes", mnb),
#     ("logistic_regression", logistic)
])

possible_K = [int(x)-1 for x in np.linspace(min_k, max_k,10)]
parameteres = {'feature_selector__k':possible_K}#, 'Naive_Bayes__alpha':[0,0.5,1], 'logisticregression__C':[0.1,0.01,0.001]} #You can add these parameters for CV

pipeline_2 = sklearn.model_selection.RandomizedSearchCV(pipeline_1,
                                                   param_distributions=parameteres, 
                                                   cv=3,
                                                   n_iter=2,
                                                   n_jobs=1)

In [20]:
%%time
pipeline_2.fit(x_train, y_train)

Wall time: 1h 4min 21s


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('count_vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
 ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
          fit_params=None, iid=True, n_iter=2, n_jobs=1,
          param_distributions={'feature_selector__k': [1329389, 1198821, 1068253, 937685, 807117, 676549, 545981, 415413, 284845, 154277]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [21]:
pipeline_2.best_params_

{'feature_selector__k': 1198821}

In [24]:
y_train_pred = pipeline_2.predict(x_train)
y_test_pred = pipeline_2.predict(x_test)
acc_train = np.mean(y_train_pred == y_train)
acc_test = np.mean(y_test_pred == y_test)
print("acc_train={} acc_test={}".format(acc_train, acc_test))
f1_train = f1_score(y_train, y_train_pred)
f1_test = f1_score(y_test, y_test_pred)
print("f1_train={} f1_test={}".format(f1_train, f1_test))

acc_train=0.9657554763770975 acc_test=0.9555555555555556
f1_train=0.6673051175245462 f1_test=0.554489639293937


# Testing with the real test

In [78]:
# dir_data = 'D:/Data_Master/Natural Language Processing/Project1/quora/'
# X_test_2 = pd.read_csv(dir_data+'test.csv')
X_test_2 = pd.read_csv('test.csv')
train_data.head(10)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0
5,00004f9a462a357c33be,"Is Gaza slowly becoming Auschwitz, Dachau or T...",0
6,00005059a06ee19e11ad,Why does Quora automatically ban conservative ...,0
7,0000559f875832745e2e,Is it crazy if I wash or wipe my groceries off...,0
8,00005bd3426b2d0c8305,"Is there such a thing as dressing moderately, ...",0
9,00006e6928c5df60eacb,Is it just me or have you ever been in this ph...,0


In [79]:
x_test_2 = X_test_2['question_text'].values
x_test_2, len(x_test_2)

(array(['Why do so many women become so rude and arrogant when they get just a little bit of wealth and power?',
        'When should I apply for RV college of engineering and BMS college of engineering? Should I wait for the COMEDK result or am I supposed to apply before the result?',
        'What is it really like to be a nurse practitioner?', ...,
        'Where I can find best friendship quotes in Telugu?',
        'What are the causes of refraction of light?',
        "Climate change is a worrying topic. How much time do we have left to find another planet? I mean, I don't think humans will survive on this earth for another 1000 years.. What do you think?"],
       dtype=object), 375806)

In [80]:
for i in range(len(x_test_2[:])):
    x_test_2[i] = x_test_2[i][:-1].lower()
x_test_2, len(x_test_2)

(array(['why do so many women become so rude and arrogant when they get just a little bit of wealth and power',
        'when should i apply for rv college of engineering and bms college of engineering? should i wait for the comedk result or am i supposed to apply before the result',
        'what is it really like to be a nurse practitioner', ...,
        'where i can find best friendship quotes in telugu',
        'what are the causes of refraction of light',
        "climate change is a worrying topic. how much time do we have left to find another planet? i mean, i don't think humans will survive on this earth for another 1000 years.. what do you think"],
       dtype=object), 375806)

In [81]:
y_pred = pipeline_2.predict(x_test_2)

In [82]:
X_test_2['prediction'] = y_pred

In [83]:
X_test_2 = X_test_2.drop(columns="question_text")

In [84]:
X_test_2.to_csv('sample_submission.csv',index=False)