## Quora Insincere Questions Classification
#### Aleix Casellas Comas, Rubén Barco Terrones, Andreu Masdeu Ninot, Pablo Lázaro Terrones, Marco Gani Remane

### Libraries

In [19]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier

### ETL
#### Split data into train and test

In [2]:
dir_data = 'D:/Data_Master/Natural Language Processing/Project1/quora/'
train_data = pd.read_csv(dir_data+'train.csv')
train_data.head(10)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0
5,00004f9a462a357c33be,"Is Gaza slowly becoming Auschwitz, Dachau or T...",0
6,00005059a06ee19e11ad,Why does Quora automatically ban conservative ...,0
7,0000559f875832745e2e,Is it crazy if I wash or wipe my groceries off...,0
8,00005bd3426b2d0c8305,"Is there such a thing as dressing moderately, ...",0
9,00006e6928c5df60eacb,Is it just me or have you ever been in this ph...,0


In [3]:
X_train, X_test = model_selection.train_test_split(train_data, test_size=0.2, stratify=train_data['target'], random_state=123)

In [23]:
X_train.shape, X_test.shape

((1044897, 3), (261225, 3))

In [34]:
y_train =  X_train['target'].values
y_train.shape

(1044897,)

In [35]:
y_test = X_test['target'].values
y_test.shape

(261225,)

#### Delete the '?' at the end of each question and convert them to lowercase
We can skip this step. We can try both cases and see which of them is better.

In [24]:
x_train = X_train['question_text'].values
x_train, len(x_train)

(array(['how will the united states deal with record low unemployment',
        'how long have the moderators on quora been deciding that comments don\'t meet the "be nice" policy simply because they disagree with the political opinion, apparently',
        'when does the learning curve in c++ go steep', ...,
        'is the discount rate of buying one share of a stock equal to the discount rate of buying ten shares',
        'what is the best way to get a personal loan in kenya',
        'do you think a piloted airplane could fly under the deception pass bridge'],
       dtype=object), 1044897)

In [25]:
for i in range(len(x_train[:])):
    x_train[i] = x_train[i][:-1].lower()
x_train, len(x_train)

(array(['how will the united states deal with record low unemploymen',
        'how long have the moderators on quora been deciding that comments don\'t meet the "be nice" policy simply because they disagree with the political opinion, apparentl',
        'when does the learning curve in c++ go stee', ...,
        'is the discount rate of buying one share of a stock equal to the discount rate of buying ten share',
        'what is the best way to get a personal loan in keny',
        'do you think a piloted airplane could fly under the deception pass bridg'],
       dtype=object), 1044897)

In [36]:
x_test = X_test['question_text'].values
x_test, len(x_test)

(array(['what is the minimum salary required for american express car',
        'can you make french fries only out of russet potatoe',
        'how is the mark vs relative grade at nitc? what would be the pass mark for maths 1 usually? no one has answered this type of question on quora . how much marks required for each grad',
        ...,
        'what is the maximum size transmission/front sprocket that can be used for a bajaj avenger 22',
        'how do liberals feel about mark dice absolutely destroying their ideolog',
        'in which direction does spiders make its we'], dtype=object), 261225)

In [37]:
for i in range(len(x_test[:])):
    x_test[i] = x_test[i][:-1].lower()
x_test, len(x_test)

(array(['what is the minimum salary required for american express ca',
        'can you make french fries only out of russet potato',
        'how is the mark vs relative grade at nitc? what would be the pass mark for maths 1 usually? no one has answered this type of question on quora . how much marks required for each gra',
        ...,
        'what is the maximum size transmission/front sprocket that can be used for a bajaj avenger 2',
        'how do liberals feel about mark dice absolutely destroying their ideolo',
        'in which direction does spiders make its w'], dtype=object), 261225)

#### Pipeline

In [28]:
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer() # Try with 2-gram
x_train_vec = count_vectorizer.fit_transform(x_train)
n_features = x_train_vec.shape[1]
min_k = n_features//2
# max_k = n_features
max_k = 154278

In [30]:
feature_selector = SelectKBest(chi2)
logistic = sklearn.linear_model.LogisticRegression(C=0.1)
pipeline_1 = Pipeline([
    ("count_vectorizer", count_vectorizer),
    ("feature_selector", feature_selector),
    ("logisticregression", logistic),
])

possible_K = [int(x)-1 for x in np.linspace(min_k, max_k,10)]
parameteres = {'feature_selector__k':possible_K, 'logisticregression__C':[0.1,0.01,0.001]}

pipeline_2 = sklearn.model_selection.RandomizedSearchCV(pipeline_1,
                                                   param_distributions=parameteres, 
                                                   cv=3,
                                                   n_iter=2,
                                                   n_jobs=1)

In [38]:
%time
pipeline_2.fit(x_train, y_train)

Wall time: 0 ns




RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('count_vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
 ...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
          fit_params=None, iid='warn', n_iter=2, n_jobs=1,
          param_distributions={'feature_selector__k': [98091, 108990, 119889, 130788, 141687, 152587, 163486, 174385, 185284, 196184], 'logisticregression__C': [0.1, 0.01, 0.001]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [40]:
acc_train = np.mean(pipeline_2.predict(x_train) == y_train)
acc_test = np.mean(pipeline_2.predict(x_test) == y_test)
print("acc_train={} acc_test={}".format(acc_train, acc_test))

acc_train=0.9494543481319211 acc_test=0.9484122882572495


# Testing with the real test

In [42]:
dir_data = 'D:/Data_Master/Natural Language Processing/Project1/quora/'
X_test_2 = pd.read_csv(dir_data+'test.csv')
train_data.head(10)

Unnamed: 0,qid,question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...
1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...
3,000086e4b7e1c7146103,Who are entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?
5,000101884c19f3515c1a,How do you train a pigeon to send messages?
6,00010f62537781f44a47,What is the currency in Langkawi?
7,00012afbd27452239059,"What is the future for Pandora, can the busine..."
8,00014894849d00ba98a9,My voice range is A2-C5. My chest voice goes u...
9,000156468431f09b3cae,How much does a tutor earn in Bangalore?


In [46]:
x_test_2 = X_test_2['question_text'].values
x_test_2, len(x_test_2)

(array(['Why do so many women become so rude and arrogant when they get just a little bit of wealth and power?',
        'When should I apply for RV college of engineering and BMS college of engineering? Should I wait for the COMEDK result or am I supposed to apply before the result?',
        'What is it really like to be a nurse practitioner?', ...,
        'Where I can find best friendship quotes in Telugu?',
        'What are the causes of refraction of light?',
        "Climate change is a worrying topic. How much time do we have left to find another planet? I mean, I don't think humans will survive on this earth for another 1000 years.. What do you think?"],
       dtype=object), 375806)

In [49]:
for i in range(len(x_test_2[:])):
    x_test_2[i] = x_test_2[i][:-1].lower()
x_test_2, len(x_test_2)

(array(['why do so many women become so rude and arrogant when they get just a little bit of wealth and power',
        'when should i apply for rv college of engineering and bms college of engineering? should i wait for the comedk result or am i supposed to apply before the result',
        'what is it really like to be a nurse practitioner', ...,
        'where i can find best friendship quotes in telugu',
        'what are the causes of refraction of light',
        "climate change is a worrying topic. how much time do we have left to find another planet? i mean, i don't think humans will survive on this earth for another 1000 years.. what do you think"],
       dtype=object), 375806)

In [50]:
y_pred = pipeline_2.predict(x_test_2)

In [51]:
X_test_2['prediction'] = y_pred

In [56]:
X_test_2 = X_test_2.drop(columns="question_text")

In [58]:
X_test_2.to_csv('sample_submission.csv',index=False)