## Quora Insincere Questions Classification
#### Aleix Casellas Comas, Rubén Barco Terrones, Andreu Masdeu Ninot, Pablo Lázaro Terrones, Marco Gani Remane

### Libraries

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier


### ETL
#### Split data into train and test

In [2]:
dir_data = 'C:/Users/ruben/Documents/Máster Data Science/2º Cuatrimestre/Natural Languaje Processing/ML_for_NLP-master/project_1/quora/'
train_data = pd.read_csv(dir_data+'train.csv')
train_data.head(10)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0
5,00004f9a462a357c33be,"Is Gaza slowly becoming Auschwitz, Dachau or T...",0
6,00005059a06ee19e11ad,Why does Quora automatically ban conservative ...,0
7,0000559f875832745e2e,Is it crazy if I wash or wipe my groceries off...,0
8,00005bd3426b2d0c8305,"Is there such a thing as dressing moderately, ...",0
9,00006e6928c5df60eacb,Is it just me or have you ever been in this ph...,0


In [3]:
X_train, X_test = model_selection.train_test_split(train_data, test_size=0.2, stratify=train_data['target'], random_state=123)

In [4]:
X_train.shape, X_test.shape

((1044897, 3), (261225, 3))

In [5]:
y_train =  X_train['target'].values
y_train.shape

(1044897,)

In [6]:
y_test = X_test['target'].values
y_test.shape

(261225,)

#### Delete the '?' at the end of each question and convert them to lowercase
We can skip this step. We can try both cases and see which of them is better.

In [7]:
x_train = X_train['question_text'].values
x_train, len(x_train)

(array(['How will the United States deal with record low unemployment?',
        'How long have the moderators on Quora been deciding that comments don\'t meet the "Be Nice" policy simply because they disagree with the political opinion, apparently?',
        'When does the learning curve in C++ go steep?', ...,
        'Is the discount rate of buying one share of a stock equal to the discount rate of buying ten shares?',
        'What is the best way to get a personal loan in Kenya?',
        'Do you think a piloted airplane could fly under the Deception Pass Bridge?'], dtype=object),
 1044897)

In [8]:
for i in range(len(x_train[:])):
    x_train[i] = x_train[i][:-1]   #.lower()
x_train, len(x_train)

(array(['How will the United States deal with record low unemployment',
        'How long have the moderators on Quora been deciding that comments don\'t meet the "Be Nice" policy simply because they disagree with the political opinion, apparently',
        'When does the learning curve in C++ go steep', ...,
        'Is the discount rate of buying one share of a stock equal to the discount rate of buying ten shares',
        'What is the best way to get a personal loan in Kenya',
        'Do you think a piloted airplane could fly under the Deception Pass Bridge'], dtype=object),
 1044897)

In [9]:
x_test = X_test['question_text'].values
x_test, len(x_test)

(array(['What is the minimum salary required for American Express Card?',
        'Can you make French fries only out of russet potatoes?',
        'How is the mark vs relative grade at NITC? What would be the pass mark for maths 1 usually? No one has answered this type of question on Quora . How much marks required for each grade?',
        ...,
        'What is the maximum size Transmission/Front sprocket that can be used for a Bajaj Avenger 220?',
        'How do liberals feel about Mark Dice absolutely destroying their ideology?',
        'In which direction does spiders make its web?'], dtype=object),
 261225)

In [10]:
for i in range(len(x_test[:])):
    x_test[i] = x_test[i][:-1]   #.lower()
x_test, len(x_test)

(array(['What is the minimum salary required for American Express Card',
        'Can you make French fries only out of russet potatoes',
        'How is the mark vs relative grade at NITC? What would be the pass mark for maths 1 usually? No one has answered this type of question on Quora . How much marks required for each grade',
        ...,
        'What is the maximum size Transmission/Front sprocket that can be used for a Bajaj Avenger 220',
        'How do liberals feel about Mark Dice absolutely destroying their ideology',
        'In which direction does spiders make its web'], dtype=object), 261225)

## Using CountVectroizer and 2-gram

#### Pipeline

In [11]:
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1,2)) # Try with 2-gram
x_train_vec = count_vectorizer.fit_transform(x_train)
n_features = x_train_vec.shape[1]

In [12]:
min_k = n_features//2
max_k = 154278

In [13]:
feature_selector = SelectKBest(chi2)
logistic = sklearn.linear_model.LogisticRegression(C=0.1)
pipeline_1 = Pipeline([
    ("count_vectorizer", count_vectorizer),
    ("feature_selector", feature_selector),
    ("logisticregression", logistic),
])

possible_K = [int(x)-1 for x in np.linspace(min_k, max_k,10)]
parameteres = {'feature_selector__k':possible_K, 'logisticregression__C':[0.1,0.01,0.001]}

pipeline_2 = sklearn.model_selection.RandomizedSearchCV(pipeline_1,
                                                   param_distributions=parameteres, 
                                                   cv=3,
                                                   n_iter=2,
                                                   n_jobs=1)

In [14]:
%time
pipeline_2.fit(x_train, y_train)

Wall time: 0 ns


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('count_vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
 ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
          fit_params=None, iid=True, n_iter=2, n_jobs=1,
          param_distributions={'feature_selector__k': [1368362, 1233463, 1098565, 963667, 828768, 693870, 558972, 424073, 289175, 154277], 'logisticregression__C': [0.1, 0.01, 0.001]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [15]:
acc_train = np.mean(pipeline_2.predict(x_train) == y_train)
acc_test = np.mean(pipeline_2.predict(x_test) == y_test)
print("acc_train={} acc_test={}".format(acc_train, acc_test))

acc_train=0.9649850655136344 acc_test=0.9553603215618719


## Using CountVectroizer and 3-gram

#### Pipeline

In [26]:
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1,3)) # Try with 2-gram
x_train_vec = count_vectorizer.fit_transform(x_train)
n_features = x_train_vec.shape[1]

In [27]:
min_k = n_features//2
max_k = 154278

In [28]:
feature_selector = SelectKBest(chi2)
logistic = sklearn.linear_model.LogisticRegression(C=0.1)
pipeline_1 = Pipeline([
    ("count_vectorizer", count_vectorizer),
    ("feature_selector", feature_selector),
    ("logisticregression", logistic),
])

possible_K = [int(x)-1 for x in np.linspace(min_k, max_k,10)]
parameteres = {'feature_selector__k':possible_K, 'logisticregression__C':[0.1,0.01,0.001]}

pipeline_2 = sklearn.model_selection.RandomizedSearchCV(pipeline_1,
                                                   param_distributions=parameteres, 
                                                   cv=3,
                                                   n_iter=2,
                                                   n_jobs=1)

In [29]:
%time
pipeline_2.fit(x_train, y_train)

Wall time: 0 ns


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('count_vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
 ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
          fit_params=None, iid=True, n_iter=2, n_jobs=1,
          param_distributions={'feature_selector__k': [4526694, 4040869, 3555045, 3069221, 2583397, 2097573, 1611749, 1125925, 640101, 154277], 'logisticregression__C': [0.1, 0.01, 0.001]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [30]:
acc_train = np.mean(pipeline_2.predict(x_train) == y_train)
acc_test = np.mean(pipeline_2.predict(x_test) == y_test)
print("acc_train={} acc_test={}".format(acc_train, acc_test))

acc_train=0.9540567156380007 acc_test=0.9512068140491913


# Testing with the real test

In [19]:
dir_data = 'C:/Users/ruben/Documents/Máster Data Science/2º Cuatrimestre/Natural Languaje Processing/ML_for_NLP-master/project_1/quora/'
X_test_2 = pd.read_csv(dir_data+'test.csv')
train_data.head(10)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0
5,00004f9a462a357c33be,"Is Gaza slowly becoming Auschwitz, Dachau or T...",0
6,00005059a06ee19e11ad,Why does Quora automatically ban conservative ...,0
7,0000559f875832745e2e,Is it crazy if I wash or wipe my groceries off...,0
8,00005bd3426b2d0c8305,"Is there such a thing as dressing moderately, ...",0
9,00006e6928c5df60eacb,Is it just me or have you ever been in this ph...,0


In [20]:
x_test_2 = X_test_2['question_text'].values
x_test_2, len(x_test_2)

(array([ 'Why do so many women become so rude and arrogant when they get just a little bit of wealth and power?',
        'When should I apply for RV college of engineering and BMS college of engineering? Should I wait for the COMEDK result or am I supposed to apply before the result?',
        'What is it really like to be a nurse practitioner?', ...,
        'Where I can find best friendship quotes in Telugu?',
        'What are the causes of refraction of light?',
        "Climate change is a worrying topic. How much time do we have left to find another planet? I mean, I don't think humans will survive on this earth for another 1000 years.. What do you think?"], dtype=object),
 375806)

In [21]:
for i in range(len(x_test_2[:])):
    x_test_2[i] = x_test_2[i][:-1]  #.lower()
x_test_2, len(x_test_2)

(array([ 'Why do so many women become so rude and arrogant when they get just a little bit of wealth and power',
        'When should I apply for RV college of engineering and BMS college of engineering? Should I wait for the COMEDK result or am I supposed to apply before the result',
        'What is it really like to be a nurse practitioner', ...,
        'Where I can find best friendship quotes in Telugu',
        'What are the causes of refraction of light',
        "Climate change is a worrying topic. How much time do we have left to find another planet? I mean, I don't think humans will survive on this earth for another 1000 years.. What do you think"], dtype=object),
 375806)

In [22]:
y_pred = pipeline_2.predict(x_test_2)

In [23]:
X_test_2['prediction'] = y_pred

In [24]:
X_test_2 = X_test_2.drop(columns="question_text")

In [25]:
X_test_2.to_csv('sample_submission.csv',index=False)

## CountVectroizer + ngrams + Simple Vecttorizer from notebook 02

In [None]:
import pandas as pd
import numpy as np
import sklearn
import nltk
import re
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from nltk.stem import WordNetLemmatizer, SnowballStemmer


In [None]:
class SimpleCountVectorizer(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    
    def __init__(self, min_word_counts=1,
                 tokenize_function=nltk.word_tokenize,
                 dtype_featvec=np.int64,
                 lemmatizer=None,
                 stemmer=None,
                 doc_cleaner=None):
        
        self.min_word_counts = min_word_counts
        self.vocabulary = set()
        self.word_to_ind = {}
        self.tokenize = nltk.word_tokenize
        self.dtype_featvec = dtype_featvec
        self.lemmatizer = lemmatizer
        self.stemmer = stemmer
        self.doc_cleaner =  doc_cleaner

    def transform_word(self,word):
        word = word.lower()
        if self.lemmatizer:
            word = self.lemmatizer.lemmatize(word)
        elif self.stemmer:
            word = self.stemmer.stem(word)
        return word
    
    def transform_doc(self, doc):
        if isinstance(self.doc_cleaner,retype):
            doc = self.doc_cleaner.sub(" ", doc)
        elif isinstance(self.doc_cleaner,str):
            pattern = re.compile(self.doc_cleaner)
            doc = pattern.sub(" ", doc)
            
        return doc
    
    def fit(self, X):
        #Start coding
        assert self.vocabulary == set(), "self.vocabulary is not empty it has {} words".format(len(self.vocabulary))
        assert isinstance(X,list), "X is expected to be a list of documents"
        i = 0
        for x in X:
            x = self.transform_doc(x)
            #import pdb;pdb.set_trace()   ## To put a point break
            #Do something with the doc
            words = self.tokenize(x)
            for word in words:
                if word in self.vocabulary:
                    pass
                else:
                    self.vocabulary.add(word)
                    self.word_to_ind[word] = i
                    i+=1
                    
        # end coding
        
        self.n_features = len(self.vocabulary)
        return self
    
    def transform(self, X):
        
        #Start coding
        encoded_X = scipy.sparse.lil_matrix(len(X),self.n_features) #rows are documents and columns are words
        for m, doc in enumerate(X):
            doc = self.transform_doc(doc)
            #Do something with the doc
            words = self.tokenize(doc)
            for w in words:
                if w in self.vocabulary:
                    encoded_X[m,self.word_to_ind[w]] += 1
            
        # end coding
        
        return scipy.sparse.csr_matrix(encoded_X)
        
    def fit_transform(self, X, y=None):
        self.fit(X)
        encoded_X = self.transform(X)
        return encoded_X
    
    def _words_in_vocab(self, X):
        
        if isinstance(X, str):
            return [w for w in self.tokenize(X) if w in self.vocabulary]
        
        X_words_in_vocab = []
        for sentence in X:
            X_words_in_vocab.append(self.tokenize(sentence))
            
        return X_words_in_vocab

In [None]:
dir_data = 'C:/Users/ruben/Documents/Máster Data Science/2º Cuatrimestre/Natural Languaje Processing/ML_for_NLP-master/project_1/quora/'
train_data = pd.read_csv(dir_data+'train.csv')
train_data.head(10)

In [None]:
X_train, X_test = model_selection.train_test_split(train_data, test_size=0.2, stratify=train_data['target'], random_state=123)

In [None]:
X_train.shape, X_test.shape

In [None]:
y_train =  X_train['target'].values
y_train.shape

In [None]:
y_test = X_test['target'].values
y_test.shape

#### Delete the '?' at the end of each question and convert them to lowercase
We can skip this step. We can try both cases and see which of them is better.

In [None]:
x_train = X_train['question_text'].values
x_train, len(x_train)

In [None]:
for i in range(len(x_train[:])):
    x_train[i] = x_train[i][:-1].lower()
x_train, len(x_train)

In [None]:
x_test = X_test['question_text'].values
x_test, len(x_test)

In [None]:
for i in range(len(x_test[:])):
    x_test[i] = x_test[i][:-1].lower()
x_test, len(x_test)

#### Pipeline

In [None]:
simple_count_vectorizer_stemmer = SimpleCountVectorizer(lemmatizer= None,
                                                        stemmer= SnowballStemmer('english'),
                                                        doc_cleaner=re.compile("[^a-zA-Z]"))

count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1,2)) # Try with 2-gram
x_train_vec = count_vectorizer.fit_transform(x_train)
n_features = x_train_vec.shape[1]

In [None]:
union = sklearn.pipeline.FeatureUnion([("simple_count_vectorizer_stemmer", simple_count_vectorizer_stemmer),
                                       ("count_vectorizer", count_vectorizer)])

In [None]:
union.transformer_list[0][1].doc_cleaner

In [None]:
min_k = n_features//2
max_k = 154278

In [None]:
feature_selector = SelectKBest(chi2, k = 154278)
logistic = sklearn.linear_model.LogisticRegression(C=0.1)
pipeline_1 = Pipeline([("union_vectorizers", union),
                      ("feature_selector", feature_selector),
                      ("logisticregression", logistic)])

#possible_K = [int(x)-1 for x in np.linspace(min_k, max_k,10)]
#parameteres = {'feature_selector__k':possible_K, 'logisticregression__C':[0.1,0.01,0.001]}

#pipeline_2 = sklearn.model_selection.RandomizedSearchCV(pipeline_1,
#                                                   param_distributions=parameteres, 
#                                                   cv=3,
#                                                   n_iter=2,
#                                                   n_jobs=1)

In [None]:
%time
pipeline_1.fit(x_train, y_train)

In [None]:
acc_train = np.mean(pipeline_2.predict(x_train) == y_train)
acc_test = np.mean(pipeline_2.predict(x_test) == y_test)
print("acc_train={} acc_test={}".format(acc_train, acc_test))