
# <h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;' > Quora Insincere Questions Classification  </h1>
### We used 4 classification algorithms


* SGD Classifier
* Random Forest Classifier
* XGB Classifier
* KNeighbors Classifier


<img src="https://datawhatnow.com/wp-content/uploads/2017/04/word_cloud.jpg" width="800px">



### File descriptions
* train.csv - the training set
* test.csv - the test set
* sample_submission.csv - A sample submission in the correct format
* embeddings/ - (see below)


### Data fields
* qid - unique question identifier
* question_text - Quora question text
* target - a question labeled "insincere" has a value of 1, otherwise 0



### Dataset Link


##### [Here](https://www.kaggle.com/c/quora-insincere-questions-classification/data)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Read the competition's training set, test set and sample submission.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/quora-insincere-questions-classification/train.csv',
        '/kaggle/input/quora-insincere-questions-classification/test.csv',
        '/kaggle/input/quora-insincere-questions-classification/sample_submission.csv',
    )
)

In [None]:
# Preview the first rows of each frame (train, test, sample submission).
display(train.head(), test.head(), sub.head())

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would add a stray "None" after each one.
train.info()
test.info()
sub.info()

In [None]:
import pandas_profiling as pp

# Build the automated profiling report for the training frame; leaving it as
# the cell's last expression renders it inline.
train_report = pp.ProfileReport(train)
train_report

In [None]:
import pandas_profiling as pp

# Same automated profiling report, this time for the test frame.
test_report = pp.ProfileReport(test)
test_report

In [None]:
# Number of training rows per target label.
train['target'].value_counts()

In [None]:
# Check whether the target classes are balanced.
# Only the last expression of a cell is rendered, so the counts and the
# percentages are shown explicitly instead of being computed and discarded.
target_counts = train['target'].value_counts()
target_percent = target_counts * 100 / len(train)
display(pd.DataFrame({'count': target_counts, 'percent': target_percent}))

# Bar chart of the same distribution; the trailing ';' hides the Axes repr.
ax = sns.countplot(x='target', data=train, palette='viridis')
ax.set_title('Target class distribution');

In [None]:
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch every NLTK data package this notebook relies on, not just the stop
# list: word_tokenize needs the 'punkt' tokenizer models and WordNetLemmatizer
# needs the 'wordnet' corpus.
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab' --
# confirm against the installed version.
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

# Stored as a set so the per-token membership test in clean() is a hash lookup
# rather than a scan of the whole list.
nltk_stopwords = set(stopwords.words('english'))

wordnet_lemmatizer = WordNetLemmatizer()

def lemSentence(sentence):
    """Lemmatize every token of ``sentence`` as a verb.

    The text is split with ``word_tokenize``; each token is passed through
    ``wordnet_lemmatizer.lemmatize(..., pos="v")`` and followed by a single
    space, so a non-empty result always ends with a trailing space.

    Parameters
    ----------
    sentence : str
        Text to lemmatize.

    Returns
    -------
    str
        The lemmatized tokens, each followed by one space (``""`` when there
        are no tokens).
    """
    lemmas = (
        wordnet_lemmatizer.lemmatize(token, pos="v")
        for token in word_tokenize(sentence)
    )
    return "".join(lemma + " " for lemma in lemmas)

# One translation table that deletes ASCII punctuation and digits in a single
# pass; built once here instead of twice on every call.
_STRIP_TABLE = str.maketrans('', '', string.punctuation + string.digits)


def clean(message, lem=True):
    """Normalise one question for bag-of-words modelling.

    Steps: delete ASCII punctuation and digits, tokenize, drop tokens whose
    lower-cased form is in ``nltk_stopwords``, re-join the remaining tokens
    with single spaces and, if requested, lemmatize via ``lemSentence``.

    Note that punctuation (including apostrophes) is removed before the
    stop-word comparison, so tokens are matched against the stop list without
    their apostrophes.

    Parameters
    ----------
    message : str
        Raw text. Any non-string value (e.g. ``None`` or a ``NaN`` standing in
        for missing text) is treated as empty text.
    lem : bool, default True
        Whether to pass the filtered text through ``lemSentence``.

    Returns
    -------
    str
        The cleaned text; ``''`` for non-string input.
    """
    # Missing values reach this function as None/NaN when it is applied to a
    # pandas column; they have no .translate(), so short-circuit.
    if not isinstance(message, str):
        return ''

    # Remove punctuation and numbers.
    stripped = message.translate(_STRIP_TABLE)

    # Remove stop words (case-insensitive comparison).
    kept_words = [
        word for word in word_tokenize(stripped)
        if word.lower() not in nltk_stopwords
    ]
    message = ' '.join(kept_words)

    # Lemmatization (root of the word).
    if lem:
        message = lemSentence(message)

    return message

In [None]:
# Clean (and lemmatize) every training question into a new column.
train['question_text_cleaned'] = train['question_text'].apply(clean, lem=True)

In [None]:
# Inspect the frame after cleaning. display() keeps the rich table rendering,
# and info() is called bare because it prints its own summary and returns
# None (print(train.info()) would append a stray "None").
display(train.head())
train.info()
display(train.describe().T)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out half of the labelled questions for evaluation (50:50 split).
# The fixed random_state makes the shuffle repeatable between runs.
cleaned_questions = train['question_text_cleaned']
labels = train['target']
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_questions,
    labels,
    test_size=0.50,
    random_state=1,
)

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts feeding a linear classifier trained with SGD.
count_vectorizer = CountVectorizer()
model1 = SGDClassifier(random_state=0)

vectorize_model_pipeline = Pipeline([
    ('count_vectorizer', count_vectorizer),
    ('model', model1)])
vectorize_model_pipeline.fit(X_train, y_train)
predictions1 = vectorize_model_pipeline.predict(X_test)
print('Accuracy :', accuracy_score(y_test, predictions1))
# Report the real F1 (binary, positive label 1 by default) rather than
# repeating the accuracy under an "F1" label.
print('F1 score :', f1_score(y_test, predictions1))

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Bag-of-words counts feeding a random forest.
count_vectorizer = CountVectorizer()
# Seeded so the forest (and therefore its scores) is reproducible between
# runs, in line with the seeded SGD model.
model2 = RandomForestClassifier(random_state=0)

vectorize_model_pipeline = Pipeline([
    ('count_vectorizer', count_vectorizer),
    ('model', model2)])
vectorize_model_pipeline.fit(X_train, y_train)
predictions2 = vectorize_model_pipeline.predict(X_test)

print('Accuracy :', accuracy_score(y_test, predictions2))
# Report the real F1 (binary, positive label 1 by default) rather than
# repeating the accuracy under an "F1" label.
print('F1 score :', f1_score(y_test, predictions2))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Bag-of-words counts feeding a k-nearest-neighbours classifier
# (default hyper-parameters).
count_vectorizer = CountVectorizer()
model3 = KNeighborsClassifier()

vectorize_model_pipeline = Pipeline([
    ('count_vectorizer', count_vectorizer),
    ('model', model3)])
vectorize_model_pipeline.fit(X_train, y_train)
predictions3 = vectorize_model_pipeline.predict(X_test)

print('Accuracy :', accuracy_score(y_test, predictions3))
# Report the real F1 (binary, positive label 1 by default) rather than
# repeating the accuracy under an "F1" label.
print('F1 score :', f1_score(y_test, predictions3))

In [None]:
from xgboost import XGBClassifier
# Bag-of-words counts feeding gradient-boosted trees (default hyper-parameters).
count_vectorizer = CountVectorizer()
model4 = XGBClassifier()

vectorize_model_pipeline = Pipeline([
    ('count_vectorizer', count_vectorizer),
    ('model', model4)])
vectorize_model_pipeline.fit(X_train, y_train)
predictions4 = vectorize_model_pipeline.predict(X_test)

print('Accuracy :', accuracy_score(y_test, predictions4))
# Report the real F1 (binary, positive label 1 by default) rather than
# repeating the accuracy under an "F1" label.
print('F1 score :', f1_score(y_test, predictions4))

In [None]:
# Comparison table of the four fitted models on the held-out half.
# The column is labelled F1, so it is filled with f1_score (binary, positive
# label 1 by default) expressed in percent -- not with accuracy.
models = pd.DataFrame({
    'Model': ['SGD Classifier', 'Random Forest Classifier',
              'K Neighbors Classifier', 'XGB Classifier'],

    'F1 score ': [f1_score(y_test, predictions1) * 100,
                  f1_score(y_test, predictions2) * 100,
                  f1_score(y_test, predictions3) * 100,
                  f1_score(y_test, predictions4) * 100]})

# Lowest score first, best model on the last row.
models.sort_values(by='F1 score ', ascending=True)

In [None]:
# Run the test questions through the same cleaning (with lemmatization) that
# was applied to the training questions.
test['question_text_cleaned'] = test['question_text'].apply(clean, lem=True)

In [None]:
# `vectorize_model_pipeline` is rebound by every model cell, so this uses
# whichever pipeline was fitted last (the XGB one when cells run in order).
test_predictions = vectorize_model_pipeline.predict(test['question_text_cleaned'])
test['prediction'] = test_predictions

In [None]:
# Keep only the two submission columns, indexed by question id.
final = test[['qid', 'prediction']].set_index('qid')
final.head()

In [None]:
# Write the submission file; the qid index is written as the first column.
final.to_csv('submission.csv', index=True)