In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training set and show its (rows, columns) shape.
train_csv_path = '/kaggle/input/quora-insincere-questions-classification/train.csv'
data = pd.read_csv(train_csv_path)
data.shape

In [None]:
# Preview the first five training rows.
data.head(n=5)

In [None]:
# Number of distinct (non-null) labels in the target column.
data.loc[:, 'target'].nunique(dropna=True)

In [None]:
# The distinct label values themselves, in order of first appearance.
pd.unique(data['target'])

In [None]:
# Share of each label as a percentage of all training rows.
data['target'].value_counts().div(len(data)).mul(100)

In [None]:
# Count of missing values in each column.
data.isnull().sum(axis=0)

In [None]:
from gensim.parsing.porter import PorterStemmer
from gensim.parsing import remove_stopwords
stemmer = PorterStemmer()

# Text cleaning for the training questions. Each step rebinds `docs`:
#   1. lower-case the text
#   2. drop every character that is not a-z or whitespace
#   3. stem each document with the Porter stemmer
#   4. remove gensim's stop words
# NOTE(review): stop words are removed *after* stemming, so stemmed forms that
# no longer match the stop-word list are kept -- confirm this order is intended.
docs = data['question_text'].str.lower() # lower case conversion
# regex=True is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave the text untouched.
# The raw string avoids the invalid "\s" escape-sequence warning.
docs = docs.str.replace(r'[^a-z\s]', '', regex=True) # removal of special characters
# Keep the original row labels so `docs` stays aligned with `data`.
docs = pd.Series(stemmer.stem_documents(docs), index=docs.index) # Identifying root form of the word
docs = docs.apply(remove_stopwords) # Removing stop words
docs.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Hold out 20% of the cleaned questions for validation; random_state pins the
# split so reruns produce the same partition.
train_x, validate_x, train_y, validate_y = train_test_split(docs,
                                                    data['target'],
                                                    test_size=0.2,
                                                    random_state=1)

# Fit the bag-of-words vocabulary on the training split only (min_df=20 is
# passed to drop rare terms), then reuse it to transform both splits.
vectorizer = CountVectorizer(min_df=20).fit(train_x)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) replaces it. list(...) keeps `vocab` a plain list.
vocab = list(vectorizer.get_feature_names_out())
train_dtm = vectorizer.transform(train_x)
validate_dtm = vectorizer.transform(validate_x)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score

# Train a multinomial Naive Bayes classifier on the training document-term matrix.
model_nb = MultinomialNB()
model_nb.fit(train_dtm, train_y)

# Score the held-out split: overall accuracy first, then F1 for the label 1 class.
validate_y_pred = model_nb.predict(validate_dtm)
validation_accuracy = accuracy_score(validate_y, validate_y_pred)
print(validation_accuracy)
validation_f1 = f1_score(validate_y, validate_y_pred, pos_label=1)
print(validation_f1)

In [None]:
# Load the test questions and clean them with the same steps used for the
# training text: lower-case, strip non-letters, stem, remove stop words.
test = pd.read_csv('/kaggle/input/quora-insincere-questions-classification/test.csv')
test_docs = test['question_text'].str.lower() # lower case conversion
# regex=True is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave the text untouched.
# The raw string avoids the invalid "\s" escape-sequence warning.
test_docs = test_docs.str.replace(r'[^a-z\s]', '', regex=True) # removal of special characters
# Keep the original row labels so `test_docs` stays aligned with `test`.
test_docs = pd.Series(stemmer.stem_documents(test_docs), index=test_docs.index) # Identifying root form of the word
test_docs = test_docs.apply(remove_stopwords) # Removing stop words
test_docs.head()

In [None]:
# Preview the first five rows of the test set.
test.head(n=5)

In [None]:
# Vectorize the cleaned test questions with the already-fitted vocabulary,
# predict their labels, and store the predictions on the test frame.
test_dtm = vectorizer.transform(raw_documents=test_docs)
test_y_pred = model_nb.predict(X=test_dtm)
test['prediction'] = test_y_pred

In [None]:
# Load the competition's sample submission file and preview its first rows.
sample_csv_path = '/kaggle/input/quora-insincere-questions-classification/sample_submission.csv'
sample = pd.read_csv(sample_csv_path)
sample.head(n=5)

In [None]:
# Write the question ids and predicted labels to submission.csv, omitting the row index.
submission_columns = ['qid', 'prediction']
test[submission_columns].to_csv('submission.csv', index=False)