# Goal

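The goal of this notebook is to build a baseline model for the Quora Insincere Questions Classification task: clean the question text, turn it into bag-of-words counts, and fit a logistic regression classifier.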

In [None]:
import pandas as pd 
import seaborn as sns

In [None]:
# Load the competition files: the sample submission, the training set and
# the test set to predict on.
df_sample_sub = pd.read_csv('/kaggle/input/quora-insincere-questions-classification/sample_submission.csv')
df_train = pd.read_csv('/kaggle/input/quora-insincere-questions-classification/train.csv')
df_test = pd.read_csv('/kaggle/input/quora-insincere-questions-classification/test.csv')

In [None]:
# Preview the first five rows of the sample submission.
df_sample_sub.head(5)

In [None]:
# Preview the first five rows of the training data.
df_train.head(5)

In [None]:
# Preview the first five rows of the test data.
df_test.head(5)

In [None]:
# Check for null values: info() prints the non-null count and dtype of
# every column, so a count below the row total reveals missing entries.
df_train.info()

In [None]:
# Count the rows per target value to see whether the classes are balanced.
# Observation recorded by the author: the problem is unbalanced.
df_train.target.value_counts()

In [None]:
# Bar chart of the number of rows per target value.
sns.countplot(data=df_train, x='target')

In [None]:
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK stop-word corpus.
nltk.download('stopwords')
# English stop words; clean() filters tokens against this list.
nltk_stopwords = stopwords.words('english')

# Shared lemmatizer instance used by lemSentence().
# NOTE(review): word_tokenize and WordNetLemmatizer rely on further NLTK
# resources (tokenizer models, WordNet) that are not downloaded here —
# presumably they are preinstalled in the runtime; confirm.
wordnet_lemmatizer = WordNetLemmatizer()

def lemSentence(sentence):
    """Lemmatize every token of *sentence* and re-join the result.

    The sentence is split with ``word_tokenize`` and each token is passed to
    the shared ``wordnet_lemmatizer`` with ``pos="v"`` (i.e. every token is
    lemmatized as a verb).

    Args:
        sentence: Text to lemmatize.

    Returns:
        The lemmatized tokens joined by single spaces, with no leading or
        trailing whitespace. An input without tokens yields ``""``.
    """
    lemmas = (
        wordnet_lemmatizer.lemmatize(token, pos="v")
        for token in word_tokenize(sentence)
    )
    # Joining with the separator (rather than appending one after every
    # token) keeps the result free of a dangling trailing space.
    return " ".join(lemmas)

# One translation table that deletes ASCII punctuation and digits in a single
# pass; built once here instead of twice on every call.
_PUNCT_DIGIT_TABLE = str.maketrans('', '', string.punctuation + string.digits)

# Set snapshot of ``nltk_stopwords``, built on first use so that stop-word
# membership is a hash lookup rather than a list scan for every token.
_stopword_set = None


def clean(message, lem=True):
    """Normalize a question before vectorization.

    Steps, in order:
      1. delete ASCII punctuation and digits,
      2. tokenize and drop tokens whose lowercase form is in
         ``nltk_stopwords``,
      3. optionally lemmatize the remaining text with ``lemSentence``.

    Args:
        message: Raw text to clean (a ``str``).
        lem: When true (the default), run ``lemSentence`` on the filtered
            text.

    Returns:
        The cleaned text. With ``lem`` false it is the kept tokens joined by
        single spaces; with ``lem`` true it is whatever ``lemSentence``
        returns for that string.
    """
    global _stopword_set
    if _stopword_set is None:
        # Snapshot taken on the first call; re-running this cell resets it.
        _stopword_set = frozenset(nltk_stopwords)

    # Remove punctuation and numbers
    message = message.translate(_PUNCT_DIGIT_TABLE)

    # Remove stop words (compared case-insensitively; kept tokens retain
    # their original casing)
    kept_tokens = [
        word
        for word in word_tokenize(message)
        if word.lower() not in _stopword_set
    ]
    message = ' '.join(kept_tokens)

    # Lemmatization (root of the word)
    if lem:
        message = lemSentence(message)

    return message

In [None]:
# Normalize every training question with clean(), lemmatization enabled.
# TODO: this row-by-row apply is really slow and should be replaced.
df_train['question_text_cleaned'] = df_train['question_text'].apply(
    lambda question: clean(question, lem=True)
)

In [None]:
# Preview the training data again, now including the cleaned-text column.
df_train.head(5)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Baseline: bag-of-words token counts fed into a logistic regression.
count_vectorizer = CountVectorizer()
model = LogisticRegression(C=1, random_state=0)

# Chain both stages so that fitting and predicting run the vectorizer and
# the classifier together on raw text.
vectorize_model_pipeline = Pipeline(
    steps=[
        ('count_vectorizer', count_vectorizer),
        ('model', model),
    ]
)

In [None]:
# Hold out 30% of the labelled questions for validation. The split is seeded
# (same seed as the model) so the reported metrics are reproducible, and
# stratified on the target so both partitions keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    df_train['question_text_cleaned'],
    df_train['target'],
    test_size=0.3,
    random_state=0,
    stratify=df_train['target'],
)
# Fit the vectorizer and the classifier on the training partition only.
vectorize_model_pipeline.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out partition.
predictions = vectorize_model_pipeline.predict(X_test)

In [None]:
# Report hold-out metrics. The F1 line must call f1_score: calling
# accuracy_score there would just print the accuracy a second time.
# NOTE(review): f1_score defaults to the binary average with positive
# label 1 — presumably the target is 0/1; confirm.
print('Accuracy :', accuracy_score(y_test, predictions))
print('F1 score :', f1_score(y_test, predictions))

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the held-out partition.
print(classification_report(y_test, predictions))

In [None]:
# Apply the same clean() normalization (lemmatization enabled) to the test
# questions so they match what the pipeline was trained on.
# TODO: this row-by-row apply is really slow and should be replaced.
df_test['question_text_cleaned'] = df_test['question_text'].apply(
    lambda question: clean(question, lem=True)
)

In [None]:
# Predict a label for every cleaned test question with the fitted pipeline.
df_test['prediction'] = vectorize_model_pipeline.predict(df_test['question_text_cleaned'])

In [None]:
# Keep only the columns needed for the submission and index the frame by
# question id.
submission_columns = ['qid', 'prediction']
df_final = df_test[submission_columns].set_index('qid')

In [None]:
# Preview the first five rows of the submission frame.
df_final.head(5)

In [None]:
# Write the submission file; the qid index is written as the first column.
df_final.to_csv('submission.csv')