# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [None]:
from mlxtend.plotting import plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

# Data

In [None]:
# Read the training and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/nlp-getting-started/train.csv',
                     '../input/nlp-getting-started/test.csv')
)

## Splitting the data

In [None]:
# The raw text column is the model input; the 'target' column is the label.
X = train['text']
y = train['target']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# TF-IDF features over unigrams and bigrams. The vocabulary and IDF weights are
# learned from the training split only and then reused for the held-out split.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
train_matrix = vectorizer.fit_transform(X_train)
test_matrix = vectorizer.transform(X_test)

# Convert to dense arrays for the model cells that follow.
# NOTE(review): dense bigram matrices can be very large; the estimators that
# accept sparse input could be fed train_matrix/test_matrix directly - confirm
# before changing.
X_train_vec = train_matrix.toarray()
X_test_vec = test_matrix.toarray()

In [None]:
# Number of distinct unigram/bigram features learned by the vectorizer.
# `vocabulary_` maps every term to its column index, so its length is the
# feature count; unlike `get_feature_names()` (removed in scikit-learn 1.2)
# it is available on every scikit-learn version.
len(vectorizer.vocabulary_)

# Models

In [None]:
def evaluate(y_true, y_predicted):
    """Compute the standard classification metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_predicted : array-like
        Labels predicted by a classifier, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)`` as returned by the corresponding
        ``sklearn.metrics`` scoring functions called with their defaults.
    """
    # Always score the `y_predicted` argument rather than whatever notebook-level
    # `y_pred` variable happens to exist, so the result depends only on the inputs.
    acc = metrics.accuracy_score(y_true, y_predicted)
    precision = metrics.precision_score(y_true, y_predicted)
    recall = metrics.recall_score(y_true, y_predicted)
    f1 = metrics.f1_score(y_true, y_predicted)
    return acc, precision, recall, f1

## Linear Models

### [Ridge Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifier.html#sklearn.linear_model.RidgeClassifier)

In [None]:
from sklearn.linear_model import RidgeClassifier

# Fit a ridge classifier with default hyperparameters and print its accuracy
# on the held-out split.
clf_RC = RidgeClassifier()
clf_RC.fit(X_train_vec, y_train)
print(clf_RC.score(X_test_vec, y_test))

# Plot the confusion matrix of the held-out predictions.
y_pred = clf_RC.predict(X_test_vec)
cm = metrics.confusion_matrix(y_test, y_pred)
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.show()

# This first model creates the results table that later cells add rows to.
results_df = pd.DataFrame(
    data=[["Ridge Classifier", *evaluate(y_test, y_pred)]],
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'],
)

### [Logistic Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default hyperparameters and print its
# accuracy on the held-out split.
clf_LR = LogisticRegression().fit(X_train_vec, y_train)
print(clf_LR.score(X_test_vec, y_test))

# Plot the confusion matrix of the held-out predictions.
y_pred = clf_LR.predict(X_test_vec)
cm = metrics.confusion_matrix(y_test, y_pred)
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.show()

df_LR = pd.DataFrame(data=[["Logistic Regressor", *evaluate(y_test, y_pred)]],
                     columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
results_df = pd.concat([results_df, df_LR], ignore_index=True)

### [Logistic Regression Cross-validation Estimator](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV)

In [None]:
# from sklearn.linear_model import LogisticRegressionCV

# clf_LRCV = LogisticRegressionCV(cv=5, random_state=0).fit(X_train_vec, y_train)
# print(clf_LRCV.score(X_test_vec, y_test))

# y_pred = clf_LRCV.predict(X_test_vec)
# cm = metrics.confusion_matrix(y_test, y_pred)
# fig, ax = plot_confusion_matrix(conf_mat=cm)
# plt.show()

# df_LRCV = pd.DataFrame(data=[["Logistic Regression Cross-val Estimator", *evaluate(y_test, y_pred)]],
#                           columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
# results_df = results_df.append(df_LRCV, ignore_index=True)

### [Stochastic Gradient Descent Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn.linear_model.SGDClassifier)

In [None]:
from sklearn.linear_model import SGDClassifier

# Fit an SGD-trained linear classifier with default hyperparameters and print
# its accuracy on the held-out split.
# NOTE(review): no random_state is set, so scores may vary between runs.
clf_SGDC = SGDClassifier().fit(X_train_vec, y_train)
print(clf_SGDC.score(X_test_vec, y_test))

# Plot the confusion matrix of the held-out predictions.
y_pred = clf_SGDC.predict(X_test_vec)
cm = metrics.confusion_matrix(y_test, y_pred)
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.show()

df_SGDC = pd.DataFrame(data=[["Stochastic Gradient Descent Classifier", *evaluate(y_test, y_pred)]],
                       columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
results_df = pd.concat([results_df, df_SGDC], ignore_index=True)

### [Passive Aggressive Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveClassifier.html#sklearn.linear_model.PassiveAggressiveClassifier)

In [None]:
# from sklearn.linear_model import PassiveAggressiveClassifier

# clf_PAC = PassiveAggressiveClassifier().fit(X_train_vec, y_train)
# print(clf_PAC.score(X_test_vec, y_test))

# y_pred = clf_PAC.predict(X_test_vec)
# cm = metrics.confusion_matrix(y_test, y_pred)
# fig, ax = plot_confusion_matrix(conf_mat=cm)
# plt.show()

# df_PAC = pd.DataFrame(data=[["Passive Aggressive Classifier", *evaluate(y_test, y_pred)]], 
#                           columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
# results_df = results_df.append(df_PAC, ignore_index=True)

### [Perceptron](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Perceptron.html#sklearn.linear_model.Perceptron)

In [None]:
# from sklearn.linear_model import Perceptron

# clf_Per = Perceptron().fit(X_train_vec, y_train)
# print(clf_Per.score(X_test_vec, y_test))

# y_pred = clf_Per.predict(X_test_vec)
# cm = metrics.confusion_matrix(y_test, y_pred)
# fig, ax = plot_confusion_matrix(conf_mat=cm)
# plt.show()

# df_Per = pd.DataFrame(data=[["Perceptron", *evaluate(y_test, y_pred)]], 
#                           columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
# results_df = results_df.append(df_Per, ignore_index=True)

## Support Vector Machines

### [C-Support Vector Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC)

In [None]:
# from sklearn.svm import SVC

# clf_SVC = SVC().fit(X_train_vec, y_train)
# print(clf_SVC.score(X_test_vec, y_test))

# y_pred = clf_SVC.predict(X_test_vec)
# cm = metrics.confusion_matrix(y_test, y_pred)
# fig, ax = plot_confusion_matrix(conf_mat=cm)
# plt.show()

# df_SVC = pd.DataFrame(data=[["Support Vector Classifier", *evaluate(y_test, y_pred)]], 
#                           columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
# results_df = results_df.append(df_SVC, ignore_index=True)

### [Linear Support Vector Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC)

In [None]:
from sklearn.svm import LinearSVC

# Fit a linear support vector classifier with default hyperparameters and
# print its accuracy on the held-out split.
clf_LSVC = LinearSVC().fit(X_train_vec, y_train)
print(clf_LSVC.score(X_test_vec, y_test))

# Plot the confusion matrix of the held-out predictions.
y_pred = clf_LSVC.predict(X_test_vec)
cm = metrics.confusion_matrix(y_test, y_pred)
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.show()

df_LSVC = pd.DataFrame(data=[["Linear Support Vector Classifier", *evaluate(y_test, y_pred)]],
                       columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
results_df = pd.concat([results_df, df_LSVC], ignore_index=True)

## Naive Bayes Classifiers

### [Gaussian Naive Bayes](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB)

In [None]:
# from sklearn.naive_bayes import GaussianNB

# clf_GNB = GaussianNB().fit(X_train_vec, y_train)
# print(clf_GNB.score(X_test_vec, y_test))

# y_pred = clf_GNB.predict(X_test_vec)
# cm = metrics.confusion_matrix(y_test, y_pred)
# fig, ax = plot_confusion_matrix(conf_mat=cm)
# plt.show()

# df_GNB = pd.DataFrame(data=[["Gaussian Naive Bayes Classifier", *evaluate(y_test, y_pred)]], 
#                           columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
# results_df = results_df.append(df_GNB, ignore_index=True)

### [Multinomial Naive Bayes](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive Bayes classifier with default hyperparameters and
# print its accuracy on the held-out split.
clf_MNB = MultinomialNB().fit(X_train_vec, y_train)
print(clf_MNB.score(X_test_vec, y_test))

# Plot the confusion matrix of the held-out predictions.
y_pred = clf_MNB.predict(X_test_vec)
cm = metrics.confusion_matrix(y_test, y_pred)
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.show()

df_MNB = pd.DataFrame(data=[["Multinomial Naive Bayes Classifier", *evaluate(y_test, y_pred)]],
                      columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
results_df = pd.concat([results_df, df_MNB], ignore_index=True)

### [Complement Naive Bayes Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.ComplementNB.html#sklearn.naive_bayes.ComplementNB)

In [None]:
from sklearn.naive_bayes import ComplementNB

# Fit a complement naive Bayes classifier with default hyperparameters and
# print its accuracy on the held-out split.
clf_CNB = ComplementNB().fit(X_train_vec, y_train)
print(clf_CNB.score(X_test_vec, y_test))

# Plot the confusion matrix of the held-out predictions.
y_pred = clf_CNB.predict(X_test_vec)
cm = metrics.confusion_matrix(y_test, y_pred)
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.show()

df_CNB = pd.DataFrame(data=[["Complement Naive Bayes Classifier", *evaluate(y_test, y_pred)]],
                      columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
results_df = pd.concat([results_df, df_CNB], ignore_index=True)

### [Bernoulli Naive Bayes Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html#sklearn.naive_bayes.BernoulliNB)

In [None]:
# from sklearn.naive_bayes import BernoulliNB

# clf_BNB = BernoulliNB().fit(X_train_vec, y_train)
# print(clf_BNB.score(X_test_vec, y_test))

# y_pred = clf_BNB.predict(X_test_vec)
# cm = metrics.confusion_matrix(y_test, y_pred)
# fig, ax = plot_confusion_matrix(conf_mat=cm)
# plt.show()

# df_BNB = pd.DataFrame(data=[["Bernoulli Naive Bayes Classifier", *evaluate(y_test, y_pred)]], 
#                           columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
# results_df = results_df.append(df_BNB, ignore_index=True)

## [Decision Tree Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier)

In [None]:
# from sklearn.tree import DecisionTreeClassifier

# clf_DTC = DecisionTreeClassifier().fit(X_train_vec, y_train)
# print(clf_DTC.score(X_test_vec, y_test))

# y_pred = clf_DTC.predict(X_test_vec)
# cm = metrics.confusion_matrix(y_test, y_pred)
# fig, ax = plot_confusion_matrix(conf_mat=cm)
# plt.show()

# df_DTC = pd.DataFrame(data=[["Decision Tree Classifier", *evaluate(y_test, y_pred)]], 
#                           columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
# results_df = results_df.append(df_DTC, ignore_index=True)

## Ensemble Methods

### [Random Forest Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier)

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# clf_RFC = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train_vec, y_train)
# print(clf_RFC.score(X_test_vec, y_test))

# y_pred = clf_RFC.predict(X_test_vec)
# cm = metrics.confusion_matrix(y_test, y_pred)
# fig, ax = plot_confusion_matrix(conf_mat=cm)
# plt.show()

# df_RFC = pd.DataFrame(data=[["Random Forest Classifier", *evaluate(y_test, y_pred)]], 
#                           columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
# results_df = results_df.append(df_RFC, ignore_index=True)

### [Extra Trees Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html#sklearn.ensemble.ExtraTreesClassifier)

In [None]:
# from sklearn.ensemble import ExtraTreesClassifier

# clf_ETC = ExtraTreesClassifier(n_estimators=100, random_state=0).fit(X_train_vec, y_train)
# print(clf_ETC.score(X_test_vec, y_test))

# y_pred = clf_ETC.predict(X_test_vec)
# cm = metrics.confusion_matrix(y_test, y_pred)
# fig, ax = plot_confusion_matrix(conf_mat=cm)
# plt.show()

# df_ETC = pd.DataFrame(data=[["Extra Trees Classifier", *evaluate(y_test, y_pred)]], 
#                           columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
# results_df = results_df.append(df_ETC, ignore_index=True)

### [Ada Boost Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html#sklearn.ensemble.AdaBoostClassifier)

In [None]:
# from sklearn.ensemble import AdaBoostClassifier

# clf_ABC = AdaBoostClassifier(n_estimators=100, random_state=0).fit(X_train_vec, y_train)
# print(clf_ABC.score(X_test_vec, y_test))

# y_pred = clf_ABC.predict(X_test_vec)
# cm = metrics.confusion_matrix(y_test, y_pred)
# fig, ax = plot_confusion_matrix(conf_mat=cm)
# plt.show()

# df_ABC = pd.DataFrame(data=[["Ada Boost Classifier", *evaluate(y_test, y_pred)]], 
#                           columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
# results_df = results_df.append(df_ABC, ignore_index=True)

### [Gradient Boosting Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier)

In [None]:
# from sklearn.ensemble import GradientBoostingClassifier

# clf_GBC = GradientBoostingClassifier(n_estimators=100).fit(X_train_vec, y_train)
# print(clf_GBC.score(X_test_vec, y_test))

# y_pred = clf_GBC.predict(X_test_vec)
# cm = metrics.confusion_matrix(y_test, y_pred)
# fig, ax = plot_confusion_matrix(conf_mat=cm)
# plt.show()

# df_GBC = pd.DataFrame(data=[["Gradient Boosting Classifier", *evaluate(y_test, y_pred)]], 
#                           columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
# results_df = results_df.append(df_GBC, ignore_index=True)

### [Hist Gradient Boosting Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingClassifier.html#sklearn.ensemble.HistGradientBoostingClassifier)

In [None]:
# from sklearn.experimental import enable_hist_gradient_boosting
# from sklearn.ensemble import HistGradientBoostingClassifier

# clf_HGBC = HistGradientBoostingClassifier().fit(X_train_vec, y_train)
# print(clf_HGBC.score(X_test_vec, y_test))

# y_pred = clf_HGBC.predict(X_test_vec)
# cm = metrics.confusion_matrix(y_test, y_pred)
# fig, ax = plot_confusion_matrix(conf_mat=cm)
# plt.show()

# df_HGBC = pd.DataFrame(data=[["Hist Gradient Boosting Classifier", *evaluate(y_test, y_pred)]], 
#                           columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
# results_df = results_df.append(df_HGBC, ignore_index=True)

### [Bagging Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html#sklearn.ensemble.BaggingClassifier)

In [None]:
# Display the metrics collected so far in order to select the best classifier
# to use as the base estimator for the bagging classifier.
results_df

Complement Naive Bayes Classifier has the best performance so far.

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bag ten Complement Naive Bayes models. The base estimator is passed
# positionally because its keyword was renamed from `base_estimator` to
# `estimator` in scikit-learn 1.2 (the old name was removed in 1.4); the
# positional form works on both sides of that rename.
clf_BC = BaggingClassifier(ComplementNB(),
                           n_estimators=10, random_state=0).fit(X_train_vec, y_train)
print(clf_BC.score(X_test_vec, y_test))

# Plot the confusion matrix of the held-out predictions.
y_pred = clf_BC.predict(X_test_vec)
cm = metrics.confusion_matrix(y_test, y_pred)
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.show()

df_BC = pd.DataFrame(data=[["Bagging Classifier", *evaluate(y_test, y_pred)]],
                     columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
results_df = pd.concat([results_df, df_BC], ignore_index=True)

## [Multilayer Perceptron](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier)

In [None]:
# from sklearn.neural_network import MLPClassifier

# clf_MLPC = MLPClassifier(learning_rate='adaptive', random_state=1, max_iter=300).fit(X_train_vec, y_train)
# print(clf_MLPC.score(X_test_vec, y_test))

# y_pred = clf_MLPC.predict(X_test_vec)
# cm = metrics.confusion_matrix(y_test, y_pred)
# fig, ax = plot_confusion_matrix(conf_mat=cm)
# plt.show()

# df_MLPC = pd.DataFrame(data=[["Multi-Layer Perceptron", *evaluate(y_test, y_pred)]], 
#                           columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
# results_df = results_df.append(df_MLPC, ignore_index=True)

## Model Comparison

In [None]:
# Index the table by model name so that the bar charts are labelled with it.
# This cell is not re-runnable: once 'Model' is the index it is no longer a
# column, so a second run raises KeyError.
results_df = results_df.set_index('Model')

accuracy_by_model = results_df['Accuracy']
accuracy_by_model.plot(kind='barh', figsize=(12, 8))

In [None]:
# Horizontal bar chart of the F1 score of every evaluated model.
f1_by_model = results_df['F1 Score']
f1_by_model.plot(kind='barh', figsize=(12, 8))

In [None]:
# Display the final metrics table for all evaluated models.
results_df

# Creating Submission File

In [None]:
# Vectorize the test text with the already-fitted vectorizer, then label it
# with the Complement Naive Bayes model.
test_vec = vectorizer.transform(test['text'])
test_vec = test_vec.toarray()
predictions = clf_CNB.predict(test_vec)

In [None]:
# Build the submission table: one predicted 'target' per test 'id', with the
# id used as the index so it is written as the first CSV column.
submission = pd.DataFrame({'target': predictions})
submission['id'] = test['id']
submission = submission.set_index('id')

submission.to_csv('submission.csv')

In [None]:
# Number of predictions produced for the test set.
len(predictions)

In [None]:
# Preview the first rows of the submission table.
submission.head()