# Text Classification with SKLearn

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier, PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier, BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process import GaussianProcessClassifier

### Import datasets

In [None]:
# Read the train and test CSVs from the Kaggle input directory.
train, test = map(
    pd.read_csv,
    (
        "/kaggle/input/nlp-getting-started/train.csv",
        "/kaggle/input/nlp-getting-started/test.csv",
    ),
)

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

### Text Vectorization

In [None]:
# Choose how the raw text column is turned into a feature matrix.
vectorization_strategies = ["tf-idf", "hashing"]
vectorization_strategy = vectorization_strategies[0]
if vectorization_strategy == "tf-idf":
    vectorizer = TfidfVectorizer()
elif vectorization_strategy == "hashing":
    # alternate_sign=False keeps hashed features non-negative, presumably so
    # the Naive Bayes models fitted later in this file accept them. Omit
    # n_features to fall back to HashingVectorizer's default feature space.
    vectorizer = HashingVectorizer(n_features=2**14, alternate_sign=False)
else:
    # Fail loudly instead of leaving `vectorizer` undefined (or silently
    # reusing one left over from an earlier run of this cell).
    raise ValueError(
        "Unknown vectorization_strategy %r; expected one of %s"
        % (vectorization_strategy, vectorization_strategies)
    )
# NOTE(review): the vectorizer is fitted on every training row before the
# train/validation split, so its statistics also cover the validation text.
train_vec = vectorizer.fit_transform(train["text"])
test_vec = vectorizer.transform(test["text"])

### Train Validation Split

In [None]:
# Hold out part of the vectorized training data for validation; the fixed
# random_state makes the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    train_vec,
    train["target"],
    random_state=42,
)
# Show the shapes of the four resulting pieces.
tuple(part.shape for part in (x_train, x_val, y_train, y_val))

### Modelling

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
import seaborn as sns
def run_experiment(model, x_train, y_train, x_val, y_val):
    """Fit ``model`` on the training split and report validation results.

    Prints the validation accuracy and displays the confusion matrix as a
    heatmap.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        x_train: Training features.
        y_train: Training labels.
        x_val: Validation features.
        y_val: Validation labels.

    Returns:
        A ``(model, metric)`` tuple: the same model object after fitting, and
        a dict with the keys ``"accuracy"`` and ``"confusion_matrix"``.
    """
    model.fit(x_train, y_train)
    y_pred = model.predict(x_val)
    acc = accuracy_score(y_val, y_pred)
    cm = confusion_matrix(y_val, y_pred)
    print("Validation Accuracy:%.2f" % acc)
    # fmt="d" prints the integer counts in full; seaborn's default ".2g"
    # abbreviates counts of 100 or more to scientific notation.
    sns.heatmap(cm, annot=True, fmt="d")
    # confusion_matrix puts true labels on rows and predictions on columns.
    plt.xlabel("Predicted label")
    plt.ylabel("True label")
    plt.title("Confusion Matrix")
    plt.show()
    return model, {"accuracy": acc, "confusion_matrix": cm}

In [None]:
# Candidate classifiers, all with default hyper-parameters, grouped by family.
base_models = [
    # Linear models
    LogisticRegression(),
    RidgeClassifier(),
    SGDClassifier(),
    PassiveAggressiveClassifier(),
    # Naive Bayes
    BernoulliNB(),
    ComplementNB(),
    MultinomialNB(),
    # Trees
    ExtraTreeClassifier(),
    DecisionTreeClassifier(),
    # Ensembles
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    # Neural network
    MLPClassifier(),
    # Gaussian process (disabled)
    # GaussianProcessClassifier(kernel=RBF(1.0))
]

# Fitted models and their validation metrics, in the same order as base_models.
models, metrics = [], []
for model in base_models:
    print("Classification with %s" % model)
    model, metric = run_experiment(model, x_train, y_train, x_val, y_val)
    models += [model]
    metrics += [metric]

### Ensemble with Voting Classifier

In [None]:
# Build (name, estimator) pairs, naming each base estimator after its repr.
estimators = [
    (str(estimator), estimator)
    for estimator in (LogisticRegression(), BernoulliNB(), ComplementNB(), MultinomialNB())
]
# voting="soft" picks the argmax of the summed predicted probabilities;
# voting="hard" would instead take a majority vote over predicted labels.
voting_classifier = VotingClassifier(estimators, voting="soft")
voting_classifier, _ = run_experiment(
    model=voting_classifier,
    x_train=x_train,
    y_train=y_train,
    x_val=x_val,
    y_val=y_val,
)

### Ensemble with Stacking Classifier

In [None]:
# Stack the same (name, estimator) pairs; no final_estimator is given, so
# StackingClassifier falls back to its default meta-learner.
stacking_classifier = StackingClassifier(estimators)
stacking_classifier, _ = run_experiment(
    model=stacking_classifier,
    x_train=x_train,
    y_train=y_train,
    x_val=x_val,
    y_val=y_val,
)

### Ensemble with Bagging Classifier

In [None]:
from sklearn.svm import SVC
# Bag default SVC models. The base estimator is passed positionally because
# the keyword was renamed from `base_estimator` to `estimator` in
# scikit-learn 1.2 (old name removed in 1.4); the positional form works on both.
bagging_classfier = BaggingClassifier(SVC())
bagging_classfier, _ = run_experiment(bagging_classfier, x_train, y_train, x_val, y_val)

## Submission

In [None]:
# Pick which fitted ensemble produces the test-set predictions.
submit_types = ["voting", "stacking", "bagging", "ensemble"]
submit_type = submit_types[3]
if submit_type == "voting":
    result = voting_classifier.predict(test_vec)
elif submit_type == "stacking":
    result = stacking_classifier.predict(test_vec)
elif submit_type == "bagging":
    result = bagging_classfier.predict(test_vec)
elif submit_type == "ensemble":
    # Average column 1 of predict_proba (presumably the target == 1 class)
    # over the three ensembles, then threshold the mean at 0.5.
    positive_probas = [
        classifier.predict_proba(test_vec)[:, 1]
        for classifier in (voting_classifier, stacking_classifier, bagging_classfier)
    ]
    result = (np.mean(positive_probas, axis=0) > 0.5).astype(int)
else:
    # Fail loudly instead of leaving `result` undefined (or silently reusing
    # one left over from an earlier run of this cell).
    raise ValueError(
        "Unknown submit_type %r; expected one of %s" % (submit_type, submit_types)
    )

# Write the predictions in the id/target layout and preview the first rows.
submission = pd.DataFrame({"id": test["id"], "target": result})
submission.to_csv("submission.csv", index=False)
submission.head()