In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV,  train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC



In [None]:

# Print the full path of every file found beneath the Kaggle input directory.
import os
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))



In [None]:
# Load the headlines dataset from the Kaggle input mount into a DataFrame.
headlines_csv = '/kaggle/input/dutch-news-headlines/headlines_dataset.csv'
df = pd.read_csv(headlines_csv)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(5)

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Number of distinct headline values; a result smaller than len(df) means
# the frame contains duplicated headlines.
len(pd.unique(df['headline']))

In [None]:
# Keep only the first row for each distinct headline.
df = df.drop_duplicates(subset='headline', keep='first')

In [None]:
# Bar chart of how many rows fall into each value of the is_sarcastic target.
sns.set_theme(style="darkgrid")
ax = sns.countplot(data=df, x="is_sarcastic")

In [None]:

# Collapse three indicator columns into one label column and plot how often each
# label occurs for every value of is_sarcastic.
# NOTE(review): assumes the columns at positions 4-6 are the one-hot encoded
# columns — confirm against df.columns, positional selection breaks if columns move.
df_viz = df.copy()  # independent copy: visualisation prep must never touch df
# Explicit copy of the slice; assigning a new column onto a bare iloc slice
# raises SettingWithCopyWarning and has ambiguous view/copy semantics.
df_non_hot = df_viz.iloc[:, 4:7].copy()
# For each row, the label of the column that holds the row's maximum value.
df_non_hot['non_hot'] = df_non_hot.idxmax(axis=1)
non_hot = df_non_hot['non_hot']
df_viz2 = df_viz.join(non_hot)  # index-aligned join adds the label column

# Label counts per is_sarcastic value, drawn as horizontal bars.
df_viz2.groupby('is_sarcastic').non_hot.value_counts().unstack(0).plot.barh()

In [None]:
# Features are the headline column, target is the is_sarcastic column,
# each converted to a NumPy array.
X, y = (np.array(df[column]) for column in ('headline', 'is_sarcastic'))

# Shuffled 80/20 split, stratified on the target and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=42,
)

In [None]:
# Baseline model: token counts fed into a linear classifier trained with SGD.
vect = CountVectorizer()
# SGD shuffles the training data between epochs, so without a fixed seed the
# fitted model (and every metric reported from it) changes on each run.
# 42 matches the seed already used for the train/test split.
sgd = SGDClassifier(random_state=42)

headline_pipe = make_pipeline(vect, sgd)
headline_pipe

In [None]:
# Train the baseline pipeline on the training split, then predict labels
# for the held-out test headlines (fit returns the fitted pipeline itself).
predictions = headline_pipe.fit(X_train, y_train).predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the baseline on the test split.
print(classification_report(y_true=y_test, y_pred=predictions))

In [None]:
# Fresh, unfitted pipeline used as the estimator for the hyper-parameter search.
vect = CountVectorizer()
# Fixed seed so every grid candidate is trained with the same shuffling and the
# search result is reproducible across runs; 42 matches the train/test split seed.
clf = SGDClassifier(random_state=42)

pipe2 = make_pipeline(vect, clf)
pipe2

In [None]:
# Search space for the SGD step of the pipeline: regularisation strength,
# loss function and penalty type. Keys use the "<step>__<parameter>" form.
# NOTE(review): scikit-learn 1.1 renamed the 'log' loss to 'log_loss' and 1.3
# removed the old name — confirm the installed version before upgrading.
params = [
    dict(
        sgdclassifier__alpha=[1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1.0],
        sgdclassifier__loss=['perceptron', 'hinge', 'log'],
        sgdclassifier__penalty=['l2', 'l1', 'elasticnet'],
    )
]

In [None]:
# Setting up a gridsearch 
gs2 = GridSearchCV(estimator=pipe2, param_grid=params, 
                   scoring='accuracy', cv=10, n_jobs=-1, refit=True)
# Fit the data to the pipeline with gridsearch
%time gs2.fit(X_train, y_train)

In [None]:
# Winning parameter combination and its mean cross-validated accuracy.
(gs2.best_params_, gs2.best_score_)

In [None]:

# Confusion matrix of the tuned model on the held-out test split.
# Built from confusion_matrix + ConfusionMatrixDisplay because
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# this form draws the same figure and works on old and new releases alike.
test_cm = confusion_matrix(y_test, gs2.predict(X_test), labels=gs2.classes_)
ConfusionMatrixDisplay(confusion_matrix=test_cm, display_labels=gs2.classes_).plot()
plt.show()

In [None]:
# Per-class precision, recall and F1 of the tuned model on the test split.
y_test_pred = gs2.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))