In [None]:
import pandas as pd
import numpy as np
import nltk 
import re
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import seaborn as sns 
import sklearn.metrics
import sklearn
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
data=pd.read_csv("../input/source-based-news-classification/news_articles.csv")

In [None]:
data.info()

In [None]:
data.head()

# EDA

In [None]:
data.groupby('label').describe()

In [None]:
sns.heatmap(data.isnull(),yticklabels=False, cbar=False,cmap='magma')

In [None]:
data=data.dropna()

In [None]:
data.head()

Generate title-length and text-length features to check whether they show any trends.

In [None]:
# Character count of each article body.
data['text_len'] = data['text'].map(len)

In [None]:
# Character count of each article title.
data['len_title'] = data['title'].map(len)

In [None]:
!pip install autoviz
from autoviz.AutoViz_Class import AutoViz_Class

In [None]:
AV = AutoViz_Class()

In [None]:
df = AV.AutoViz(filename="",sep=',', depVar='label', dfte=data, header=0, verbose=2, 
                 lowess=False, chart_format='svg', max_rows_analyzed=150000, max_cols_analyzed=30)

In [None]:
# Plot article type distribution: one bar per value of the 'type' column,
# with the bar height equal to the number of articles of that type.
df_type = data['type'].value_counts()
# Pass x/y as keywords: recent seaborn releases no longer accept them
# positionally. The type names are used directly as the categorical axis,
# so the tick labels come from the data instead of being patched in afterwards.
sns.barplot(x=df_type.index.values.tolist(), y=df_type.values)
plt.xticks(rotation=90)
plt.title('Article type count', fontsize=20)
plt.show()

Text Preprocessing and Bag of Words for feature engineering

In [None]:
from nltk.corpus import stopwords
import string
stopwords.words('english')[0:10] # Show some stop words

In [None]:
# Build the lookups once. The stopword list used to be re-read from the corpus
# for every single token, which dominated the run time on long articles.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def text_process(mess, lemmatize=False):
    """
    Tokenise a string of text for the bag-of-words models.

    Steps performed:
    1. Remove every character found in ``string.punctuation``.
    2. Split on whitespace.
    3. Drop tokens whose lower-cased form is an English stopword
       (the surviving tokens keep their original casing).
    4. Optionally lemmatize the surviving tokens.

    Parameters
    ----------
    mess : str
        Raw text (a title or an article body).
    lemmatize : bool, default False
        When True, each token is passed through NLTK's ``WordNetLemmatizer``
        (requires the WordNet corpus to be available). The default is False
        because the lemmatization code previously sat after the ``return``
        statement and never ran, so False reproduces the existing output.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    # Strip punctuation in one pass.
    nopunc = mess.translate(_PUNCTUATION_TABLE)

    # Now just remove any stopwords.
    tokens = [word for word in nopunc.split()
              if word.lower() not in _ENGLISH_STOPWORDS]

    if lemmatize:
        # Imported locally so the default path has no extra dependency.
        from nltk.stem import WordNetLemmatizer
        lemma = WordNetLemmatizer()
        tokens = [lemma.lemmatize(word) for word in tokens]

    return tokens

In [None]:
data['title'].apply(text_process)

In [None]:
data['text'].head(5).apply(text_process)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the bag-of-words vocabulary from every article body in `data`,
# using text_process as the tokeniser.
# NOTE(review): this vocabulary is fitted on all rows, i.e. before the
# train/test split made further down; the pipelines below refit their own
# vectorizer on the training portion only.
bow_transformer = CountVectorizer(analyzer=text_process).fit(data['text'])

# Print total number of vocab words
print(len(bow_transformer.vocabulary_))

In [None]:
messages_bow = bow_transformer.transform(data['text'])

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer().fit(messages_bow)

messages_tfidf = tfidf_transformer.transform(messages_bow)
print(messages_tfidf.shape)

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

y = le.fit_transform(data.label)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The split is done on the raw text
# (not on the TF-IDF matrix) so that each pipeline can fit its own vectorizer
# on the training portion. random_state fixes the shuffle for repeatability.
X_train, X_test, label_train, label_test = train_test_split(data['text'], y, test_size=0.2, random_state = 42)

# Sanity check: train size, test size, and their sum.
print(len(X_train), len(X_test), len(X_train) + len(X_test))

# CLASSIFICATION MODELS :

# 1.Naive Bayes

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
# NOTE(review): this model is fitted on the TF-IDF matrix of the *entire*
# dataset (messages_tfidf) and is not referenced by the later cells shown
# here; the evaluated model is the pipeline built in the next cell.
spam_detect_model = MultinomialNB().fit(messages_tfidf, data['label'])

In [None]:
# Naive Bayes text pipeline: tokenise and count, re-weight, then classify.
nb_steps = [
    # raw strings -> sparse matrix of token counts
    ('bow', CountVectorizer(analyzer=text_process)),
    # token counts -> TF-IDF weighted scores
    ('tfidf', TfidfTransformer()),
    # Multinomial Naive Bayes trained on the TF-IDF vectors
    ('classifier', MultinomialNB()),
]
pipeline = Pipeline(steps=nb_steps)

In [None]:
pipeline.fit(X_train,label_train)

In [None]:
predictions1 = pipeline.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
# Ground truth goes first: classification_report(y_true, y_pred).
# Passing the predictions first exchanges precision and recall in the report.
print(classification_report(label_test, predictions1))

# 2. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# 50-tree forest using the entropy split criterion, seeded for repeatability.
# NOTE(review): it is fitted on the TF-IDF matrix of the *entire* dataset and
# is not referenced by the later cells shown here; the evaluated model is the
# pipeline built in the next cell.
classifier = RandomForestClassifier(n_estimators=50, criterion='entropy',random_state=0)
classifier.fit(messages_tfidf, data['label'])

In [None]:
# Random Forest text pipeline.
# NOTE(review): the classifier step is built with default arguments (no
# random_state), so its scores can differ from run to run.
pipeline_rf = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', RandomForestClassifier()),  # train on TF-IDF vectors w/ Random Forest
])

In [None]:
pipeline_rf.fit(X_train,label_train)

In [None]:
predictions2 = pipeline_rf.predict(X_test)

In [None]:
# Ground truth goes first: classification_report(y_true, y_pred).
# Passing the predictions first exchanges precision and recall in the report.
print(classification_report(label_test, predictions2))

# 3. Logistic Model

In [None]:
from sklearn.linear_model import LogisticRegression
# NOTE(review): this model is fitted on the TF-IDF matrix of the *entire*
# dataset and is not referenced by the later cells shown here; the evaluated
# model is the pipeline built in the next cell.
logreg = LogisticRegression()
logreg.fit(messages_tfidf, data['label'])

In [None]:
# Logistic Regression text pipeline.
pipeline_lr = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', LogisticRegression()),  # train on TF-IDF vectors w/ Logistic Regression
])

In [None]:
pipeline_lr.fit(X_train,label_train)

In [None]:
predictions3 = pipeline_lr.predict(X_test)

In [None]:
# Ground truth goes first: classification_report(y_true, y_pred).
# Passing the predictions first exchanges precision and recall in the report.
print(classification_report(label_test, predictions3))

# 4. Neural Networks

In [None]:
from sklearn.neural_network import MLPClassifier
nn=MLPClassifier(random_state=1)

In [None]:
# Neural-network (multi-layer perceptron) text pipeline.
pipeline_nn = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    # Use the seeded `nn` estimator defined in the previous cell. Building a
    # fresh MLPClassifier() here left `nn` unused and the results unseeded.
    ('classifier', nn),  # train on TF-IDF vectors w/ an MLP
])

In [None]:
pipeline_nn.fit(X_train,label_train)

In [None]:
predictions4 = pipeline_nn.predict(X_test)

In [None]:
# Ground truth goes first: classification_report(y_true, y_pred).
# Passing the predictions first exchanges precision and recall in the report.
print(classification_report(label_test, predictions4))

# 5. Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42, criterion="entropy",
                             min_samples_split=10, min_samples_leaf=10, max_depth=3, max_leaf_nodes=5)

In [None]:
# Decision-tree text pipeline.
pipeline_dt = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    # Use the `dt` estimator configured in the previous cell (entropy criterion,
    # depth/leaf limits, fixed random_state). Building a fresh
    # DecisionTreeClassifier() here left that configuration unused.
    ('classifier', dt),  # train on TF-IDF vectors w/ a decision tree
])

In [None]:
pipeline_dt.fit(X_train,label_train)

In [None]:
predictions_dt = pipeline_dt.predict(X_test)

In [None]:
# Ground truth goes first: classification_report(y_true, y_pred).
# Passing the predictions first exchanges precision and recall in the report.
print(classification_report(label_test, predictions_dt))

# 6. Gradient Boosting 

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gbm=GradientBoostingClassifier(learning_rate=0.3,max_depth=4,n_estimators=100 ,random_state=0)

In [None]:
# Gradient-boosting text pipeline.
pipeline_gbm = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    # Use the `gbm` estimator configured in the previous cell (learning rate,
    # depth, number of trees, fixed random_state). Building a fresh
    # GradientBoostingClassifier() here left that configuration unused.
    ('classifier', gbm),  # train on TF-IDF vectors w/ gradient boosting
])

In [None]:
pipeline_gbm.fit(X_train,label_train)

In [None]:
predictions_gbm = pipeline_gbm.predict(X_test)

In [None]:
# Ground truth goes first: classification_report(y_true, y_pred).
# Passing the predictions first exchanges precision and recall in the report.
print(classification_report(label_test, predictions_gbm))

In terms of accuracy, the best model turns out to be Random Forest, with an accuracy of 0.81, followed by GBM.
In terms of precision, no model beats the Naive Bayes classifier.